Compare commits
7 Commits
v0.8.8
...
csi-use-vi
Author | SHA1 | Date | |
---|---|---|---|
2b4e0de397 | |||
726c6d3470 | |||
2389b49a16 | |||
fe1ee67b05 | |||
c775a52a7d | |||
e307dd13ed | |||
a7f63f7c29 |
@@ -1,7 +1,7 @@
|
||||
cmake_minimum_required(VERSION 2.8.12)
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
set(VERSION "0.8.8")
|
||||
set(VERSION "0.8.5")
|
||||
|
||||
add_subdirectory(src)
|
||||
|
@@ -1,4 +1,4 @@
|
||||
VERSION ?= v0.8.8
|
||||
VERSION ?= v0.8.5
|
||||
|
||||
all: build push
|
||||
|
||||
|
@@ -49,7 +49,7 @@ spec:
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
allowPrivilegeEscalation: true
|
||||
image: vitalif/vitastor-csi:v0.8.8
|
||||
image: vitalif/vitastor-csi:v0.8.5
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -116,7 +116,7 @@ spec:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: ["SYS_ADMIN"]
|
||||
image: vitalif/vitastor-csi:v0.8.8
|
||||
image: vitalif/vitastor-csi:v0.8.5
|
||||
args:
|
||||
- "--node=$(NODE_ID)"
|
||||
- "--endpoint=$(CSI_ENDPOINT)"
|
||||
|
@@ -5,7 +5,7 @@ package vitastor
|
||||
|
||||
const (
|
||||
vitastorCSIDriverName = "csi.vitastor.io"
|
||||
vitastorCSIDriverVersion = "0.8.8"
|
||||
vitastorCSIDriverVersion = "0.8.5"
|
||||
)
|
||||
|
||||
// Config struct fills the parameters of request or user input
|
||||
|
@@ -6,7 +6,6 @@ package vitastor
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"bytes"
|
||||
"strconv"
|
||||
@@ -179,7 +178,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
|
||||
}
|
||||
|
||||
// Create image using vitastor-cli
|
||||
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
|
||||
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", string(volSize), "--pool", string(poolId) })
|
||||
if (err != nil)
|
||||
{
|
||||
if (strings.Index(err.Error(), "already exists") > 0)
|
||||
|
4
debian/changelog
vendored
4
debian/changelog
vendored
@@ -1,10 +1,10 @@
|
||||
vitastor (0.8.8-1) unstable; urgency=medium
|
||||
vitastor (0.8.5-1) unstable; urgency=medium
|
||||
|
||||
* Bugfixes
|
||||
|
||||
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
|
||||
|
||||
vitastor (0.8.8-1) unstable; urgency=medium
|
||||
vitastor (0.8.5-1) unstable; urgency=medium
|
||||
|
||||
* Implement NFS proxy
|
||||
* Add documentation
|
||||
|
8
debian/vitastor.Dockerfile
vendored
8
debian/vitastor.Dockerfile
vendored
@@ -34,8 +34,8 @@ RUN set -e -x; \
|
||||
mkdir -p /root/packages/vitastor-$REL; \
|
||||
rm -rf /root/packages/vitastor-$REL/*; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
cp -r /root/vitastor vitastor-0.8.8; \
|
||||
cd vitastor-0.8.8; \
|
||||
cp -r /root/vitastor vitastor-0.8.5; \
|
||||
cd vitastor-0.8.5; \
|
||||
ln -s /root/fio-build/fio-*/ ./fio; \
|
||||
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
|
||||
@@ -48,8 +48,8 @@ RUN set -e -x; \
|
||||
rm -rf a b; \
|
||||
echo "dep:fio=$FIO" > debian/fio_version; \
|
||||
cd /root/packages/vitastor-$REL; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.8.orig.tar.xz vitastor-0.8.8; \
|
||||
cd vitastor-0.8.8; \
|
||||
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.5.orig.tar.xz vitastor-0.8.5; \
|
||||
cd vitastor-0.8.5; \
|
||||
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
|
||||
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
|
||||
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
|
||||
|
@@ -19,7 +19,6 @@ between clients, OSDs and etcd.
|
||||
- [rdma_max_sge](#rdma_max_sge)
|
||||
- [rdma_max_msg](#rdma_max_msg)
|
||||
- [rdma_max_recv](#rdma_max_recv)
|
||||
- [rdma_max_send](#rdma_max_send)
|
||||
- [peer_connect_interval](#peer_connect_interval)
|
||||
- [peer_connect_timeout](#peer_connect_timeout)
|
||||
- [osd_idle_timeout](#osd_idle_timeout)
|
||||
@@ -75,12 +74,6 @@ to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||
root to list available RDMA devices and their features.
|
||||
|
||||
Remember that you also have to configure your network switches if you use
|
||||
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
||||
the manual of your network vendor for details about setting up the switch
|
||||
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
||||
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
||||
|
||||
## rdma_port_num
|
||||
|
||||
- Type: integer
|
||||
@@ -123,30 +116,20 @@ required to change this parameter.
|
||||
## rdma_max_msg
|
||||
|
||||
- Type: integer
|
||||
- Default: 132096
|
||||
- Default: 1048576
|
||||
|
||||
Maximum size of a single RDMA send or receive operation in bytes.
|
||||
|
||||
## rdma_max_recv
|
||||
|
||||
- Type: integer
|
||||
- Default: 16
|
||||
|
||||
Maximum number of RDMA receive buffers per connection (RDMA requires
|
||||
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
||||
in size. So this setting directly affects memory usage: a single Vitastor
|
||||
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
||||
Default is roughly 2 MB * number of OSDs.
|
||||
|
||||
## rdma_max_send
|
||||
|
||||
- Type: integer
|
||||
- Default: 8
|
||||
|
||||
Maximum number of outstanding RDMA send operations per connection. Should be
|
||||
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
||||
Doesn't affect memory usage - additional memory isn't allocated for send
|
||||
operations.
|
||||
Maximum number of parallel RDMA receive operations. Note that this number
|
||||
of receive buffers, each `rdma_max_msg` in size, is allocated for each client,
|
||||
so this setting actually affects memory usage. This is because RDMA receive
|
||||
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||
later versions.
|
||||
|
||||
## peer_connect_interval
|
||||
|
||||
|
@@ -19,7 +19,6 @@
|
||||
- [rdma_max_sge](#rdma_max_sge)
|
||||
- [rdma_max_msg](#rdma_max_msg)
|
||||
- [rdma_max_recv](#rdma_max_recv)
|
||||
- [rdma_max_send](#rdma_max_send)
|
||||
- [peer_connect_interval](#peer_connect_interval)
|
||||
- [peer_connect_timeout](#peer_connect_timeout)
|
||||
- [osd_idle_timeout](#osd_idle_timeout)
|
||||
@@ -79,13 +78,6 @@ Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Наприме
|
||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||
параметры и возможности.
|
||||
|
||||
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
||||
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
||||
нестабильной производительностью. Подробную информацию о настройке
|
||||
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
||||
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
||||
Control) и ECN (Explicit Congestion Notification).
|
||||
|
||||
## rdma_port_num
|
||||
|
||||
- Тип: целое число
|
||||
@@ -129,32 +121,22 @@ OSD в любом случае согласовывают реальное зн
|
||||
## rdma_max_msg
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 132096
|
||||
- Значение по умолчанию: 1048576
|
||||
|
||||
Максимальный размер одной RDMA-операции отправки или приёма.
|
||||
|
||||
## rdma_max_recv
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 16
|
||||
|
||||
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
||||
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
||||
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
||||
потребление памяти - один Vitastor-клиент с RDMA использует
|
||||
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
||||
примерно 2 МБ * число OSD.
|
||||
|
||||
## rdma_max_send
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 8
|
||||
|
||||
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
||||
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
||||
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
||||
Не влияет на потребление памяти - дополнительная память на операции отправки
|
||||
не выделяется.
|
||||
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||
для каждого подключённого клиентского соединения, так что данная настройка
|
||||
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||
более новых версиях Vitastor.
|
||||
|
||||
## peer_connect_interval
|
||||
|
||||
|
@@ -53,12 +53,6 @@
|
||||
to work. For example, Mellanox ConnectX-3 and older adapters don't have
|
||||
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
|
||||
root to list available RDMA devices and their features.
|
||||
|
||||
Remember that you also have to configure your network switches if you use
|
||||
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
|
||||
the manual of your network vendor for details about setting up the switch
|
||||
for RoCEv2 correctly. Usually it means setting up Lossless Ethernet with
|
||||
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
|
||||
info_ru: |
|
||||
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
|
||||
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
|
||||
@@ -67,13 +61,6 @@
|
||||
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
|
||||
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
|
||||
параметры и возможности.
|
||||
|
||||
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
|
||||
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
|
||||
нестабильной производительностью. Подробную информацию о настройке
|
||||
коммутатора для RoCEv2 ищите в документации производителя. Обычно это
|
||||
подразумевает настройку сети без потерь на основе PFC (Priority Flow
|
||||
Control) и ECN (Explicit Congestion Notification).
|
||||
- name: rdma_port_num
|
||||
type: int
|
||||
default: 1
|
||||
@@ -127,39 +114,26 @@
|
||||
так что менять этот параметр обычно не нужно.
|
||||
- name: rdma_max_msg
|
||||
type: int
|
||||
default: 132096
|
||||
default: 1048576
|
||||
info: Maximum size of a single RDMA send or receive operation in bytes.
|
||||
info_ru: Максимальный размер одной RDMA-операции отправки или приёма.
|
||||
- name: rdma_max_recv
|
||||
type: int
|
||||
default: 16
|
||||
info: |
|
||||
Maximum number of RDMA receive buffers per connection (RDMA requires
|
||||
preallocated buffers to receive data). Each buffer is `rdma_max_msg` bytes
|
||||
in size. So this setting directly affects memory usage: a single Vitastor
|
||||
RDMA client uses `rdma_max_recv * rdma_max_msg * OSD_COUNT` bytes of memory.
|
||||
Default is roughly 2 MB * number of OSDs.
|
||||
info_ru: |
|
||||
Максимальное число буферов для RDMA-приёма данных на одно соединение
|
||||
(RDMA требует заранее выделенных буферов для приёма данных). Каждый буфер
|
||||
имеет размер `rdma_max_msg` байт. Таким образом, настройка прямо влияет на
|
||||
потребление памяти - один Vitastor-клиент с RDMA использует
|
||||
`rdma_max_recv * rdma_max_msg * ЧИСЛО_OSD` байт памяти, по умолчанию -
|
||||
примерно 2 МБ * число OSD.
|
||||
- name: rdma_max_send
|
||||
type: int
|
||||
default: 8
|
||||
info: |
|
||||
Maximum number of outstanding RDMA send operations per connection. Should be
|
||||
less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
|
||||
Doesn't affect memory usage - additional memory isn't allocated for send
|
||||
operations.
|
||||
Maximum number of parallel RDMA receive operations. Note that this number
|
||||
of receive buffers, each `rdma_max_msg` in size, is allocated for each client,
|
||||
so this setting actually affects memory usage. This is because RDMA receive
|
||||
operations are (sadly) still not zero-copy in Vitastor. It may be fixed in
|
||||
later versions.
|
||||
info_ru: |
|
||||
Максимальное число RDMA-операций отправки, отправляемых в очередь одного
|
||||
соединения. Желательно, чтобы оно было меньше `rdma_max_recv`, чтобы
|
||||
у принимающей стороны в процессе работы не заканчивались буферы на приём.
|
||||
Не влияет на потребление памяти - дополнительная память на операции отправки
|
||||
не выделяется.
|
||||
Максимальное число параллельных RDMA-операций получения данных. Следует
|
||||
иметь в виду, что данное число буферов размером `rdma_max_msg` выделяется
|
||||
для каждого подключённого клиентского соединения, так что данная настройка
|
||||
влияет на потребление памяти. Это так потому, что RDMA-приём данных в
|
||||
Vitastor, увы, всё равно не является zero-copy, т.е. всё равно 1 раз
|
||||
копирует данные в памяти. Данная особенность, возможно, будет исправлена в
|
||||
более новых версиях Vitastor.
|
||||
- name: peer_connect_interval
|
||||
type: sec
|
||||
min: 1
|
||||
|
@@ -22,17 +22,14 @@
|
||||
- Add Vitastor package repository:
|
||||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
|
||||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
|
||||
- AlmaLinux 9 and other RHEL 9 clones (Rocky, Oracle...): `dnf install https://vitastor.io/rpms/centos/9/vitastor-release.rpm`
|
||||
- Enable EPEL: `yum/dnf install epel-release`
|
||||
- Enable additional CentOS repositories:
|
||||
- CentOS 7: `yum install centos-release-scl`
|
||||
- CentOS 8: `dnf install centos-release-advanced-virtualization`
|
||||
- RHEL 9 clones: not required
|
||||
- Enable elrepo-kernel:
|
||||
- CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
|
||||
- CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
|
||||
- RHEL 9 clones: optional: `dnf install https://www.elrepo.org/elrepo-release-9.el9.elrepo.noarch.rpm`
|
||||
- Install packages: `yum/dnf install vitastor lpsolve etcd qemu-kvm` and optionally `kernel-ml` if you use elrepo-kernel
|
||||
- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
|
||||
|
||||
## Installation requirements
|
||||
|
||||
|
@@ -70,7 +70,7 @@ For EC pools the configuration should look like the following:
|
||||
|
||||
```
|
||||
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
|
||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
|
||||
```
|
||||
|
||||
After you do this, one of the monitors will configure PGs and OSDs will start them.
|
||||
|
@@ -71,7 +71,7 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
|
||||
|
||||
```
|
||||
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
|
||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
|
||||
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
|
||||
```
|
||||
|
||||
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
|
||||
|
@@ -35,24 +35,15 @@ Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
|
||||
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may
|
||||
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375.
|
||||
|
||||
Implementing NVDIMM support could basically eliminate WA altogether - all extra writes would
|
||||
go to DRAM memory. But this requires a test cluster with NVDIMM - please contact me
|
||||
if you want to provide me with such a cluster for tests.
|
||||
|
||||
Lazy fsync also reduces WA for parallel workloads because journal blocks are only
|
||||
written when they fill up or fsync is requested.
|
||||
|
||||
## In Practice
|
||||
|
||||
In practice, using tests from [Understanding Performance](understanding.en.md), decent TCP network,
|
||||
good server-grade SSD/NVMe drives and disabled CPU power saving, you should head for:
|
||||
In practice, using tests from [Understanding Performance](understanding.en.md)
|
||||
and good server-grade SSD/NVMe drives, you should head for:
|
||||
- At least 5000 T1Q1 replicated read and write IOPS (maximum 0.2ms latency)
|
||||
- At least 5000 T1Q1 EC read IOPS and at least 2200 EC write IOPS (maximum 0.45ms latency)
|
||||
- At least ~80k parallel read IOPS or ~30k write IOPS per 1 core (1 OSD)
|
||||
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case
|
||||
|
||||
Lower results may mean that you have bad drives, bad network or some kind of misconfiguration.
|
||||
|
||||
Current latency records:
|
||||
- 9668 T1Q1 replicated write IOPS (0.103 ms latency) with TCP and NVMe
|
||||
- 9143 T1Q1 replicated read IOPS (0.109 ms latency) with TCP and NVMe
|
||||
|
@@ -36,25 +36,6 @@ WA (мультипликатор записи) для 4 КБ блоков в Vit
|
||||
Если вы найдёте SSD, хорошо работающий с 512-байтными блоками данных (Optane?),
|
||||
то 1, 3 и 4 можно снизить до 512 байт (1/8 от размера данных) и получить WA всего 2.375.
|
||||
|
||||
Если реализовать поддержку NVDIMM, то WA можно, условно говоря, ликвидировать вообще - все
|
||||
дополнительные операции записи смогут обслуживаться DRAM памятью. Но для этого необходим
|
||||
тестовый кластер с NVDIMM - пишите, если готовы предоставить такой для тестов.
|
||||
|
||||
Кроме того, WA снижается при использовании отложенного/ленивого сброса при параллельной
|
||||
нагрузке, т.к. блоки журнала записываются на диск только когда они заполняются или явным
|
||||
образом запрашивается fsync.
|
||||
|
||||
## На практике
|
||||
|
||||
На практике, используя тесты fio со страницы [Понимание сути производительности систем хранения](understanding.ru.md),
|
||||
нормальную TCP-сеть, хорошие серверные SSD/NVMe, при отключённом энергосбережении процессоров вы можете рассчитывать на:
|
||||
- От 5000 IOPS в 1 поток (T1Q1) и на чтение, и на запись при использовании репликации (задержка до 0.2мс)
|
||||
- От 5000 IOPS в 1 поток (T1Q1) на чтение и 2200 IOPS в 1 поток на запись при использовании EC (задержка до 0.45мс)
|
||||
- От 80000 IOPS на чтение в параллельном режиме на 1 ядро, от 30000 IOPS на запись на 1 ядро (на 1 OSD)
|
||||
- Скорость параллельного линейного чтения и записи, равная меньшему значению из скорости дисков или сети
|
||||
|
||||
Худшие результаты означают, что у вас либо медленные диски, либо медленная сеть, либо что-то неправильно настроено.
|
||||
|
||||
Зафиксированный на данный момент рекорд задержки:
|
||||
- 9668 IOPS (0.103 мс задержка) в 1 поток (T1Q1) на запись с TCP и NVMe при использовании репликации
|
||||
- 9143 IOPS (0.109 мс задержка) в 1 поток (T1Q1) на чтение с TCP и NVMe при использовании репликации
|
||||
|
@@ -1,4 +1,4 @@
|
||||
[Documentation](../../README.md#documentation) → Usage → Disk management tool
|
||||
[Documentation](../../README.md#documentation) → Usage → Disk Tool
|
||||
|
||||
-----
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
[Документация](../../README-ru.md#документация) → Использование → Инструмент управления дисками
|
||||
[Документация](../../README-ru.md#документация) → Использование → Управление дисками
|
||||
|
||||
-----
|
||||
|
||||
|
35
mon/mon.js
35
mon/mon.js
@@ -51,9 +51,8 @@ const etcd_tree = {
|
||||
// THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
|
||||
// etcd connection
|
||||
config_path: "/etc/vitastor/vitastor.conf",
|
||||
etcd_prefix: "/vitastor",
|
||||
// etcd connection - configurable online
|
||||
etcd_address: "10.0.115.10:2379/v3",
|
||||
etcd_prefix: "/vitastor",
|
||||
// mon
|
||||
etcd_mon_ttl: 30, // min: 10
|
||||
etcd_mon_timeout: 1000, // ms. min: 0
|
||||
@@ -71,15 +70,14 @@ const etcd_tree = {
|
||||
rdma_gid_index: 0,
|
||||
rdma_mtu: 4096,
|
||||
rdma_max_sge: 128,
|
||||
rdma_max_send: 8,
|
||||
rdma_max_recv: 16,
|
||||
rdma_max_send: 64,
|
||||
rdma_max_recv: 128,
|
||||
rdma_max_msg: 132096,
|
||||
log_level: 0,
|
||||
block_size: 131072,
|
||||
disk_alignment: 4096,
|
||||
bitmap_granularity: 4096,
|
||||
immediate_commit: false, // 'all' or 'small'
|
||||
// client and osd - configurable online
|
||||
log_level: 0,
|
||||
client_dirty_limit: 33554432,
|
||||
peer_connect_interval: 5, // seconds. min: 1
|
||||
peer_connect_timeout: 5, // seconds. min: 1
|
||||
@@ -97,19 +95,22 @@ const etcd_tree = {
|
||||
osd_network: null, // "192.168.7.0/24" or an array of masks
|
||||
bind_address: "0.0.0.0",
|
||||
bind_port: 0,
|
||||
readonly: false,
|
||||
osd_memlock: false,
|
||||
// osd - configurable online
|
||||
autosync_interval: 5,
|
||||
autosync_writes: 128,
|
||||
client_queue_depth: 128, // unused
|
||||
recovery_queue_depth: 4,
|
||||
recovery_sync_batch: 16,
|
||||
readonly: false,
|
||||
no_recovery: false,
|
||||
no_rebalance: false,
|
||||
print_stats_interval: 3,
|
||||
slow_log_interval: 10,
|
||||
inode_vanish_time: 60,
|
||||
osd_memlock: false,
|
||||
scrub_interval: '30d', // 1s/1m/1h/1d
|
||||
scrub_queue_depth: 1,
|
||||
scrub_sleep: 0, // milliseconds
|
||||
scrub_list_limit: 1000, // objects to list on one scrub iteration
|
||||
// blockstore - fixed in superblock
|
||||
block_size,
|
||||
disk_alignment,
|
||||
@@ -128,15 +129,14 @@ const etcd_tree = {
|
||||
meta_offset,
|
||||
disable_meta_fsync,
|
||||
disable_device_lock,
|
||||
// blockstore - configurable offline
|
||||
// blockstore - configurable
|
||||
max_write_iodepth,
|
||||
min_flusher_count: 1,
|
||||
max_flusher_count: 256,
|
||||
inmemory_metadata,
|
||||
inmemory_journal,
|
||||
journal_sector_buffer_count,
|
||||
journal_no_same_sector_overwrites,
|
||||
// blockstore - configurable online
|
||||
max_write_iodepth,
|
||||
min_flusher_count: 1,
|
||||
max_flusher_count: 256,
|
||||
throttle_small_writes: false,
|
||||
throttle_target_iops: 100,
|
||||
throttle_target_mbs: 100,
|
||||
@@ -172,6 +172,8 @@ const etcd_tree = {
|
||||
osd_tags?: 'nvme' | [ 'nvme', ... ],
|
||||
// prefer to put primary on OSD with these tags
|
||||
primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
|
||||
// scrub interval
|
||||
scrub_interval?: '30d',
|
||||
},
|
||||
...
|
||||
}, */
|
||||
@@ -266,8 +268,8 @@ const etcd_tree = {
|
||||
<pg_id>: {
|
||||
primary: osd_num_t,
|
||||
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
|
||||
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||
"has_invalid"|"left_on_dead")[],
|
||||
"degraded"|"has_corrupted"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
|
||||
"has_invalid"|"left_on_dead"|"scrubbing")[],
|
||||
}
|
||||
}, */
|
||||
},
|
||||
@@ -289,6 +291,7 @@ const etcd_tree = {
|
||||
osd_sets: osd_num_t[][],
|
||||
all_peers: osd_num_t[],
|
||||
epoch: uint64_t,
|
||||
scrub_ts: uint64_t,
|
||||
},
|
||||
}, */
|
||||
},
|
||||
|
@@ -50,7 +50,7 @@ from cinder.volume import configuration
|
||||
from cinder.volume import driver
|
||||
from cinder.volume import volume_utils
|
||||
|
||||
VERSION = '0.8.8'
|
||||
VERSION = '0.8.5'
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
@@ -1,169 +0,0 @@
|
||||
Index: pve-qemu-kvm-7.2.0/block/meson.build
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-7.2.0.orig/block/meson.build
|
||||
+++ pve-qemu-kvm-7.2.0/block/meson.build
|
||||
@@ -113,6 +113,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
Index: pve-qemu-kvm-7.2.0/meson.build
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-7.2.0.orig/meson.build
|
||||
+++ pve-qemu-kvm-7.2.0/meson.build
|
||||
@@ -1026,6 +1026,26 @@ if not get_option('rbd').auto() or have_
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -1865,6 +1885,7 @@ config_host_data.set('CONFIG_NUMA', numa
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
||||
@@ -3957,6 +3978,7 @@ if spice_protocol.found()
|
||||
summary_info += {' spice server support': spice}
|
||||
endif
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
Index: pve-qemu-kvm-7.2.0/meson_options.txt
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-7.2.0.orig/meson_options.txt
|
||||
+++ pve-qemu-kvm-7.2.0/meson_options.txt
|
||||
@@ -169,6 +169,8 @@ option('lzo', type : 'feature', value :
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
Index: pve-qemu-kvm-7.2.0/qapi/block-core.json
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-7.2.0.orig/qapi/block-core.json
|
||||
+++ pve-qemu-kvm-7.2.0/qapi/block-core.json
|
||||
@@ -3213,7 +3213,7 @@
|
||||
'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
'pbs',
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||
@@ -4223,6 +4223,28 @@
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
+##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
# An enumeration of replication modes.
|
||||
@@ -4671,6 +4693,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -5072,6 +5095,17 @@
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
+##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
# Subformat options for VMDK images
|
||||
@@ -5269,6 +5303,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
Index: pve-qemu-kvm-7.2.0/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-7.2.0.orig/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ pve-qemu-kvm-7.2.0/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -31,7 +31,7 @@
|
||||
--with-git=meson \
|
||||
--with-git-submodules=update \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -179,6 +179,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
@@ -1,169 +0,0 @@
|
||||
diff --git a/block/meson.build b/block/meson.build
|
||||
index deb73ca389..e269f599a1 100644
|
||||
--- a/block/meson.build
|
||||
+++ b/block/meson.build
|
||||
@@ -78,6 +78,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
diff --git a/meson.build b/meson.build
|
||||
index 96de1a6ef9..2e3994777d 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -838,6 +838,26 @@ if not get_option('rbd').auto() or have_block
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -1455,6 +1475,7 @@ config_host_data.set('CONFIG_LINUX_AIO', libaio.found())
|
||||
config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
|
||||
config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
||||
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
|
||||
@@ -3412,6 +3433,7 @@ if spice_protocol.found()
|
||||
summary_info += {' spice server support': spice}
|
||||
endif
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
diff --git a/meson_options.txt b/meson_options.txt
|
||||
index e392323732..5b56007475 100644
|
||||
--- a/meson_options.txt
|
||||
+++ b/meson_options.txt
|
||||
@@ -121,6 +121,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('gtk', type : 'feature', value : 'auto',
|
||||
description: 'GTK+ user interface')
|
||||
option('sdl', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index 1d3dd9cb48..88453405e5 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -2930,7 +2930,7 @@
|
||||
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
|
||||
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
|
||||
##
|
||||
# @BlockdevOptionsFile:
|
||||
@@ -3864,6 +3864,28 @@
|
||||
'*key-secret': 'str',
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
+##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
@@ -4259,6 +4281,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'vmdk': 'BlockdevOptionsGenericCOWFormat',
|
||||
'vpc': 'BlockdevOptionsGenericFormat',
|
||||
'vvfat': 'BlockdevOptionsVVFAT'
|
||||
@@ -4647,6 +4670,17 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -4846,6 +4880,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||
index 7a17ff4218..cdddbf32aa 100644
|
||||
--- a/scripts/meson-buildoptions.sh
|
||||
+++ b/scripts/meson-buildoptions.sh
|
||||
@@ -69,6 +69,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' oss OSS sound support'
|
||||
printf "%s\n" ' pa PulseAudio sound support'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' sdl SDL user interface'
|
||||
printf "%s\n" ' sdl-image SDL Image support for icons'
|
||||
printf "%s\n" ' seccomp seccomp support'
|
||||
@@ -210,6 +211,8 @@ _meson_option_parse() {
|
||||
--disable-pa) printf "%s" -Dpa=disabled ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-sdl) printf "%s" -Dsdl=enabled ;;
|
||||
--disable-sdl) printf "%s" -Dsdl=disabled ;;
|
||||
--enable-sdl-image) printf "%s" -Dsdl_image=enabled ;;
|
@@ -1,190 +0,0 @@
|
||||
diff --git a/block/meson.build b/block/meson.build
|
||||
index 0b2a60c99b..d923713804 100644
|
||||
--- a/block/meson.build
|
||||
+++ b/block/meson.build
|
||||
@@ -98,6 +98,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
diff --git a/meson.build b/meson.build
|
||||
index 861de93c4f..272f72af11 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -884,6 +884,26 @@ if not get_option('rbd').auto() or have_block
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -1546,6 +1566,7 @@ config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
|
||||
config_host_data.set('CONFIG_NUMA', numa.found())
|
||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
||||
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
|
||||
@@ -3709,6 +3730,7 @@ if spice_protocol.found()
|
||||
summary_info += {' spice server support': spice}
|
||||
endif
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
diff --git a/meson_options.txt b/meson_options.txt
|
||||
index 52b11cead4..d8d0868174 100644
|
||||
--- a/meson_options.txt
|
||||
+++ b/meson_options.txt
|
||||
@@ -149,6 +149,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('gtk', type : 'feature', value : 'auto',
|
||||
description: 'GTK+ user interface')
|
||||
option('sdl', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index beeb91952a..1c98dc0e12 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -2929,7 +2929,7 @@
|
||||
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
|
||||
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
|
||||
##
|
||||
# @BlockdevOptionsFile:
|
||||
@@ -3863,6 +3863,28 @@
|
||||
'*key-secret': 'str',
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
+##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
@@ -4277,6 +4299,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'vmdk': 'BlockdevOptionsGenericCOWFormat',
|
||||
'vpc': 'BlockdevOptionsGenericFormat',
|
||||
'vvfat': 'BlockdevOptionsVVFAT'
|
||||
@@ -4665,6 +4688,17 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -4864,6 +4898,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
index 9850dd4444..72b1287520 100755
|
||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -31,7 +31,7 @@
|
||||
--with-git=meson \
|
||||
--with-git-submodules=update \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -181,6 +181,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||
index 1e26f4571e..370898d48c 100644
|
||||
--- a/scripts/meson-buildoptions.sh
|
||||
+++ b/scripts/meson-buildoptions.sh
|
||||
@@ -98,6 +98,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' qed qed image format support'
|
||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' replication replication support'
|
||||
printf "%s\n" ' sdl SDL user interface'
|
||||
printf "%s\n" ' sdl-image SDL Image support for icons'
|
||||
@@ -289,6 +290,8 @@ _meson_option_parse() {
|
||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
||||
--disable-replication) printf "%s" -Dreplication=disabled ;;
|
||||
--enable-rng-none) printf "%s" -Drng_none=true ;;
|
@@ -1,190 +0,0 @@
|
||||
diff --git a/block/meson.build b/block/meson.build
|
||||
index 60bc305597..89a042216f 100644
|
||||
--- a/block/meson.build
|
||||
+++ b/block/meson.build
|
||||
@@ -98,6 +98,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
diff --git a/meson.build b/meson.build
|
||||
index 20fddbd707..600db4e2fb 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -967,6 +967,26 @@ if not get_option('rbd').auto() or have_block
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -1799,6 +1819,7 @@ config_host_data.set('CONFIG_NUMA', numa.found())
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
||||
@@ -3954,6 +3975,7 @@ if spice_protocol.found()
|
||||
summary_info += {' spice server support': spice}
|
||||
endif
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
diff --git a/meson_options.txt b/meson_options.txt
|
||||
index e58e158396..9747b38fd0 100644
|
||||
--- a/meson_options.txt
|
||||
+++ b/meson_options.txt
|
||||
@@ -167,6 +167,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index 2173e7734a..5a4900b322 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -2955,7 +2955,7 @@
|
||||
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
|
||||
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
|
||||
|
||||
##
|
||||
# @BlockdevOptionsFile:
|
||||
@@ -3883,6 +3883,28 @@
|
||||
'*key-secret': 'str',
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
+##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
@@ -4327,6 +4349,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'vmdk': 'BlockdevOptionsGenericCOWFormat',
|
||||
'vpc': 'BlockdevOptionsGenericFormat',
|
||||
'vvfat': 'BlockdevOptionsVVFAT'
|
||||
@@ -4717,6 +4740,17 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -4915,6 +4949,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
index a7f92aff90..53dc55be2e 100755
|
||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -31,7 +31,7 @@
|
||||
--with-git=meson \
|
||||
--with-git-submodules=update \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -179,6 +179,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||
index 359b04e0e6..f5b85ba78c 100644
|
||||
--- a/scripts/meson-buildoptions.sh
|
||||
+++ b/scripts/meson-buildoptions.sh
|
||||
@@ -135,6 +135,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' qed qed image format support'
|
||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||
printf "%s\n" ' replication replication support'
|
||||
printf "%s\n" ' sdl SDL user interface'
|
||||
@@ -370,6 +371,8 @@ _meson_option_parse() {
|
||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
@@ -1,190 +0,0 @@
|
||||
diff --git a/block/meson.build b/block/meson.build
|
||||
index b7c68b83a3..95d8a6f15d 100644
|
||||
--- a/block/meson.build
|
||||
+++ b/block/meson.build
|
||||
@@ -100,6 +100,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
diff --git a/meson.build b/meson.build
|
||||
index 5c6b5a1c75..f31f73612e 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -1026,6 +1026,26 @@ if not get_option('rbd').auto() or have_block
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -1861,6 +1881,7 @@ config_host_data.set('CONFIG_NUMA', numa.found())
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
||||
@@ -3945,6 +3966,7 @@ if spice_protocol.found()
|
||||
summary_info += {' spice server support': spice}
|
||||
endif
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
diff --git a/meson_options.txt b/meson_options.txt
|
||||
index 4b749ca549..6b37bd6b77 100644
|
||||
--- a/meson_options.txt
|
||||
+++ b/meson_options.txt
|
||||
@@ -169,6 +169,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index 95ac4fa634..7a240827e4 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -2959,7 +2959,7 @@
|
||||
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
|
||||
'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||
@@ -3957,6 +3957,28 @@
|
||||
'*key-secret': 'str',
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
+##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
@@ -4405,6 +4427,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -4804,6 +4827,17 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -5002,6 +5036,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
index a7f92aff90..53dc55be2e 100755
|
||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -31,7 +31,7 @@
|
||||
--with-git=meson \
|
||||
--with-git-submodules=update \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -179,6 +179,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||
index aa6e30ea91..c45d21c40f 100644
|
||||
--- a/scripts/meson-buildoptions.sh
|
||||
+++ b/scripts/meson-buildoptions.sh
|
||||
@@ -135,6 +135,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' qed qed image format support'
|
||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||
printf "%s\n" ' replication replication support'
|
||||
printf "%s\n" ' sdl SDL user interface'
|
||||
@@ -376,6 +377,8 @@ _meson_option_parse() {
|
||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
@@ -1,190 +0,0 @@
|
||||
diff --git a/block/meson.build b/block/meson.build
|
||||
index 382bec0e7d..af6207dbce 100644
|
||||
--- a/block/meson.build
|
||||
+++ b/block/meson.build
|
||||
@@ -101,6 +101,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
diff --git a/meson.build b/meson.build
|
||||
index c44d05a13f..ebedb42843 100644
|
||||
--- a/meson.build
|
||||
+++ b/meson.build
|
||||
@@ -1028,6 +1028,26 @@ if not get_option('rbd').auto() or have_block
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'), kwargs: static_kwargs)
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -1878,6 +1898,7 @@ endif
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_SDL', sdl.found())
|
||||
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
|
||||
@@ -4002,6 +4023,7 @@ if spice_protocol.found()
|
||||
summary_info += {' spice server support': spice}
|
||||
endif
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
diff --git a/meson_options.txt b/meson_options.txt
|
||||
index fc9447d267..c4ac55c283 100644
|
||||
--- a/meson_options.txt
|
||||
+++ b/meson_options.txt
|
||||
@@ -173,6 +173,8 @@ option('lzo', type : 'feature', value : 'auto',
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
diff --git a/qapi/block-core.json b/qapi/block-core.json
|
||||
index c05ad0c07e..f5eb701604 100644
|
||||
--- a/qapi/block-core.json
|
||||
+++ b/qapi/block-core.json
|
||||
@@ -3054,7 +3054,7 @@
|
||||
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
|
||||
'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||
@@ -4073,6 +4073,28 @@
|
||||
'*key-secret': 'str',
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
+##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
@@ -4521,6 +4543,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -4920,6 +4943,17 @@
|
||||
'*cluster-size' : 'size',
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
+##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
@@ -5118,6 +5152,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
index 6e8983f39c..1b0b9fcf3e 100755
|
||||
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -32,7 +32,7 @@
|
||||
--with-git=meson \
|
||||
--with-git-submodules=update \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -179,6 +179,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
||||
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
|
||||
index 009fab1515..95914e6ebc 100644
|
||||
--- a/scripts/meson-buildoptions.sh
|
||||
+++ b/scripts/meson-buildoptions.sh
|
||||
@@ -142,6 +142,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' qed qed image format support'
|
||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||
printf "%s\n" ' replication replication support'
|
||||
printf "%s\n" ' sdl SDL user interface'
|
||||
@@ -388,6 +389,8 @@ _meson_option_parse() {
|
||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||
--enable-replication) printf "%s" -Dreplication=enabled ;;
|
@@ -7,12 +7,13 @@ set -e
|
||||
VITASTOR=$(dirname $0)
|
||||
VITASTOR=$(realpath "$VITASTOR/..")
|
||||
|
||||
EL=$(rpm --eval '%dist')
|
||||
if [ "$EL" = ".el8" ]; then
|
||||
if [ -d /opt/rh/gcc-toolset-9 ]; then
|
||||
# CentOS 8
|
||||
EL=8
|
||||
. /opt/rh/gcc-toolset-9/enable
|
||||
elif [ "$EL" = ".el7" ]; then
|
||||
else
|
||||
# CentOS 7
|
||||
EL=7
|
||||
. /opt/rh/devtoolset-9/enable
|
||||
fi
|
||||
cd ~/rpmbuild/SPECS
|
||||
@@ -24,4 +25,4 @@ rm fio
|
||||
mv fio-copy fio
|
||||
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
|
||||
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
|
||||
tar --transform 's#^#vitastor-0.8.8/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.8$(rpm --eval '%dist').tar.gz *
|
||||
tar --transform 's#^#vitastor-0.8.5/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.5$(rpm --eval '%dist').tar.gz *
|
||||
|
@@ -1,93 +0,0 @@
|
||||
--- qemu-kvm.spec.orig 2023-02-28 08:04:06.000000000 +0000
|
||||
+++ qemu-kvm.spec 2023-04-27 22:29:18.094878829 +0000
|
||||
@@ -100,8 +100,6 @@
|
||||
%endif
|
||||
|
||||
%global target_list %{kvm_target}-softmmu
|
||||
-%global block_drivers_rw_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,compress
|
||||
-%global block_drivers_ro_list vdi,vmdk,vhdx,vpc,https
|
||||
%define qemudocdir %{_docdir}/%{name}
|
||||
%global firmwaredirs "%{_datadir}/qemu-firmware:%{_datadir}/ipxe/qemu:%{_datadir}/seavgabios:%{_datadir}/seabios"
|
||||
|
||||
@@ -129,6 +127,7 @@ Requires: %{name}-device-usb-host = %{ep
|
||||
Requires: %{name}-device-usb-redirect = %{epoch}:%{version}-%{release} \
|
||||
%endif \
|
||||
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
|
||||
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
|
||||
Requires: %{name}-audio-pa = %{epoch}:%{version}-%{release}
|
||||
|
||||
# Since SPICE is removed from RHEL-9, the following Obsoletes:
|
||||
@@ -151,7 +150,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}
|
||||
Summary: QEMU is a machine emulator and virtualizer
|
||||
Name: qemu-kvm
|
||||
Version: 7.0.0
|
||||
-Release: 13%{?rcrel}%{?dist}%{?cc_suffix}.2
|
||||
+Release: 13.vitastor%{?rcrel}%{?dist}%{?cc_suffix}
|
||||
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
|
||||
# Epoch 15 used for RHEL 8
|
||||
# Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5)
|
||||
@@ -174,6 +173,7 @@ Source28: 95-kvm-memlock.conf
|
||||
Source30: kvm-s390x.conf
|
||||
Source31: kvm-x86.conf
|
||||
Source36: README.tests
|
||||
+Source37: qemu-vitastor.c
|
||||
|
||||
|
||||
Patch0004: 0004-Initial-redhat-build.patch
|
||||
@@ -498,6 +498,7 @@ Patch171: kvm-i386-do-kvm_put_msr_featur
|
||||
Patch172: kvm-target-i386-kvm-fix-kvmclock_current_nsec-Assertion-.patch
|
||||
# For bz#2168221 - while live-migrating many instances concurrently, libvirt sometimes return internal error: migration was active, but no RAM info was set [rhel-9.1.0.z]
|
||||
Patch173: kvm-migration-Read-state-once.patch
|
||||
+Patch174: qemu-7.0-vitastor.patch
|
||||
|
||||
# Source-git patches
|
||||
|
||||
@@ -531,6 +532,7 @@ BuildRequires: libcurl-devel
|
||||
%if %{have_block_rbd}
|
||||
BuildRequires: librbd-devel
|
||||
%endif
|
||||
+BuildRequires: vitastor-client-devel
|
||||
# We need both because the 'stap' binary is probed for by configure
|
||||
BuildRequires: systemtap
|
||||
BuildRequires: systemtap-sdt-devel
|
||||
@@ -718,6 +720,14 @@ using the rbd protocol.
|
||||
%endif
|
||||
|
||||
|
||||
+%package block-vitastor
|
||||
+Summary: QEMU Vitastor block driver
|
||||
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
|
||||
+
|
||||
+%description block-vitastor
|
||||
+This package provides the additional Vitastor block driver for QEMU.
|
||||
+
|
||||
+
|
||||
%package audio-pa
|
||||
Summary: QEMU PulseAudio audio driver
|
||||
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
|
||||
@@ -811,6 +821,7 @@ This package provides usbredir support.
|
||||
%prep
|
||||
%setup -q -n qemu-%{version}%{?rcstr}
|
||||
%autopatch -p1
|
||||
+cp %{SOURCE37} ./block/vitastor.c
|
||||
|
||||
%global qemu_kvm_build qemu_kvm_build
|
||||
mkdir -p %{qemu_kvm_build}
|
||||
@@ -1032,6 +1043,7 @@ run_configure \
|
||||
%if %{have_block_rbd}
|
||||
--enable-rbd \
|
||||
%endif
|
||||
+ --enable-vitastor \
|
||||
%if %{have_librdma}
|
||||
--enable-rdma \
|
||||
%endif
|
||||
@@ -1511,6 +1523,9 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
|
||||
%files block-rbd
|
||||
%{_libdir}/%{name}/block-rbd.so
|
||||
%endif
|
||||
+%files block-vitastor
|
||||
+%{_libdir}/%{name}/block-vitastor.so
|
||||
+
|
||||
%files audio-pa
|
||||
%{_libdir}/%{name}/audio-pa.so
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.8.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.5.el7.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.8
|
||||
Version: 0.8.5
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.8.el7.tar.gz
|
||||
Source0: vitastor-0.8.5.el7.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -35,7 +35,7 @@ ADD . /root/vitastor
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.8.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp /root/vitastor-0.8.5.el8.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
|
@@ -1,11 +1,11 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.8
|
||||
Version: 0.8.5
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.8.el8.tar.gz
|
||||
Source0: vitastor-0.8.5.el8.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
|
@@ -1,28 +0,0 @@
|
||||
# Build packages for AlmaLinux 9 inside a container
|
||||
# cd ..; podman build -t vitastor-el9 -v `pwd`/packages:/root/packages -f rpm/vitastor-el9.Dockerfile .
|
||||
|
||||
FROM almalinux:9
|
||||
|
||||
WORKDIR /root
|
||||
|
||||
RUN sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/*.repo
|
||||
RUN dnf -y install epel-release dnf-plugins-core
|
||||
RUN dnf -y install https://vitastor.io/rpms/centos/9/vitastor-release-1.0-1.el9.noarch.rpm
|
||||
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive liburing-devel cmake
|
||||
RUN dnf download --source fio
|
||||
RUN rpm --nomd5 -i fio*.src.rpm
|
||||
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --spec fio.spec
|
||||
|
||||
ADD . /root/vitastor
|
||||
|
||||
RUN set -e; \
|
||||
cd /root/vitastor/rpm; \
|
||||
sh build-tarball.sh; \
|
||||
cp /root/vitastor-0.8.8.el9.tar.gz ~/rpmbuild/SOURCES; \
|
||||
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
|
||||
cd ~/rpmbuild/SPECS/; \
|
||||
rpmbuild -ba vitastor.spec; \
|
||||
mkdir -p /root/packages/vitastor-el9; \
|
||||
rm -rf /root/packages/vitastor-el9/*; \
|
||||
cp ~/rpmbuild/RPMS/*/vitastor* /root/packages/vitastor-el9/; \
|
||||
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el9/
|
@@ -1,158 +0,0 @@
|
||||
Name: vitastor
|
||||
Version: 0.8.8
|
||||
Release: 1%{?dist}
|
||||
Summary: Vitastor, a fast software-defined clustered block storage
|
||||
|
||||
License: Vitastor Network Public License 1.1
|
||||
URL: https://vitastor.io/
|
||||
Source0: vitastor-0.8.8.el9.tar.gz
|
||||
|
||||
BuildRequires: liburing-devel >= 0.6
|
||||
BuildRequires: gperftools-devel
|
||||
BuildRequires: gcc-c++
|
||||
BuildRequires: nodejs >= 10
|
||||
BuildRequires: jerasure-devel
|
||||
BuildRequires: libisa-l-devel
|
||||
BuildRequires: gf-complete-devel
|
||||
BuildRequires: rdma-core-devel
|
||||
BuildRequires: cmake
|
||||
Requires: vitastor-osd = %{version}-%{release}
|
||||
Requires: vitastor-mon = %{version}-%{release}
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: vitastor-client-devel = %{version}-%{release}
|
||||
Requires: vitastor-fio = %{version}-%{release}
|
||||
|
||||
%description
|
||||
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
|
||||
architecturally similar to Ceph which means strong consistency, primary-replication,
|
||||
symmetric clustering and automatic data distribution over any number of drives of any
|
||||
size with configurable redundancy (replication or erasure codes/XOR).
|
||||
|
||||
|
||||
%package -n vitastor-osd
|
||||
Summary: Vitastor - OSD
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: util-linux
|
||||
Requires: parted
|
||||
|
||||
|
||||
%description -n vitastor-osd
|
||||
Vitastor object storage daemon, i.e. server program that stores data.
|
||||
|
||||
|
||||
%package -n vitastor-mon
|
||||
Summary: Vitastor - monitor
|
||||
Requires: nodejs >= 10
|
||||
Requires: lpsolve
|
||||
|
||||
|
||||
%description -n vitastor-mon
|
||||
Vitastor monitor, i.e. server program responsible for watching cluster state and
|
||||
scheduling cluster-level operations.
|
||||
|
||||
|
||||
%package -n vitastor-client
|
||||
Summary: Vitastor - client
|
||||
|
||||
|
||||
%description -n vitastor-client
|
||||
Vitastor client library and command-line interface.
|
||||
|
||||
|
||||
%package -n vitastor-client-devel
|
||||
Summary: Vitastor - development files
|
||||
Group: Development/Libraries
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
|
||||
|
||||
%description -n vitastor-client-devel
|
||||
Vitastor library headers for development.
|
||||
|
||||
|
||||
%package -n vitastor-fio
|
||||
Summary: Vitastor - fio drivers
|
||||
Group: Development/Libraries
|
||||
Requires: vitastor-client = %{version}-%{release}
|
||||
Requires: fio = 3.27-7.el9
|
||||
|
||||
|
||||
%description -n vitastor-fio
|
||||
Vitastor fio drivers for benchmarking.
|
||||
|
||||
|
||||
%prep
|
||||
%setup -q
|
||||
|
||||
|
||||
%build
|
||||
%cmake
|
||||
%cmake_build
|
||||
|
||||
|
||||
%install
|
||||
rm -rf $RPM_BUILD_ROOT
|
||||
%cmake_install
|
||||
cd mon
|
||||
npm install
|
||||
cd ..
|
||||
mkdir -p %buildroot/usr/lib/vitastor
|
||||
cp -r mon %buildroot/usr/lib/vitastor
|
||||
mkdir -p %buildroot/lib/systemd/system
|
||||
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
|
||||
mkdir -p %buildroot/lib/udev/rules.d
|
||||
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
|
||||
|
||||
|
||||
%files
|
||||
%doc GPL-2.0.txt VNPL-1.1.txt README.md README-ru.md
|
||||
|
||||
|
||||
%files -n vitastor-osd
|
||||
%_bindir/vitastor-osd
|
||||
%_bindir/vitastor-disk
|
||||
%_bindir/vitastor-dump-journal
|
||||
/lib/systemd/system/vitastor-osd@.service
|
||||
/lib/systemd/system/vitastor.target
|
||||
/lib/udev/rules.d/90-vitastor.rules
|
||||
|
||||
|
||||
%pre -n vitastor-osd
|
||||
groupadd -r -f vitastor 2>/dev/null ||:
|
||||
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
|
||||
install -o vitastor -g vitastor -d /var/log/vitastor
|
||||
mkdir -p /etc/vitastor
|
||||
|
||||
|
||||
%files -n vitastor-mon
|
||||
/usr/lib/vitastor/mon
|
||||
/lib/systemd/system/vitastor-mon.service
|
||||
|
||||
|
||||
%pre -n vitastor-mon
|
||||
groupadd -r -f vitastor 2>/dev/null ||:
|
||||
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
|
||||
mkdir -p /etc/vitastor
|
||||
|
||||
|
||||
%files -n vitastor-client
|
||||
%_bindir/vitastor-nbd
|
||||
%_bindir/vitastor-nfs
|
||||
%_bindir/vitastor-cli
|
||||
%_bindir/vitastor-rm
|
||||
%_bindir/vita
|
||||
%_libdir/libvitastor_blk.so*
|
||||
%_libdir/libvitastor_client.so*
|
||||
|
||||
|
||||
%files -n vitastor-client-devel
|
||||
%_includedir/vitastor_c.h
|
||||
%_libdir/pkgconfig
|
||||
|
||||
|
||||
%files -n vitastor-fio
|
||||
%_libdir/libfio_vitastor.so
|
||||
%_libdir/libfio_vitastor_blk.so
|
||||
%_libdir/libfio_vitastor_sec.so
|
||||
|
||||
|
||||
%changelog
|
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 2.8.12)
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
project(vitastor)
|
||||
|
||||
@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||
endif()
|
||||
|
||||
add_definitions(-DVERSION="0.8.8")
|
||||
add_definitions(-DVERSION="0.8.5")
|
||||
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
|
||||
if (${WITH_ASAN})
|
||||
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
|
||||
@@ -111,7 +111,7 @@ target_compile_options(vitastor_common PUBLIC -fPIC)
|
||||
add_executable(vitastor-osd
|
||||
osd_main.cpp osd.cpp osd_secondary.cpp osd_peering.cpp osd_flush.cpp osd_peering_pg.cpp
|
||||
osd_primary.cpp osd_primary_chain.cpp osd_primary_sync.cpp osd_primary_write.cpp osd_primary_subops.cpp
|
||||
osd_cluster.cpp osd_rmw.cpp
|
||||
osd_cluster.cpp osd_rmw.cpp osd_scrub.cpp
|
||||
)
|
||||
target_link_libraries(vitastor-osd
|
||||
vitastor_common
|
||||
@@ -299,7 +299,7 @@ add_executable(test_cluster_client
|
||||
EXCLUDE_FROM_ALL
|
||||
test_cluster_client.cpp
|
||||
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
|
||||
etcd_state_client.cpp timerfd_manager.cpp ../json11/json11.cpp
|
||||
etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
|
||||
)
|
||||
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
|
||||
target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mock)
|
||||
|
@@ -13,11 +13,6 @@ blockstore_t::~blockstore_t()
|
||||
delete impl;
|
||||
}
|
||||
|
||||
void blockstore_t::parse_config(blockstore_config_t & config)
|
||||
{
|
||||
impl->parse_config(config, false);
|
||||
}
|
||||
|
||||
void blockstore_t::loop()
|
||||
{
|
||||
impl->loop();
|
||||
|
@@ -107,7 +107,7 @@ Input:
|
||||
- buf = pre-allocated obj_ver_id array <len> units long
|
||||
|
||||
Output:
|
||||
- retval = 0 or negative error number (-ENOENT if no such version for stabilize)
|
||||
- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)
|
||||
|
||||
## BS_OP_SYNC_STAB_ALL
|
||||
|
||||
@@ -122,11 +122,14 @@ Output:
|
||||
Get a list of all objects in this Blockstore.
|
||||
|
||||
Input:
|
||||
- oid.stripe = PG alignment
|
||||
- len = PG count or 0 to list all objects
|
||||
- offset = PG number
|
||||
- oid.inode = min inode number or 0 to list all inodes
|
||||
- version = max inode number or 0 to list all inodes
|
||||
- pg_alignment = PG alignment
|
||||
- pg_count = PG count or 0 to list all objects
|
||||
- pg_number = PG number
|
||||
- list_stable_limit = max number of clean objects in the reply
|
||||
it's guaranteed that dirty objects are returned from the same interval,
|
||||
i.e. from (min_oid .. min(max_oid, max(returned stable OIDs)))
|
||||
- min_oid = min inode/stripe or 0 to list all objects
|
||||
- max_oid = max inode/stripe or 0 to list all objects
|
||||
|
||||
Output:
|
||||
- retval = total obj_ver_id count
|
||||
@@ -143,10 +146,27 @@ struct blockstore_op_t
|
||||
uint64_t opcode;
|
||||
// finish callback
|
||||
std::function<void (blockstore_op_t*)> callback;
|
||||
object_id oid;
|
||||
uint64_t version;
|
||||
uint32_t offset;
|
||||
uint32_t len;
|
||||
union
|
||||
{
|
||||
// R/W
|
||||
struct
|
||||
{
|
||||
object_id oid;
|
||||
uint64_t version;
|
||||
uint32_t offset;
|
||||
uint32_t len;
|
||||
};
|
||||
// List
|
||||
struct __attribute__((__packed__))
|
||||
{
|
||||
object_id min_oid;
|
||||
object_id max_oid;
|
||||
uint32_t pg_alignment;
|
||||
uint32_t pg_count;
|
||||
uint32_t pg_number;
|
||||
uint32_t list_stable_limit;
|
||||
};
|
||||
};
|
||||
void *buf;
|
||||
void *bitmap;
|
||||
int retval;
|
||||
@@ -165,9 +185,6 @@ public:
|
||||
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
||||
~blockstore_t();
|
||||
|
||||
// Update configuration
|
||||
void parse_config(blockstore_config_t & config);
|
||||
|
||||
// Event loop
|
||||
void loop();
|
||||
|
||||
|
@@ -932,7 +932,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
|
||||
resume_1:
|
||||
if (!cur_sync->state)
|
||||
{
|
||||
if (flusher->syncing_flushers >= flusher->active_flushers || !flusher->flush_queue.size())
|
||||
if (flusher->syncing_flushers >= flusher->cur_flusher_count || !flusher->flush_queue.size())
|
||||
{
|
||||
// Sync batch is ready. Do it.
|
||||
await_sqe(0);
|
||||
|
@@ -11,7 +11,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
|
||||
ring_consumer.loop = [this]() { loop(); };
|
||||
ringloop->register_consumer(&ring_consumer);
|
||||
initialized = 0;
|
||||
parse_config(config, true);
|
||||
parse_config(config);
|
||||
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
|
||||
try
|
||||
{
|
||||
@@ -171,7 +171,7 @@ void blockstore_impl_t::loop()
|
||||
// Can't submit SYNC before previous writes
|
||||
continue;
|
||||
}
|
||||
wr_st = continue_sync(op);
|
||||
wr_st = continue_sync(op, false);
|
||||
if (wr_st != 2)
|
||||
{
|
||||
has_writes = wr_st > 0 ? 1 : 2;
|
||||
@@ -371,18 +371,13 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
|
||||
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
|
||||
return;
|
||||
}
|
||||
init_op(op);
|
||||
submit_queue.push_back(op);
|
||||
ringloop->wakeup();
|
||||
}
|
||||
|
||||
void blockstore_impl_t::init_op(blockstore_op_t *op)
|
||||
{
|
||||
// Call constructor without allocating memory. We'll call destructor before returning op back
|
||||
new ((void*)op->private_data) blockstore_op_private_t;
|
||||
PRIV(op)->wait_for = 0;
|
||||
PRIV(op)->op_state = 0;
|
||||
PRIV(op)->pending_ops = 0;
|
||||
submit_queue.push_back(op);
|
||||
ringloop->wakeup();
|
||||
}
|
||||
|
||||
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
|
||||
@@ -450,11 +445,11 @@ void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint
|
||||
|
||||
void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
{
|
||||
uint32_t list_pg = op->offset+1;
|
||||
uint32_t pg_count = op->len;
|
||||
uint64_t pg_stripe_size = op->oid.stripe;
|
||||
uint64_t min_inode = op->oid.inode;
|
||||
uint64_t max_inode = op->version;
|
||||
uint32_t list_pg = op->pg_number+1;
|
||||
uint32_t pg_count = op->pg_count;
|
||||
uint64_t pg_stripe_size = op->pg_alignment;
|
||||
uint64_t min_inode = op->min_oid.inode;
|
||||
uint64_t max_inode = op->max_oid.inode;
|
||||
// Check PG
|
||||
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
|
||||
{
|
||||
@@ -501,7 +496,13 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
stable_alloc += clean_db.size();
|
||||
}
|
||||
}
|
||||
else
|
||||
if (op->list_stable_limit > 0)
|
||||
{
|
||||
stable_alloc = op->list_stable_limit;
|
||||
if (stable_alloc > 1024*1024)
|
||||
stable_alloc = 1024*1024;
|
||||
}
|
||||
if (stable_alloc < 32768)
|
||||
{
|
||||
stable_alloc = 32768;
|
||||
}
|
||||
@@ -512,22 +513,21 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
FINISH_OP(op);
|
||||
return;
|
||||
}
|
||||
auto max_oid = op->max_oid;
|
||||
bool limited = false;
|
||||
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
|
||||
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
|
||||
shard_it++)
|
||||
{
|
||||
auto & clean_db = shard_it->second;
|
||||
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
|
||||
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
|
||||
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||
{
|
||||
clean_it = clean_db.lower_bound({
|
||||
.inode = min_inode,
|
||||
.stripe = 0,
|
||||
});
|
||||
clean_end = clean_db.upper_bound({
|
||||
.inode = max_inode,
|
||||
.stripe = UINT64_MAX,
|
||||
});
|
||||
clean_it = clean_db.lower_bound(op->min_oid);
|
||||
}
|
||||
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||
{
|
||||
clean_end = clean_db.upper_bound(max_oid);
|
||||
}
|
||||
for (; clean_it != clean_end; clean_it++)
|
||||
{
|
||||
@@ -546,11 +546,24 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
.oid = clean_it->first,
|
||||
.version = clean_it->second.version,
|
||||
};
|
||||
if (op->list_stable_limit > 0 && !limited && stable_count >= op->list_stable_limit)
|
||||
{
|
||||
limited = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (op->list_stable_limit > 0 && first_shard != last_shard)
|
||||
{
|
||||
// To maintain the order, we have to include objects in the same range from other shards
|
||||
std::sort(stable, stable+stable_count);
|
||||
if (stable_count > op->list_stable_limit)
|
||||
stable_count = op->list_stable_limit;
|
||||
max_oid = stable[stable_count-1].oid;
|
||||
}
|
||||
}
|
||||
if (first_shard != last_shard)
|
||||
if (op->list_stable_limit == 0 && first_shard != last_shard)
|
||||
{
|
||||
// If that's not a per-PG listing, sort clean entries
|
||||
// If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
|
||||
std::sort(stable, stable+stable_count);
|
||||
}
|
||||
int clean_stable_count = stable_count;
|
||||
@@ -559,20 +572,17 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
|
||||
obj_ver_id *unstable = NULL;
|
||||
{
|
||||
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
|
||||
if ((min_inode != 0 || max_inode != 0) && min_inode <= max_inode)
|
||||
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
|
||||
{
|
||||
dirty_it = dirty_db.lower_bound({
|
||||
.oid = {
|
||||
.inode = min_inode,
|
||||
.stripe = 0,
|
||||
},
|
||||
.oid = op->min_oid,
|
||||
.version = 0,
|
||||
});
|
||||
}
|
||||
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
|
||||
{
|
||||
dirty_end = dirty_db.upper_bound({
|
||||
.oid = {
|
||||
.inode = max_inode,
|
||||
.stripe = UINT64_MAX,
|
||||
},
|
||||
.oid = max_oid,
|
||||
.version = UINT64_MAX,
|
||||
});
|
||||
}
|
||||
|
@@ -216,11 +216,6 @@ struct pool_shard_settings_t
|
||||
uint32_t pg_stripe_size;
|
||||
};
|
||||
|
||||
#define STAB_SPLIT_DONE 1
|
||||
#define STAB_SPLIT_WAIT 2
|
||||
#define STAB_SPLIT_SYNC 3
|
||||
#define STAB_SPLIT_TODO 4
|
||||
|
||||
class blockstore_impl_t
|
||||
{
|
||||
blockstore_disk_t dsk;
|
||||
@@ -282,6 +277,7 @@ class blockstore_impl_t
|
||||
friend class journal_flusher_t;
|
||||
friend class journal_flusher_co;
|
||||
|
||||
void parse_config(blockstore_config_t & config);
|
||||
void calc_lengths();
|
||||
void open_data();
|
||||
void open_meta();
|
||||
@@ -303,7 +299,6 @@ class blockstore_impl_t
|
||||
blockstore_init_journal* journal_init_reader;
|
||||
|
||||
void check_wait(blockstore_op_t *op);
|
||||
void init_op(blockstore_op_t *op);
|
||||
|
||||
// Read
|
||||
int dequeue_read(blockstore_op_t *read_op);
|
||||
@@ -323,7 +318,7 @@ class blockstore_impl_t
|
||||
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
||||
|
||||
// Sync
|
||||
int continue_sync(blockstore_op_t *op);
|
||||
int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync);
|
||||
void ack_sync(blockstore_op_t *op);
|
||||
|
||||
// Stabilize
|
||||
@@ -331,8 +326,6 @@ class blockstore_impl_t
|
||||
int continue_stable(blockstore_op_t *op);
|
||||
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
|
||||
void stabilize_object(object_id oid, uint64_t max_ver);
|
||||
blockstore_op_t* selective_sync(blockstore_op_t *op);
|
||||
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
|
||||
|
||||
// Rollback
|
||||
int dequeue_rollback(blockstore_op_t *op);
|
||||
@@ -348,8 +341,6 @@ public:
|
||||
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
|
||||
~blockstore_impl_t();
|
||||
|
||||
void parse_config(blockstore_config_t & config, bool init);
|
||||
|
||||
// Event loop
|
||||
void loop();
|
||||
|
||||
|
@@ -4,54 +4,8 @@
|
||||
#include <sys/file.h>
|
||||
#include "blockstore_impl.h"
|
||||
|
||||
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
void blockstore_impl_t::parse_config(blockstore_config_t & config)
|
||||
{
|
||||
// Online-configurable options:
|
||||
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
|
||||
if (!max_flusher_count)
|
||||
{
|
||||
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
||||
}
|
||||
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
||||
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
||||
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
||||
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
||||
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
|
||||
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
|
||||
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
|
||||
if (!max_flusher_count)
|
||||
{
|
||||
max_flusher_count = 256;
|
||||
}
|
||||
if (!min_flusher_count || journal.flush_journal)
|
||||
{
|
||||
min_flusher_count = 1;
|
||||
}
|
||||
if (!max_write_iodepth)
|
||||
{
|
||||
max_write_iodepth = 128;
|
||||
}
|
||||
if (!throttle_target_iops)
|
||||
{
|
||||
throttle_target_iops = 100;
|
||||
}
|
||||
if (!throttle_target_mbs)
|
||||
{
|
||||
throttle_target_mbs = 100;
|
||||
}
|
||||
if (!throttle_target_parallelism)
|
||||
{
|
||||
throttle_target_parallelism = 1;
|
||||
}
|
||||
if (!throttle_threshold_us)
|
||||
{
|
||||
throttle_threshold_us = 50;
|
||||
}
|
||||
if (!init)
|
||||
{
|
||||
return;
|
||||
}
|
||||
// Offline-configurable options:
|
||||
// Common disk options
|
||||
dsk.parse_config(config);
|
||||
// Parse
|
||||
@@ -90,7 +44,29 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
|
||||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
||||
journal.inmemory = config["inmemory_journal"] != "false";
|
||||
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
|
||||
if (!max_flusher_count)
|
||||
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
|
||||
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
|
||||
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
|
||||
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
|
||||
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
|
||||
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
|
||||
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
|
||||
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
|
||||
// Validate
|
||||
if (!max_flusher_count)
|
||||
{
|
||||
max_flusher_count = 256;
|
||||
}
|
||||
if (!min_flusher_count || journal.flush_journal)
|
||||
{
|
||||
min_flusher_count = 1;
|
||||
}
|
||||
if (!max_write_iodepth)
|
||||
{
|
||||
max_write_iodepth = 128;
|
||||
}
|
||||
if (journal.sector_count < 2)
|
||||
{
|
||||
journal.sector_count = 32;
|
||||
@@ -115,6 +91,22 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
||||
{
|
||||
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
|
||||
}
|
||||
if (!throttle_target_iops)
|
||||
{
|
||||
throttle_target_iops = 100;
|
||||
}
|
||||
if (!throttle_target_mbs)
|
||||
{
|
||||
throttle_target_mbs = 100;
|
||||
}
|
||||
if (!throttle_target_parallelism)
|
||||
{
|
||||
throttle_target_parallelism = 1;
|
||||
}
|
||||
if (!throttle_threshold_us)
|
||||
{
|
||||
throttle_threshold_us = 50;
|
||||
}
|
||||
// init some fields
|
||||
journal.block_size = dsk.journal_block_size;
|
||||
journal.next_free = dsk.journal_block_size;
|
||||
|
@@ -9,39 +9,48 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
|
||||
{
|
||||
return continue_rollback(op);
|
||||
}
|
||||
int r = split_stab_op(op, [this](obj_ver_id ov)
|
||||
obj_ver_id *v, *nv;
|
||||
int i, todo = op->len;
|
||||
for (i = 0, v = (obj_ver_id*)op->buf, nv = (obj_ver_id*)op->buf; i < op->len; i++, v++, nv++)
|
||||
{
|
||||
if (nv != v)
|
||||
{
|
||||
*nv = *v;
|
||||
}
|
||||
// Check that there are some versions greater than v->version (which may be zero),
|
||||
// check that they're unstable, synced, and not currently written to
|
||||
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
|
||||
.oid = ov.oid,
|
||||
.oid = v->oid,
|
||||
.version = UINT64_MAX,
|
||||
});
|
||||
if (dirty_it == dirty_db.begin())
|
||||
{
|
||||
skip_ov:
|
||||
// Already rolled back, skip this object version
|
||||
return STAB_SPLIT_DONE;
|
||||
todo--;
|
||||
nv--;
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
dirty_it--;
|
||||
if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
|
||||
if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version)
|
||||
{
|
||||
// Already rolled back, skip this object version
|
||||
return STAB_SPLIT_DONE;
|
||||
goto skip_ov;
|
||||
}
|
||||
while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
|
||||
while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version)
|
||||
{
|
||||
if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||
{
|
||||
// Object write is still in progress. Wait until the write request completes
|
||||
return STAB_SPLIT_WAIT;
|
||||
return 0;
|
||||
}
|
||||
else if (!IS_SYNCED(dirty_it->second.state) ||
|
||||
IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
// Sync the object
|
||||
return STAB_SPLIT_SYNC;
|
||||
op->retval = -EBUSY;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
if (dirty_it == dirty_db.begin())
|
||||
{
|
||||
@@ -49,16 +58,19 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
|
||||
}
|
||||
dirty_it--;
|
||||
}
|
||||
return STAB_SPLIT_TODO;
|
||||
}
|
||||
});
|
||||
if (r != 1)
|
||||
}
|
||||
op->len = todo;
|
||||
if (!todo)
|
||||
{
|
||||
return r;
|
||||
// Already rolled back
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
// Check journal space
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
|
||||
if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -66,8 +78,7 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
|
||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||
// Prepare and submit journal entries
|
||||
int s = 0;
|
||||
auto v = (obj_ver_id*)op->buf;
|
||||
for (int i = 0; i < op->len; i++, v++)
|
||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||
{
|
||||
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
|
||||
journal.sector_info[journal.cur_sector].dirty)
|
||||
|
@@ -41,309 +41,60 @@
|
||||
// 4) after a while it takes its synced object list and sends stabilize requests
|
||||
// to peers and to its own blockstore, thus freeing the old version
|
||||
|
||||
struct ver_vector_t
|
||||
{
|
||||
obj_ver_id *items = NULL;
|
||||
uint64_t alloc = 0, size = 0;
|
||||
};
|
||||
|
||||
// Lazily initialise <vec>.
// On the first call (while vec.items is still NULL) allocates room for <len>
// versions and copies the range [start, end) into it.
// Does nothing when <vec> already owns a buffer.
static void init_versions(ver_vector_t & vec, obj_ver_id *start, obj_ver_id *end, uint64_t len)
{
    if (vec.items)
    {
        // Already initialised by an earlier call
        return;
    }
    vec.alloc = len;
    vec.items = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * vec.alloc);
    obj_ver_id *src = start;
    while (src < end)
    {
        vec.items[vec.size] = *src;
        vec.size++;
        src++;
    }
}
|
||||
|
||||
// Append <ov> to <vec>, growing the buffer when it is full.
// Capacity starts at 4 entries and doubles on every subsequent growth.
static void append_version(ver_vector_t & vec, obj_ver_id ov)
{
    if (vec.size >= vec.alloc)
    {
        uint64_t new_alloc = vec.alloc ? vec.alloc*2 : 4;
        vec.alloc = new_alloc;
        vec.items = (obj_ver_id*)realloc_or_die(vec.items, sizeof(obj_ver_id) * new_alloc);
    }
    vec.items[vec.size] = ov;
    vec.size++;
}
|
||||
|
||||
static bool check_unsynced(std::vector<obj_ver_id> & check, obj_ver_id ov, std::vector<obj_ver_id> & to, int *count)
|
||||
{
|
||||
bool found = false;
|
||||
int j = 0, k = 0;
|
||||
while (j < check.size())
|
||||
{
|
||||
if (check[j] == ov)
|
||||
found = true;
|
||||
if (check[j].oid == ov.oid && check[j].version <= ov.version)
|
||||
{
|
||||
to.push_back(check[j++]);
|
||||
if (count)
|
||||
(*count)--;
|
||||
}
|
||||
else
|
||||
check[k++] = check[j++];
|
||||
}
|
||||
check.resize(k);
|
||||
return found;
|
||||
}
|
||||
|
||||
// Start a SYNC that covers only the writes collected in
// PRIV(op)->sync_big_writes / PRIV(op)->sync_small_writes.
//
// The global unsynced lists are temporarily swapped with the op's lists so
// that continue_sync() sees only the selected writes, and are swapped back
// afterwards. unsynced_big_write_count is adjusted around each swap of the
// big write list.
//
// Returns the new sync operation when it is still pending (it has then been
// appended to submit_queue), or NULL when the sync completed immediately.
blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
{
    // Substitute the selected writes for the global unsynced lists
    unsynced_big_write_count -= unsynced_big_writes.size();
    unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
    unsynced_big_write_count += unsynced_big_writes.size();
    unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
    // Build the sync operation. By default it just destroys itself on completion
    blockstore_op_t *sync_op = new blockstore_op_t;
    sync_op->opcode = BS_OP_SYNC;
    sync_op->buf = NULL;
    sync_op->callback = [this](blockstore_op_t *finished_op)
    {
        delete finished_op;
    };
    init_op(sync_op);
    // 2 means the sync has already finished
    const bool done_immediately = (continue_sync(sync_op) == 2);
    if (!done_immediately)
    {
        // Still in progress - let the queue drive it further
        submit_queue.push_back(sync_op);
    }
    // Put the global unsynced lists back
    unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
    unsynced_big_write_count -= unsynced_big_writes.size();
    unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
    unsynced_big_write_count += unsynced_big_writes.size();
    return done_immediately ? NULL : sync_op;
}
|
||||
|
||||
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not dequeue, 1 = proceed with op itself
|
||||
int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider)
|
||||
{
|
||||
bool add_sync = false;
|
||||
ver_vector_t good_vers, bad_vers;
|
||||
obj_ver_id* v;
|
||||
int i, todo = 0;
|
||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||
{
|
||||
int action = decider(*v);
|
||||
if (action < 0)
|
||||
{
|
||||
// Rollback changes
|
||||
for (auto & ov: PRIV(op)->sync_big_writes)
|
||||
{
|
||||
unsynced_big_writes.push_back(ov);
|
||||
unsynced_big_write_count++;
|
||||
}
|
||||
for (auto & ov: PRIV(op)->sync_small_writes)
|
||||
{
|
||||
unsynced_small_writes.push_back(ov);
|
||||
}
|
||||
free(good_vers.items);
|
||||
good_vers.items = NULL;
|
||||
free(bad_vers.items);
|
||||
bad_vers.items = NULL;
|
||||
// Error
|
||||
op->retval = action;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
else if (action == STAB_SPLIT_DONE)
|
||||
{
|
||||
// Already done
|
||||
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
||||
}
|
||||
else if (action == STAB_SPLIT_WAIT)
|
||||
{
|
||||
// Already in progress, we just have to wait until it finishes
|
||||
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
||||
append_version(bad_vers, *v);
|
||||
}
|
||||
else if (action == STAB_SPLIT_SYNC)
|
||||
{
|
||||
// Needs a SYNC, we have to send a SYNC if not already in progress
|
||||
//
|
||||
// If the object is not present in unsynced_(big|small)_writes then
|
||||
// it's currently being synced. If it's present then we can initiate
|
||||
// its sync ourselves.
|
||||
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
|
||||
append_version(bad_vers, *v);
|
||||
if (!add_sync)
|
||||
{
|
||||
PRIV(op)->sync_big_writes.clear();
|
||||
PRIV(op)->sync_small_writes.clear();
|
||||
add_sync = true;
|
||||
}
|
||||
check_unsynced(unsynced_small_writes, *v, PRIV(op)->sync_small_writes, NULL);
|
||||
check_unsynced(unsynced_big_writes, *v, PRIV(op)->sync_big_writes, &unsynced_big_write_count);
|
||||
}
|
||||
else /* if (action == STAB_SPLIT_TODO) */
|
||||
{
|
||||
if (good_vers.items)
|
||||
{
|
||||
// If we're selecting versions then append it
|
||||
// Main idea is that 99% of the time all versions passed to BS_OP_STABLE are synced
|
||||
// And we don't want to select/allocate anything in that optimistic case
|
||||
append_version(good_vers, *v);
|
||||
}
|
||||
todo++;
|
||||
}
|
||||
}
|
||||
// In a pessimistic scenario, an operation may be split into 3:
|
||||
// - Stabilize synced entries
|
||||
// - Sync unsynced entries
|
||||
// - Continue for unsynced entries after sync
|
||||
add_sync = add_sync && (PRIV(op)->sync_big_writes.size() || PRIV(op)->sync_small_writes.size());
|
||||
if (!todo && !bad_vers.size)
|
||||
{
|
||||
// Already stable
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
op->retval = 0;
|
||||
if (!todo && !add_sync)
|
||||
{
|
||||
// Only wait for inflight writes or current in-progress syncs
|
||||
return 0;
|
||||
}
|
||||
blockstore_op_t *sync_op = NULL, *split_stab_op = NULL;
|
||||
if (add_sync)
|
||||
{
|
||||
// Initiate a selective sync for PRIV(op)->sync_(big|small)_writes
|
||||
sync_op = selective_sync(op);
|
||||
}
|
||||
if (bad_vers.size)
|
||||
{
|
||||
// Split part of the request into a separate operation
|
||||
split_stab_op = new blockstore_op_t;
|
||||
split_stab_op->opcode = op->opcode;
|
||||
split_stab_op->buf = bad_vers.items;
|
||||
split_stab_op->len = bad_vers.size;
|
||||
init_op(split_stab_op);
|
||||
submit_queue.push_back(split_stab_op);
|
||||
}
|
||||
if (sync_op || split_stab_op || good_vers.items)
|
||||
{
|
||||
void *orig_buf = op->buf;
|
||||
if (good_vers.items)
|
||||
{
|
||||
op->buf = good_vers.items;
|
||||
op->len = good_vers.size;
|
||||
}
|
||||
// Make a wrapped callback
|
||||
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
|
||||
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
|
||||
auto cb = [this, op, good_items = good_vers.items,
|
||||
bad_items = bad_vers.items, split_op_counter,
|
||||
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
|
||||
{
|
||||
if (split_op->retval != 0)
|
||||
op->retval = split_op->retval;
|
||||
(*split_op_counter)--;
|
||||
assert((*split_op_counter) >= 0);
|
||||
if (op != split_op)
|
||||
delete split_op;
|
||||
if (!*split_op_counter)
|
||||
{
|
||||
free(good_items);
|
||||
free(bad_items);
|
||||
free(split_op_counter);
|
||||
op->buf = orig_buf;
|
||||
real_cb(op);
|
||||
}
|
||||
};
|
||||
if (sync_op)
|
||||
{
|
||||
sync_op->callback = cb;
|
||||
}
|
||||
if (split_stab_op)
|
||||
{
|
||||
split_stab_op->callback = cb;
|
||||
}
|
||||
op->callback = cb;
|
||||
}
|
||||
if (!todo)
|
||||
{
|
||||
// All work is postponed
|
||||
op->callback = NULL;
|
||||
return 2;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
{
|
||||
if (PRIV(op)->op_state)
|
||||
{
|
||||
return continue_stable(op);
|
||||
}
|
||||
int r = split_stab_op(op, [this](obj_ver_id ov)
|
||||
obj_ver_id* v;
|
||||
int i, todo = 0;
|
||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||
{
|
||||
auto dirty_it = dirty_db.find(ov);
|
||||
auto dirty_it = dirty_db.find(*v);
|
||||
if (dirty_it == dirty_db.end())
|
||||
{
|
||||
auto & clean_db = clean_db_shard(ov.oid);
|
||||
auto clean_it = clean_db.find(ov.oid);
|
||||
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
|
||||
auto & clean_db = clean_db_shard(v->oid);
|
||||
auto clean_it = clean_db.find(v->oid);
|
||||
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
|
||||
{
|
||||
// No such object version
|
||||
printf("Error: %lx:%lx v%lu not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
|
||||
return -ENOENT;
|
||||
op->retval = -ENOENT;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Already stable
|
||||
return STAB_SPLIT_DONE;
|
||||
}
|
||||
}
|
||||
else if (IS_IN_FLIGHT(dirty_it->second.state))
|
||||
{
|
||||
// Object write is still in progress. Wait until the write request completes
|
||||
return STAB_SPLIT_WAIT;
|
||||
return 0;
|
||||
}
|
||||
else if (!IS_SYNCED(dirty_it->second.state))
|
||||
{
|
||||
// Object not synced yet - sync it
|
||||
// In previous versions we returned EBUSY here and required
|
||||
// the caller (OSD) to issue a global sync first. But a global sync
|
||||
// waits for all writes in the queue including inflight writes. And
|
||||
// inflight writes may themselves be blocked by unstable writes being
|
||||
// still present in the journal and not flushed away from it.
|
||||
// So we must sync specific objects here.
|
||||
//
|
||||
// Even more, we have to process "stabilize" request in parts. That is,
|
||||
// we must stabilize all objects which are already synced. Otherwise
|
||||
// they may block objects which are NOT synced yet.
|
||||
return STAB_SPLIT_SYNC;
|
||||
// Object not synced yet. Caller must sync it first
|
||||
op->retval = -EBUSY;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
else if (IS_STABLE(dirty_it->second.state))
|
||||
else if (!IS_STABLE(dirty_it->second.state))
|
||||
{
|
||||
// Already stable
|
||||
return STAB_SPLIT_DONE;
|
||||
todo++;
|
||||
}
|
||||
else
|
||||
{
|
||||
return STAB_SPLIT_TODO;
|
||||
}
|
||||
});
|
||||
if (r != 1)
|
||||
}
|
||||
if (!todo)
|
||||
{
|
||||
return r;
|
||||
// Already stable
|
||||
op->retval = 0;
|
||||
FINISH_OP(op);
|
||||
return 2;
|
||||
}
|
||||
// Check journal space
|
||||
blockstore_journal_check_t space_check(this);
|
||||
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
|
||||
if (!space_check.check_available(op, todo, sizeof(journal_entry_stable), 0))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@@ -351,9 +102,9 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
|
||||
// Prepare and submit journal entries
|
||||
int s = 0;
|
||||
auto v = (obj_ver_id*)op->buf;
|
||||
for (int i = 0; i < op->len; i++, v++)
|
||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||
{
|
||||
// FIXME: Only stabilize versions that aren't stable yet
|
||||
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
|
||||
journal.sector_info[journal.cur_sector].dirty)
|
||||
{
|
||||
|
@@ -12,7 +12,7 @@
|
||||
#define SYNC_JOURNAL_SYNC_SENT 7
|
||||
#define SYNC_DONE 8
|
||||
|
||||
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync)
|
||||
{
|
||||
if (immediate_commit == IMMEDIATE_ALL)
|
||||
{
|
||||
@@ -145,7 +145,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||
PRIV(op)->op_state = SYNC_DONE;
|
||||
}
|
||||
}
|
||||
if (PRIV(op)->op_state == SYNC_DONE)
|
||||
if (PRIV(op)->op_state == SYNC_DONE && !queue_has_in_progress_sync)
|
||||
{
|
||||
ack_sync(op);
|
||||
return 2;
|
||||
|
@@ -278,7 +278,7 @@ struct rm_osd_t
|
||||
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
|
||||
{
|
||||
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
|
||||
retry_wait = parent->cli->config["mon_change_timeout"].uint64_value();
|
||||
retry_wait = parent->cli->merged_config["mon_change_timeout"].uint64_value();
|
||||
if (!retry_wait)
|
||||
retry_wait = 1000;
|
||||
retry_wait += etcd_tx_retry_ms;
|
||||
@@ -410,14 +410,17 @@ struct rm_osd_t
|
||||
parent->cli->st_cli.etcd_prefix+"/pg/history/"+
|
||||
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
|
||||
);
|
||||
auto hist = json11::Json::object {
|
||||
{ "epoch", pg_cfg.epoch },
|
||||
{ "all_peers", pg_cfg.all_peers },
|
||||
{ "osd_sets", pg_cfg.target_history },
|
||||
};
|
||||
if (pg_cfg.scrub_ts)
|
||||
hist["scrub_ts"] = pg_cfg.scrub_ts;
|
||||
history_updates.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
{ "key", history_key },
|
||||
{ "value", base64_encode(json11::Json(json11::Json::object {
|
||||
{ "epoch", pg_cfg.epoch },
|
||||
{ "all_peers", pg_cfg.all_peers },
|
||||
{ "osd_sets", pg_cfg.target_history },
|
||||
}).dump()) },
|
||||
{ "value", base64_encode(json11::Json(hist).dump()) },
|
||||
} },
|
||||
});
|
||||
history_checks.push_back(json11::Json::object {
|
||||
|
@@ -198,9 +198,9 @@ resume_2:
|
||||
}
|
||||
pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
|
||||
}
|
||||
bool readonly = json_is_true(parent->cli->config["readonly"]);
|
||||
bool no_recovery = json_is_true(parent->cli->config["no_recovery"]);
|
||||
bool no_rebalance = json_is_true(parent->cli->config["no_rebalance"]);
|
||||
bool readonly = json_is_true(parent->cli->merged_config["readonly"]);
|
||||
bool no_recovery = json_is_true(parent->cli->merged_config["no_recovery"]);
|
||||
bool no_rebalance = json_is_true(parent->cli->merged_config["no_rebalance"]);
|
||||
if (parent->json_output)
|
||||
{
|
||||
// JSON output
|
||||
|
@@ -18,12 +18,11 @@
|
||||
|
||||
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
|
||||
{
|
||||
cli_config = config.object_items();
|
||||
file_config = osd_messenger_t::read_config(config);
|
||||
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
|
||||
config = osd_messenger_t::read_config(config);
|
||||
|
||||
this->ringloop = ringloop;
|
||||
this->tfd = tfd;
|
||||
this->config = config;
|
||||
|
||||
msgr.osd_num = 0;
|
||||
msgr.tfd = tfd;
|
||||
@@ -59,7 +58,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
|
||||
msgr.stop_client(op->peer_fd);
|
||||
delete op;
|
||||
};
|
||||
msgr.parse_config(config);
|
||||
msgr.parse_config(this->config);
|
||||
|
||||
st_cli.tfd = tfd;
|
||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||
@@ -277,10 +276,13 @@ restart:
|
||||
continuing_ops = 0;
|
||||
}
|
||||
|
||||
void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_config)
|
||||
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
|
||||
{
|
||||
this->etcd_global_config = etcd_global_config;
|
||||
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
|
||||
this->merged_config = config;
|
||||
for (auto & kv: this->config.object_items())
|
||||
{
|
||||
this->merged_config[kv.first] = kv.second;
|
||||
}
|
||||
if (config.find("client_max_dirty_bytes") != config.end())
|
||||
{
|
||||
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
|
||||
@@ -290,13 +292,14 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
|
||||
// Old name
|
||||
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
|
||||
}
|
||||
else
|
||||
client_max_dirty_bytes = 0;
|
||||
if (config.find("client_max_dirty_ops") != config.end())
|
||||
{
|
||||
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
|
||||
}
|
||||
if (!client_max_dirty_bytes)
|
||||
{
|
||||
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
|
||||
}
|
||||
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
|
||||
if (!client_max_dirty_ops)
|
||||
{
|
||||
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
|
||||
@@ -311,7 +314,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
|
||||
up_wait_retry_interval = 50;
|
||||
}
|
||||
msgr.parse_config(config);
|
||||
st_cli.parse_config(config);
|
||||
msgr.parse_config(this->config);
|
||||
st_cli.load_pgs();
|
||||
}
|
||||
|
||||
@@ -1118,24 +1121,6 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
if (part->op.reply.hdr.retval != expected)
|
||||
{
|
||||
// Operation failed, retry
|
||||
part->flags |= PART_ERROR;
|
||||
if (!op->retval || op->retval == -EPIPE)
|
||||
{
|
||||
// Don't overwrite other errors with -EPIPE
|
||||
op->retval = part->op.reply.hdr.retval;
|
||||
}
|
||||
int stop_fd = -1;
|
||||
if (op->retval != -EINTR && op->retval != -EIO)
|
||||
{
|
||||
stop_fd = part->op.peer_fd;
|
||||
fprintf(
|
||||
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||
);
|
||||
}
|
||||
// All next things like timer, continue_sync/rw and stop_client may affect the operation again
|
||||
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
|
||||
// FIXME postpone such things to set_immediate here to avoid bugs
|
||||
if (part->op.reply.hdr.retval == -EPIPE)
|
||||
{
|
||||
// Mark op->up_wait = true before stopping the client
|
||||
@@ -1149,17 +1134,20 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
});
|
||||
}
|
||||
}
|
||||
if (op->inflight_count == 0)
|
||||
if (!op->retval || op->retval == -EPIPE)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
continue_sync(op);
|
||||
else
|
||||
continue_rw(op);
|
||||
// Don't overwrite other errors with -EPIPE
|
||||
op->retval = part->op.reply.hdr.retval;
|
||||
}
|
||||
if (stop_fd >= 0)
|
||||
if (op->retval != -EINTR && op->retval != -EIO)
|
||||
{
|
||||
msgr.stop_client(stop_fd);
|
||||
fprintf(
|
||||
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
|
||||
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
|
||||
);
|
||||
msgr.stop_client(part->op.peer_fd);
|
||||
}
|
||||
part->flags |= PART_ERROR;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1173,13 +1161,13 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
|
||||
copy_part_bitmap(op, part);
|
||||
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
|
||||
}
|
||||
if (op->inflight_count == 0)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
continue_sync(op);
|
||||
else
|
||||
continue_rw(op);
|
||||
}
|
||||
}
|
||||
if (op->inflight_count == 0)
|
||||
{
|
||||
if (op->opcode == OSD_OP_SYNC)
|
||||
continue_sync(op);
|
||||
else
|
||||
continue_rw(op);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -112,8 +112,8 @@ public:
|
||||
osd_messenger_t msgr;
|
||||
void init_msgr();
|
||||
|
||||
json11::Json::object cli_config, file_config, etcd_global_config;
|
||||
json11::Json::object config;
|
||||
json11::Json config;
|
||||
json11::Json::object merged_config;
|
||||
|
||||
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
|
||||
~cluster_client_t();
|
||||
|
@@ -43,7 +43,6 @@ struct inode_list_t
|
||||
inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
|
||||
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback)
|
||||
{
|
||||
init_msgr();
|
||||
int skipped_pgs = 0;
|
||||
pool_id_t pool_id = INODE_POOL(inode);
|
||||
if (!pool_id || st_cli.pool_config.find(pool_id) == st_cli.pool_config.end())
|
||||
|
@@ -281,7 +281,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
|
||||
if (je->big_write.size > sizeof(journal_entry_big_write))
|
||||
{
|
||||
printf(json ? ",\"bitmap\":\"" : " (bitmap: ");
|
||||
for (int i = sizeof(journal_entry_big_write); i < je->big_write.size; i++)
|
||||
for (int i = sizeof(journal_entry_big_write); i < je->small_write.size; i++)
|
||||
{
|
||||
printf("%02x", ((uint8_t*)je)[i]);
|
||||
}
|
||||
|
@@ -26,7 +26,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
|
||||
buf_size = dsk.meta_len;
|
||||
void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||
lseek64(dsk.meta_fd, dsk.meta_offset, 0);
|
||||
read_blocking(dsk.meta_fd, data, dsk.meta_block_size);
|
||||
read_blocking(dsk.meta_fd, data, buf_size);
|
||||
// Check superblock
|
||||
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
|
||||
if (hdr->zero == 0 &&
|
||||
@@ -41,11 +41,8 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)>
|
||||
if (buf_size % dsk.meta_block_size)
|
||||
{
|
||||
buf_size = 8*dsk.meta_block_size;
|
||||
void *new_data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||
memcpy(new_data, data, dsk.meta_block_size);
|
||||
free(data);
|
||||
data = new_data;
|
||||
hdr = (blockstore_meta_header_v1_t *)data;
|
||||
data = memalign_or_die(MEM_ALIGNMENT, buf_size);
|
||||
}
|
||||
}
|
||||
dsk.bitmap_granularity = hdr->bitmap_granularity;
|
||||
|
@@ -7,8 +7,8 @@
|
||||
#ifndef __MOCK__
|
||||
#include "addr_util.h"
|
||||
#include "http_client.h"
|
||||
#include "str_util.h"
|
||||
#endif
|
||||
#include "str_util.h"
|
||||
|
||||
etcd_state_client_t::~etcd_state_client_t()
|
||||
{
|
||||
@@ -18,8 +18,12 @@ etcd_state_client_t::~etcd_state_client_t()
|
||||
}
|
||||
watches.clear();
|
||||
etcd_watches_initialised = -1;
|
||||
if (ws_keepalive_timer >= 0)
|
||||
{
|
||||
tfd->clear_timer(ws_keepalive_timer);
|
||||
ws_keepalive_timer = -1;
|
||||
}
|
||||
#ifndef __MOCK__
|
||||
stop_ws_keepalive();
|
||||
if (etcd_watch_ws)
|
||||
{
|
||||
http_close(etcd_watch_ws);
|
||||
@@ -241,7 +245,6 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
|
||||
if (this->etcd_keepalive_timeout < 30)
|
||||
this->etcd_keepalive_timeout = 30;
|
||||
}
|
||||
auto old_etcd_ws_keepalive_interval = this->etcd_ws_keepalive_interval;
|
||||
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
|
||||
if (this->etcd_ws_keepalive_interval <= 0)
|
||||
{
|
||||
@@ -262,13 +265,6 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
|
||||
{
|
||||
this->etcd_quick_timeout = 1000;
|
||||
}
|
||||
if (this->etcd_ws_keepalive_interval != old_etcd_ws_keepalive_interval && ws_keepalive_timer >= 0)
|
||||
{
|
||||
#ifndef __MOCK__
|
||||
stop_ws_keepalive();
|
||||
start_ws_keepalive();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
void etcd_state_client_t::pick_next_etcd()
|
||||
@@ -482,20 +478,6 @@ void etcd_state_client_t::start_etcd_watcher()
|
||||
{
|
||||
on_start_watcher_hook(etcd_watch_ws);
|
||||
}
|
||||
start_ws_keepalive();
|
||||
}
|
||||
|
||||
void etcd_state_client_t::stop_ws_keepalive()
|
||||
{
|
||||
if (ws_keepalive_timer >= 0)
|
||||
{
|
||||
tfd->clear_timer(ws_keepalive_timer);
|
||||
ws_keepalive_timer = -1;
|
||||
}
|
||||
}
|
||||
|
||||
void etcd_state_client_t::start_ws_keepalive()
|
||||
{
|
||||
if (ws_keepalive_timer < 0)
|
||||
{
|
||||
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)
|
||||
@@ -777,6 +759,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
fprintf(stderr, "Pool %u has invalid bitmap_granularity (must divide block_size), skipping pool\n", pool_id);
|
||||
continue;
|
||||
}
|
||||
// Scrub Interval
|
||||
pc.scrub_interval = parse_time(pool_item.second["scrub_interval"].string_value());
|
||||
if (!pc.scrub_interval)
|
||||
pc.scrub_interval = 0;
|
||||
// Immediate Commit Mode
|
||||
pc.immediate_commit = pool_item.second["immediate_commit"].is_string()
|
||||
? (pool_item.second["immediate_commit"].string_value() == "all"
|
||||
@@ -919,6 +905,8 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
|
||||
}
|
||||
// Read epoch
|
||||
pg_cfg.epoch = value["epoch"].uint64_value();
|
||||
// Scrub timestamp
|
||||
pg_cfg.scrub_ts = parse_time(value["scrub_ts"].string_value());
|
||||
if (on_change_pg_history_hook != NULL)
|
||||
{
|
||||
on_change_pg_history_hook(pool_id, pg_num);
|
||||
|
@@ -39,6 +39,7 @@ struct pg_config_t
|
||||
osd_num_t cur_primary;
|
||||
int cur_state;
|
||||
uint64_t epoch;
|
||||
uint64_t scrub_ts;
|
||||
};
|
||||
|
||||
struct pool_config_t
|
||||
@@ -55,6 +56,7 @@ struct pool_config_t
|
||||
uint64_t max_osd_combinations;
|
||||
uint64_t pg_stripe_size;
|
||||
std::map<pg_num_t, pg_config_t> pg_config;
|
||||
uint64_t scrub_interval;
|
||||
};
|
||||
|
||||
struct inode_config_t
|
||||
@@ -132,8 +134,6 @@ public:
|
||||
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
|
||||
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
|
||||
void start_etcd_watcher();
|
||||
void stop_ws_keepalive();
|
||||
void start_ws_keepalive();
|
||||
void load_global_config();
|
||||
void load_pgs();
|
||||
void parse_state(const etcd_kv_t & kv);
|
||||
|
@@ -157,10 +157,10 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
||||
this->rdma_max_sge = 128;
|
||||
this->rdma_max_send = config["rdma_max_send"].uint64_value();
|
||||
if (!this->rdma_max_send)
|
||||
this->rdma_max_send = 8;
|
||||
this->rdma_max_send = 64;
|
||||
this->rdma_max_recv = config["rdma_max_recv"].uint64_value();
|
||||
if (!this->rdma_max_recv)
|
||||
this->rdma_max_recv = 16;
|
||||
this->rdma_max_recv = 128;
|
||||
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
|
||||
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
||||
this->rdma_max_msg = 129*1024;
|
||||
@@ -534,9 +534,8 @@ bool osd_messenger_t::is_rdma_enabled()
|
||||
}
|
||||
#endif
|
||||
|
||||
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
|
||||
json11::Json osd_messenger_t::read_config(const json11::Json & config)
|
||||
{
|
||||
json11::Json::object file_config;
|
||||
const char *config_path = config["config_path"].string_value() != ""
|
||||
? config["config_path"].string_value().c_str() : VITASTOR_CONFIG_PATH;
|
||||
int fd = open(config_path, O_RDONLY);
|
||||
@@ -544,14 +543,14 @@ json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
|
||||
{
|
||||
if (errno != ENOENT)
|
||||
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||
return file_config;
|
||||
return config;
|
||||
}
|
||||
struct stat st;
|
||||
if (fstat(fd, &st) != 0)
|
||||
{
|
||||
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||
close(fd);
|
||||
return file_config;
|
||||
return config;
|
||||
}
|
||||
std::string buf;
|
||||
buf.resize(st.st_size);
|
||||
@@ -563,125 +562,23 @@ json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
|
||||
{
|
||||
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
|
||||
close(fd);
|
||||
return file_config;
|
||||
return config;
|
||||
}
|
||||
done += r;
|
||||
}
|
||||
close(fd);
|
||||
std::string json_err;
|
||||
file_config = json11::Json::parse(buf, json_err).object_items();
|
||||
json11::Json::object file_config = json11::Json::parse(buf, json_err).object_items();
|
||||
if (json_err != "")
|
||||
{
|
||||
fprintf(stderr, "Invalid JSON in %s: %s\n", config_path, json_err.c_str());
|
||||
return config;
|
||||
}
|
||||
file_config.erase("config_path");
|
||||
file_config.erase("osd_num");
|
||||
for (auto kv: config.object_items())
|
||||
{
|
||||
file_config[kv.first] = kv.second;
|
||||
}
|
||||
return file_config;
|
||||
}
|
||||
|
||||
static const char* cli_only_params[] = {
|
||||
// The list has to be sorted
|
||||
"bitmap_granularity",
|
||||
"block_size",
|
||||
"data_device",
|
||||
"data_offset",
|
||||
"data_size",
|
||||
"disable_data_fsync",
|
||||
"disable_device_lock",
|
||||
"disable_journal_fsync",
|
||||
"disable_meta_fsync",
|
||||
"disk_alignment",
|
||||
"flush_journal",
|
||||
"immediate_commit",
|
||||
"inmemory_journal",
|
||||
"inmemory_metadata",
|
||||
"journal_block_size",
|
||||
"journal_device",
|
||||
"journal_no_same_sector_overwrites",
|
||||
"journal_offset",
|
||||
"journal_sector_buffer_count",
|
||||
"journal_size",
|
||||
"meta_block_size",
|
||||
"meta_buf_size",
|
||||
"meta_device",
|
||||
"meta_offset",
|
||||
"osd_num",
|
||||
"readonly",
|
||||
};
|
||||
|
||||
static const char **cli_only_end = cli_only_params + (sizeof(cli_only_params)/sizeof(cli_only_params[0]));
|
||||
|
||||
static const char* local_only_params[] = {
|
||||
// The list has to be sorted
|
||||
"config_path",
|
||||
"rdma_device",
|
||||
"rdma_gid_index",
|
||||
"rdma_max_msg",
|
||||
"rdma_max_recv",
|
||||
"rdma_max_send",
|
||||
"rdma_max_sge",
|
||||
"rdma_mtu",
|
||||
"rdma_port_num",
|
||||
"tcp_header_buffer_size",
|
||||
"use_rdma",
|
||||
"use_sync_send_recv",
|
||||
};
|
||||
|
||||
static const char **local_only_end = local_only_params + (sizeof(local_only_params)/sizeof(local_only_params[0]));
|
||||
|
||||
// Basically could be replaced by std::lower_bound()...
|
||||
static int find_str_array(const char **start, const char **end, const std::string & s)
|
||||
{
|
||||
int min = 0, max = end-start;
|
||||
while (max-min >= 2)
|
||||
{
|
||||
int mid = (min+max)/2;
|
||||
int r = strcmp(s.c_str(), start[mid]);
|
||||
if (r < 0)
|
||||
max = mid;
|
||||
else if (r > 0)
|
||||
min = mid;
|
||||
else
|
||||
return mid;
|
||||
}
|
||||
if (min < end-start && !strcmp(s.c_str(), start[min]))
|
||||
return min;
|
||||
return -1;
|
||||
}
|
||||
|
||||
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
|
||||
const json11::Json::object & file_config,
|
||||
const json11::Json::object & etcd_global_config,
|
||||
const json11::Json::object & etcd_osd_config)
|
||||
{
|
||||
// Priority: most important -> less important:
|
||||
// etcd_osd_config -> cli_config -> etcd_global_config -> file_config
|
||||
json11::Json::object res = file_config;
|
||||
for (auto & kv: file_config)
|
||||
{
|
||||
int cli_only = find_str_array(cli_only_params, cli_only_end, kv.first);
|
||||
if (cli_only < 0)
|
||||
{
|
||||
res[kv.first] = kv.second;
|
||||
}
|
||||
}
|
||||
for (auto & kv: etcd_global_config)
|
||||
{
|
||||
int local_only = find_str_array(local_only_params, local_only_end, kv.first);
|
||||
if (local_only < 0)
|
||||
{
|
||||
res[kv.first] = kv.second;
|
||||
}
|
||||
}
|
||||
for (auto & kv: cli_config)
|
||||
{
|
||||
res[kv.first] = kv.second;
|
||||
}
|
||||
for (auto & kv: etcd_osd_config)
|
||||
{
|
||||
int local_only = find_str_array(local_only_params, local_only_end, kv.first);
|
||||
if (local_only < 0)
|
||||
{
|
||||
res[kv.first] = kv.second;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
@@ -166,11 +166,7 @@ public:
|
||||
void accept_connections(int listen_fd);
|
||||
~osd_messenger_t();
|
||||
|
||||
static json11::Json::object read_config(const json11::Json & config);
|
||||
static json11::Json::object merge_configs(const json11::Json::object & cli_config,
|
||||
const json11::Json::object & file_config,
|
||||
const json11::Json::object & etcd_global_config,
|
||||
const json11::Json::object & etcd_osd_config);
|
||||
static json11::Json read_config(const json11::Json & config);
|
||||
|
||||
#ifdef WITH_RDMA
|
||||
bool is_rdma_enabled();
|
||||
|
@@ -43,15 +43,7 @@ void osd_messenger_t::send_replies()
|
||||
{
|
||||
}
|
||||
|
||||
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
|
||||
json11::Json osd_messenger_t::read_config(const json11::Json & config)
|
||||
{
|
||||
return json11::Json::object();
|
||||
}
|
||||
|
||||
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
|
||||
const json11::Json::object & file_config,
|
||||
const json11::Json::object & etcd_global_config,
|
||||
const json11::Json::object & etcd_osd_config)
|
||||
{
|
||||
return cli_config;
|
||||
return config;
|
||||
}
|
||||
|
@@ -313,18 +313,17 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
|
||||
stop_client(cl->peer_fd);
|
||||
return false;
|
||||
}
|
||||
if (bmp_len > 0)
|
||||
if (op->reply.hdr.retval >= 0 && bmp_len > 0)
|
||||
{
|
||||
assert(op->bitmap);
|
||||
cl->recv_list.push_back(op->bitmap, bmp_len);
|
||||
cl->read_remaining += bmp_len;
|
||||
}
|
||||
if (op->reply.hdr.retval > 0)
|
||||
{
|
||||
assert(op->iov.count > 0);
|
||||
cl->recv_list.append(op->iov);
|
||||
cl->read_remaining += op->reply.hdr.retval;
|
||||
}
|
||||
cl->read_remaining = op->reply.hdr.retval + bmp_len;
|
||||
if (cl->read_remaining == 0)
|
||||
{
|
||||
goto reuse;
|
||||
|
@@ -39,11 +39,6 @@ struct __attribute__((__packed__)) obj_ver_id
|
||||
uint64_t version;
|
||||
};
|
||||
|
||||
inline bool operator == (const obj_ver_id & a, const obj_ver_id & b)
|
||||
{
|
||||
return a.oid == b.oid && a.version == b.version;
|
||||
}
|
||||
|
||||
inline bool operator < (const obj_ver_id & a, const obj_ver_id & b)
|
||||
{
|
||||
return a.oid < b.oid || a.oid == b.oid && a.version < b.version;
|
||||
|
141
src/osd.cpp
141
src/osd.cpp
@@ -35,18 +35,18 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
||||
|
||||
this->ringloop = ringloop;
|
||||
|
||||
this->cli_config = config.object_items();
|
||||
this->file_config = msgr.read_config(this->cli_config);
|
||||
parse_config(true);
|
||||
this->config = msgr.read_config(config).object_items();
|
||||
if (this->config.find("log_level") == this->config.end())
|
||||
this->config["log_level"] = 1;
|
||||
parse_config(this->config, true);
|
||||
|
||||
epmgr = new epoll_manager_t(ringloop);
|
||||
// FIXME: Use timerfd_interval based directly on io_uring
|
||||
this->tfd = epmgr->tfd;
|
||||
|
||||
if (!json_is_true(this->config["disable_blockstore"]))
|
||||
auto bs_cfg = json_to_bs(this->config);
|
||||
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
||||
{
|
||||
auto bs_cfg = json_to_bs(this->config);
|
||||
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
|
||||
// Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
|
||||
uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
|
||||
if (autosync_writes > max_autosync)
|
||||
@@ -67,11 +67,11 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
||||
}
|
||||
}
|
||||
|
||||
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
||||
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
||||
{
|
||||
print_stats();
|
||||
});
|
||||
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
|
||||
this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
|
||||
{
|
||||
print_slow();
|
||||
});
|
||||
@@ -91,42 +91,18 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
|
||||
|
||||
osd_t::~osd_t()
|
||||
{
|
||||
if (slow_log_timer_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(slow_log_timer_id);
|
||||
slow_log_timer_id = -1;
|
||||
}
|
||||
if (print_stats_timer_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(print_stats_timer_id);
|
||||
print_stats_timer_id = -1;
|
||||
}
|
||||
if (autosync_timer_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(autosync_timer_id);
|
||||
autosync_timer_id = -1;
|
||||
}
|
||||
ringloop->unregister_consumer(&consumer);
|
||||
delete epmgr;
|
||||
if (bs)
|
||||
delete bs;
|
||||
delete bs;
|
||||
close(listen_fd);
|
||||
free(zero_buffer);
|
||||
}
|
||||
|
||||
void osd_t::parse_config(bool init)
|
||||
void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
|
||||
{
|
||||
config = msgr.merge_configs(cli_config, file_config, etcd_global_config, etcd_osd_config);
|
||||
if (config.find("log_level") == this->config.end())
|
||||
config["log_level"] = 1;
|
||||
if (bs)
|
||||
{
|
||||
auto bs_cfg = json_to_bs(config);
|
||||
bs->parse_config(bs_cfg);
|
||||
}
|
||||
st_cli.parse_config(config);
|
||||
msgr.parse_config(config);
|
||||
if (init)
|
||||
if (allow_disk_params)
|
||||
{
|
||||
// OSD number
|
||||
osd_num = config["osd_num"].uint64_value();
|
||||
@@ -148,27 +124,24 @@ void osd_t::parse_config(bool init)
|
||||
immediate_commit = IMMEDIATE_SMALL;
|
||||
else
|
||||
immediate_commit = IMMEDIATE_NONE;
|
||||
// Bind address
|
||||
bind_address = config["bind_address"].string_value();
|
||||
if (bind_address == "")
|
||||
bind_address = "0.0.0.0";
|
||||
bind_port = config["bind_port"].uint64_value();
|
||||
if (bind_port <= 0 || bind_port > 65535)
|
||||
bind_port = 0;
|
||||
// OSD configuration
|
||||
etcd_report_interval = config["etcd_report_interval"].uint64_value();
|
||||
if (etcd_report_interval <= 0)
|
||||
etcd_report_interval = 5;
|
||||
readonly = json_is_true(config["readonly"]);
|
||||
run_primary = !json_is_false(config["run_primary"]);
|
||||
allow_test_ops = json_is_true(config["allow_test_ops"]);
|
||||
}
|
||||
// Bind address
|
||||
bind_address = config["bind_address"].string_value();
|
||||
if (bind_address == "")
|
||||
bind_address = "0.0.0.0";
|
||||
bind_port = config["bind_port"].uint64_value();
|
||||
if (bind_port <= 0 || bind_port > 65535)
|
||||
bind_port = 0;
|
||||
// OSD configuration
|
||||
log_level = config["log_level"].uint64_value();
|
||||
auto old_no_rebalance = no_rebalance;
|
||||
etcd_report_interval = config["etcd_report_interval"].uint64_value();
|
||||
if (etcd_report_interval <= 0)
|
||||
etcd_report_interval = 5;
|
||||
readonly = json_is_true(config["readonly"]);
|
||||
run_primary = !json_is_false(config["run_primary"]);
|
||||
no_rebalance = json_is_true(config["no_rebalance"]);
|
||||
auto old_no_recovery = no_recovery;
|
||||
no_recovery = json_is_true(config["no_recovery"]);
|
||||
auto old_autosync_interval = autosync_interval;
|
||||
allow_test_ops = json_is_true(config["allow_test_ops"]);
|
||||
if (!config["autosync_interval"].is_null())
|
||||
{
|
||||
// Allow to set it to 0
|
||||
@@ -196,46 +169,25 @@ void osd_t::parse_config(bool init)
|
||||
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
||||
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
||||
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||
auto old_print_stats_interval = print_stats_interval;
|
||||
print_stats_interval = config["print_stats_interval"].uint64_value();
|
||||
if (!print_stats_interval)
|
||||
print_stats_interval = 3;
|
||||
auto old_slow_log_interval = slow_log_interval;
|
||||
slow_log_interval = config["slow_log_interval"].uint64_value();
|
||||
if (!slow_log_interval)
|
||||
slow_log_interval = 10;
|
||||
inode_vanish_time = config["inode_vanish_time"].uint64_value();
|
||||
if (!inode_vanish_time)
|
||||
inode_vanish_time = 60;
|
||||
if ((old_no_rebalance && !no_rebalance || old_no_recovery && !no_recovery) &&
|
||||
!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
|
||||
{
|
||||
peering_state = peering_state | OSD_RECOVERING;
|
||||
}
|
||||
if (old_autosync_interval != autosync_interval && autosync_timer_id >= 0)
|
||||
{
|
||||
this->tfd->clear_timer(autosync_timer_id);
|
||||
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
|
||||
{
|
||||
autosync();
|
||||
});
|
||||
}
|
||||
if (old_print_stats_interval != print_stats_interval && print_stats_timer_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(print_stats_timer_id);
|
||||
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
|
||||
{
|
||||
print_stats();
|
||||
});
|
||||
}
|
||||
if (old_slow_log_interval != slow_log_interval && slow_log_timer_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(slow_log_timer_id);
|
||||
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
|
||||
{
|
||||
print_slow();
|
||||
});
|
||||
}
|
||||
global_scrub_interval = config["scrub_interval"].uint64_value();
|
||||
if (!global_scrub_interval)
|
||||
global_scrub_interval = 30*86400;
|
||||
scrub_queue_depth = config["scrub_queue_depth"].uint64_value();
|
||||
if (scrub_queue_depth < 1 || scrub_queue_depth > MAX_RECOVERY_QUEUE)
|
||||
scrub_queue_depth = 1;
|
||||
scrub_sleep_ms = config["scrub_sleep"].uint64_value();
|
||||
scrub_list_limit = config["scrub_list_limit"].uint64_value();
|
||||
if (!scrub_list_limit)
|
||||
scrub_list_limit = 1000;
|
||||
}
|
||||
|
||||
void osd_t::bind_socket()
|
||||
@@ -320,7 +272,8 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||
cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
|
||||
(cur_op->req.rw.len > OSD_RW_MAX ||
|
||||
cur_op->req.rw.len % bs_bitmap_granularity ||
|
||||
cur_op->req.rw.offset % bs_bitmap_granularity)))
|
||||
cur_op->req.rw.offset % bs_bitmap_granularity)) ||
|
||||
cur_op->req.hdr.opcode == OSD_OP_SCRUB && cur_op->peer_fd != -1)
|
||||
{
|
||||
// Bad command
|
||||
finish_op(cur_op, -EINVAL);
|
||||
@@ -337,6 +290,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||
cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_READ &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SCRUB &&
|
||||
cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
|
||||
{
|
||||
// Readonly mode
|
||||
@@ -367,6 +321,10 @@ void osd_t::exec_op(osd_op_t *cur_op)
|
||||
{
|
||||
continue_primary_del(cur_op);
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
|
||||
{
|
||||
continue_primary_scrub(cur_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
exec_secondary(cur_op);
|
||||
@@ -431,6 +389,10 @@ void osd_t::print_stats()
|
||||
recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
|
||||
}
|
||||
}
|
||||
if (corrupted_objects > 0)
|
||||
{
|
||||
printf("[OSD %lu] %lu object(s) corrupted\n", osd_num, corrupted_objects);
|
||||
}
|
||||
if (incomplete_objects > 0)
|
||||
{
|
||||
printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
|
||||
@@ -460,7 +422,7 @@ void osd_t::print_slow()
|
||||
int l = sizeof(alloc), n;
|
||||
char *buf = alloc;
|
||||
#define bufprintf(s, ...) { n = snprintf(buf, l, s, __VA_ARGS__); n = n < 0 ? 0 : n; buf += n; l -= n; }
|
||||
bufprintf("[OSD %lu] Slow op %lx", osd_num, (unsigned long)op);
|
||||
bufprintf("[OSD %lu] Slow op", osd_num);
|
||||
if (kv.second->osd_num)
|
||||
{
|
||||
bufprintf(" from peer OSD %lu (client %d)", kv.second->osd_num, kv.second->peer_fd);
|
||||
@@ -498,10 +460,11 @@ void osd_t::print_slow()
|
||||
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
|
||||
{
|
||||
bufprintf(
|
||||
" inode=%lx-%lx pg=%u/%u, stripe=%lu",
|
||||
op->req.sec_list.min_inode, op->req.sec_list.max_inode,
|
||||
" oid=%lx/%lx-%lx/%lx pg=%u/%u, stripe=%lu, limit=%u",
|
||||
op->req.sec_list.min_inode, op->req.sec_list.min_stripe,
|
||||
op->req.sec_list.max_inode, op->req.sec_list.max_stripe,
|
||||
op->req.sec_list.list_pg, op->req.sec_list.pg_count,
|
||||
op->req.sec_list.pg_stripe_size
|
||||
op->req.sec_list.pg_stripe_size, op->req.sec_list.stable_limit
|
||||
);
|
||||
}
|
||||
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
@@ -533,7 +496,7 @@ void osd_t::print_slow()
|
||||
}
|
||||
}
|
||||
}
|
||||
if (has_slow && bs)
|
||||
if (has_slow)
|
||||
{
|
||||
bs->dump_diagnostics();
|
||||
}
|
||||
|
39
src/osd.h
39
src/osd.h
@@ -28,6 +28,7 @@
|
||||
#define OSD_PEERING_PGS 0x04
|
||||
#define OSD_FLUSHING_PGS 0x08
|
||||
#define OSD_RECOVERING 0x10
|
||||
#define OSD_SCRUBBING 0x20
|
||||
|
||||
#define MAX_AUTOSYNC_INTERVAL 3600
|
||||
#define DEFAULT_AUTOSYNC_INTERVAL 5
|
||||
@@ -90,7 +91,7 @@ class osd_t
|
||||
{
|
||||
// config
|
||||
|
||||
json11::Json::object cli_config, file_config, etcd_global_config, etcd_osd_config, config;
|
||||
json11::Json::object config;
|
||||
int etcd_report_interval = 5;
|
||||
|
||||
bool readonly = false;
|
||||
@@ -113,6 +114,10 @@ class osd_t
|
||||
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||
int inode_vanish_time = 60;
|
||||
int log_level = 0;
|
||||
uint64_t global_scrub_interval = 30*86400;
|
||||
uint64_t scrub_queue_depth = 1;
|
||||
uint64_t scrub_sleep_ms = 0;
|
||||
uint32_t scrub_list_limit = 1000;
|
||||
|
||||
// cluster state
|
||||
|
||||
@@ -126,7 +131,6 @@ class osd_t
|
||||
bool pg_config_applied = false;
|
||||
bool etcd_reporting_pg_state = false;
|
||||
bool etcd_reporting_stats = false;
|
||||
int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1;
|
||||
|
||||
// peers and PGs
|
||||
|
||||
@@ -135,15 +139,24 @@ class osd_t
|
||||
std::set<pool_pg_num_t> dirty_pgs;
|
||||
std::set<osd_num_t> dirty_osds;
|
||||
int copies_to_delete_after_sync_count = 0;
|
||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
|
||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0, corrupted_objects = 0;
|
||||
int peering_state = 0;
|
||||
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
||||
std::map<object_id, osd_op_t*> scrub_ops;
|
||||
bool recovery_last_degraded = true;
|
||||
pool_pg_num_t recovery_last_pg;
|
||||
object_id recovery_last_oid;
|
||||
int recovery_pg_done = 0, recovery_done = 0;
|
||||
osd_op_t *autosync_op = NULL;
|
||||
|
||||
// Scrubbing
|
||||
uint64_t scrub_nearest_ts = 0;
|
||||
int scrub_timer_id = -1;
|
||||
pool_pg_num_t scrub_last_pg;
|
||||
osd_op_t *scrub_list_op;
|
||||
pg_list_result_t scrub_cur_list = {};
|
||||
uint64_t scrub_list_pos = 0;
|
||||
|
||||
// Unstable writes
|
||||
uint64_t unstable_write_count = 0;
|
||||
std::map<osd_object_id_t, uint64_t> unstable_writes;
|
||||
@@ -153,7 +166,7 @@ class osd_t
|
||||
|
||||
bool stopping = false;
|
||||
int inflight_ops = 0;
|
||||
blockstore_t *bs = NULL;
|
||||
blockstore_t *bs;
|
||||
void *zero_buffer = NULL;
|
||||
uint64_t zero_buffer_size = 0;
|
||||
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
|
||||
@@ -174,7 +187,7 @@ class osd_t
|
||||
uint64_t recovery_stat_bytes[2][2] = {};
|
||||
|
||||
// cluster connection
|
||||
void parse_config(bool init);
|
||||
void parse_config(const json11::Json & config, bool allow_disk_params);
|
||||
void init_cluster();
|
||||
void on_change_osd_state_hook(osd_num_t peer_osd);
|
||||
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
||||
@@ -221,6 +234,13 @@ class osd_t
|
||||
bool continue_recovery();
|
||||
pg_osd_set_state_t* change_osd_set(pg_osd_set_state_t *st, pg_t *pg);
|
||||
|
||||
// scrub
|
||||
void scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid);
|
||||
bool pick_next_scrub(object_id & next_oid);
|
||||
void submit_scrub_op(object_id oid);
|
||||
bool continue_scrub();
|
||||
void schedule_scrub(pg_t & pg);
|
||||
|
||||
// op execution
|
||||
void exec_op(osd_op_t *cur_op);
|
||||
void finish_op(osd_op_t *cur_op, int retval);
|
||||
@@ -235,13 +255,15 @@ class osd_t
|
||||
void autosync();
|
||||
bool prepare_primary_rw(osd_op_t *cur_op);
|
||||
void continue_primary_read(osd_op_t *cur_op);
|
||||
void continue_primary_scrub(osd_op_t *cur_op);
|
||||
void continue_primary_write(osd_op_t *cur_op);
|
||||
void cancel_primary_write(osd_op_t *cur_op);
|
||||
void continue_primary_sync(osd_op_t *cur_op);
|
||||
void continue_primary_del(osd_op_t *cur_op);
|
||||
bool check_write_queue(osd_op_t *cur_op, pg_t & pg);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg);
|
||||
void free_object_state(pg_t & pg, pg_osd_set_state_t **object_state);
|
||||
void remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t &pg, bool report = true);
|
||||
pg_osd_set_state_t *mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref);
|
||||
void deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref);
|
||||
bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state);
|
||||
void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op);
|
||||
void handle_primary_bs_subop(osd_op_t *subop);
|
||||
@@ -256,10 +278,11 @@ class osd_t
|
||||
int submit_primary_sync_subops(osd_op_t *cur_op);
|
||||
void submit_primary_stab_subops(osd_op_t *cur_op);
|
||||
|
||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state);
|
||||
uint64_t* get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state);
|
||||
|
||||
void continue_chained_read(osd_op_t *cur_op);
|
||||
int submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op);
|
||||
void check_corrupted_chained(pg_t & pg, osd_op_t *cur_op);
|
||||
void send_chained_read_results(pg_t & pg, osd_op_t *cur_op);
|
||||
std::vector<osd_chain_read_t> collect_chained_read_requests(osd_op_t *cur_op);
|
||||
int collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitmap_request_t> & bitmap_requests);
|
||||
|
@@ -75,7 +75,7 @@ void osd_t::init_cluster()
|
||||
}
|
||||
if (run_primary && autosync_interval > 0)
|
||||
{
|
||||
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
|
||||
this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
|
||||
{
|
||||
autosync();
|
||||
});
|
||||
@@ -182,10 +182,10 @@ json11::Json osd_t::get_statistics()
|
||||
char time_str[50] = { 0 };
|
||||
sprintf(time_str, "%ld.%03ld", ts.tv_sec, ts.tv_nsec/1000000);
|
||||
st["time"] = time_str;
|
||||
st["blockstore_ready"] = bs->is_started();
|
||||
st["data_block_size"] = (uint64_t)bs->get_block_size();
|
||||
if (bs)
|
||||
{
|
||||
st["blockstore_ready"] = bs->is_started();
|
||||
st["data_block_size"] = (uint64_t)bs->get_block_size();
|
||||
st["size"] = bs->get_block_count() * bs->get_block_size();
|
||||
st["free"] = bs->get_free_block_count() * bs->get_block_size();
|
||||
}
|
||||
@@ -233,8 +233,7 @@ void osd_t::report_statistics()
|
||||
json11::Json::object inode_space;
|
||||
json11::Json::object last_stat;
|
||||
pool_id_t last_pool = 0;
|
||||
std::map<uint64_t, uint64_t> bs_empty_space;
|
||||
auto & bs_inode_space = bs ? bs->get_inode_space_stats() : bs_empty_space;
|
||||
auto & bs_inode_space = bs->get_inode_space_stats();
|
||||
for (auto kv: bs_inode_space)
|
||||
{
|
||||
pool_id_t pool_id = INODE_POOL(kv.first);
|
||||
@@ -337,6 +336,8 @@ void osd_t::report_statistics()
|
||||
pg_stats["misplaced_count"] = pg.misplaced_objects.size();
|
||||
pg_stats["degraded_count"] = pg.degraded_objects.size();
|
||||
pg_stats["incomplete_count"] = pg.incomplete_objects.size();
|
||||
if (pg.corrupted_count)
|
||||
pg_stats["corrupted_count"] = pg.corrupted_count;
|
||||
pg_stats["write_osd_set"] = pg.cur_set;
|
||||
txn.push_back(json11::Json::object {
|
||||
{ "request_put", json11::Json::object {
|
||||
@@ -375,11 +376,7 @@ void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
|
||||
|
||||
void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes)
|
||||
{
|
||||
if (changes.find(st_cli.etcd_prefix+"/config/global") != changes.end())
|
||||
{
|
||||
etcd_global_config = changes[st_cli.etcd_prefix+"/config/global"].value.object_items();
|
||||
parse_config(false);
|
||||
}
|
||||
// FIXME apply config changes in runtime (maybe, some)
|
||||
if (run_primary)
|
||||
{
|
||||
apply_pg_count();
|
||||
@@ -389,8 +386,11 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
|
||||
|
||||
void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
||||
{
|
||||
etcd_global_config = global_config;
|
||||
parse_config(true);
|
||||
json11::Json::object osd_config = this->config;
|
||||
for (auto & kv: global_config)
|
||||
if (osd_config.find(kv.first) == osd_config.end())
|
||||
osd_config[kv.first] = kv.second;
|
||||
parse_config(osd_config, false);
|
||||
bind_socket();
|
||||
acquire_lease();
|
||||
}
|
||||
@@ -692,6 +692,12 @@ void osd_t::apply_pg_config()
|
||||
pg_it->second.all_peers == vec_all_peers)
|
||||
{
|
||||
// No change in osd_set and history
|
||||
if (pg_it->second.scrub_ts != pg_cfg.scrub_ts)
|
||||
{
|
||||
pg_it->second.scrub_ts = pg_cfg.scrub_ts;
|
||||
peering_state = peering_state | OSD_SCRUBBING;
|
||||
ringloop->wakeup();
|
||||
}
|
||||
continue;
|
||||
}
|
||||
else
|
||||
@@ -736,13 +742,14 @@ void osd_t::apply_pg_config()
|
||||
.pg_cursize = 0,
|
||||
.pg_size = pool_item.second.pg_size,
|
||||
.pg_minsize = pool_item.second.pg_minsize,
|
||||
.pg_data_size = pool_item.second.scheme == POOL_SCHEME_REPLICATED
|
||||
.pg_data_size = pg.scheme == POOL_SCHEME_REPLICATED
|
||||
? 1 : pool_item.second.pg_size - pool_item.second.parity_chunks,
|
||||
.pool_id = pool_id,
|
||||
.pg_num = pg_num,
|
||||
.reported_epoch = pg_cfg.epoch,
|
||||
.target_history = pg_cfg.target_history,
|
||||
.all_peers = vec_all_peers,
|
||||
.scrub_ts = pg_cfg.scrub_ts,
|
||||
.target_set = pg_cfg.target_set,
|
||||
};
|
||||
if (pg.scheme == POOL_SCHEME_EC)
|
||||
@@ -873,6 +880,8 @@ void osd_t::report_pg_states()
|
||||
{ "all_peers", pg.all_peers },
|
||||
{ "osd_sets", pg.target_history },
|
||||
};
|
||||
if (pg.scrub_ts)
|
||||
history_value["scrub_ts"] = pg.scrub_ts;
|
||||
checks.push_back(json11::Json::object {
|
||||
{ "target", "MOD" },
|
||||
{ "key", history_key },
|
||||
|
@@ -64,11 +64,6 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
|
||||
|
||||
void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
|
||||
{
|
||||
if (log_level > 2)
|
||||
{
|
||||
printf("[PG %u/%u] flush batch %lx completed on OSD %lu with result %d\n",
|
||||
pool_id, pg_num, (uint64_t)fb, peer_osd, retval);
|
||||
}
|
||||
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num };
|
||||
if (pgs.find(pg_id) == pgs.end() || pgs[pg_id].flush_batch != fb)
|
||||
{
|
||||
@@ -104,9 +99,10 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
||||
std::vector<osd_op_t*> continue_ops;
|
||||
auto & pg = pgs.at(pg_id);
|
||||
auto it = pg.flush_actions.begin(), prev_it = it;
|
||||
auto erase_start = it;
|
||||
while (1)
|
||||
{
|
||||
if (it == pg.flush_actions.end() || !it->second.submitted ||
|
||||
if (it == pg.flush_actions.end() ||
|
||||
it->first.oid.inode != prev_it->first.oid.inode ||
|
||||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
|
||||
{
|
||||
@@ -120,23 +116,29 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
|
||||
});
|
||||
if (wr_it != pg.write_queue.end())
|
||||
{
|
||||
if (log_level > 2)
|
||||
{
|
||||
printf("[PG %u/%u] continuing write %lx to object %lx:%lx after flush\n",
|
||||
pool_id, pg_num, (uint64_t)wr_it->second, wr_it->first.inode, wr_it->first.stripe);
|
||||
}
|
||||
continue_ops.push_back(wr_it->second);
|
||||
pg.write_queue.erase(wr_it);
|
||||
}
|
||||
}
|
||||
if (it == pg.flush_actions.end() || !it->second.submitted)
|
||||
if ((it == pg.flush_actions.end() || !it->second.submitted) &&
|
||||
erase_start != it)
|
||||
{
|
||||
pg.flush_actions.erase(erase_start, it);
|
||||
}
|
||||
if (it == pg.flush_actions.end())
|
||||
{
|
||||
if (it != pg.flush_actions.begin())
|
||||
{
|
||||
pg.flush_actions.erase(pg.flush_actions.begin(), it);
|
||||
}
|
||||
break;
|
||||
}
|
||||
prev_it = it++;
|
||||
prev_it = it;
|
||||
if (!it->second.submitted)
|
||||
{
|
||||
it++;
|
||||
erase_start = it;
|
||||
}
|
||||
else
|
||||
{
|
||||
it++;
|
||||
}
|
||||
}
|
||||
delete fb;
|
||||
pg.flush_batch = NULL;
|
||||
@@ -166,18 +168,6 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
// Copy buffer so it gets freed along with the operation
|
||||
op->buf = malloc_or_die(sizeof(obj_ver_id) * count);
|
||||
memcpy(op->buf, data, sizeof(obj_ver_id) * count);
|
||||
if (log_level > 2)
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] flush batch %lx on OSD %lu: %s objects: ",
|
||||
pool_id, pg_num, (uint64_t)fb, peer_osd, rollback ? "rollback" : "stabilize"
|
||||
);
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
printf(i > 0 ? ", %lx:%lx v%lu" : "%lx:%lx v%lu", data[i].oid.inode, data[i].oid.stripe, data[i].version);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
if (peer_osd == this->osd_num)
|
||||
{
|
||||
// local
|
||||
@@ -192,7 +182,9 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||
op->bs_op = NULL;
|
||||
delete op;
|
||||
},
|
||||
.len = (uint32_t)count,
|
||||
{
|
||||
.len = (uint32_t)count,
|
||||
},
|
||||
.buf = op->buf,
|
||||
});
|
||||
bs->enqueue_op(op->bs_op);
|
||||
@@ -310,20 +302,17 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
||||
if (osd_op->reply.hdr.retval < 0)
|
||||
{
|
||||
// Error recovering object
|
||||
if (osd_op->reply.hdr.retval == -EPIPE)
|
||||
{
|
||||
// PG is stopped or one of the OSDs is gone, error is harmless
|
||||
printf(
|
||||
"[PG %u/%u] Recovery operation failed with object %lx:%lx\n",
|
||||
INODE_POOL(op->oid.inode),
|
||||
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
|
||||
op->oid.inode, op->oid.stripe
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Failed to recover an object");
|
||||
}
|
||||
// EPIPE is totally harmless (peer is gone), others like EIO/EDOM may be not
|
||||
printf(
|
||||
"Recovery operation failed with object %lx:%lx (PG %u/%u): error %ld\n",
|
||||
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
|
||||
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
|
||||
osd_op->reply.hdr.retval
|
||||
);
|
||||
}
|
||||
else if (log_level > 2)
|
||||
{
|
||||
printf("Recovery operation done for %lx:%lx\n", op->oid.inode, op->oid.stripe);
|
||||
}
|
||||
// CAREFUL! op = &recovery_ops[op->oid]. Don't access op->* after recovery_ops.erase()
|
||||
op->osd_op = NULL;
|
||||
|
@@ -29,7 +29,8 @@
|
||||
#define OSD_OP_DELETE 14
|
||||
#define OSD_OP_PING 15
|
||||
#define OSD_OP_SEC_READ_BMP 16
|
||||
#define OSD_OP_MAX 16
|
||||
#define OSD_OP_SCRUB 17
|
||||
#define OSD_OP_MAX 17
|
||||
#define OSD_RW_MAX 64*1024*1024
|
||||
#define OSD_PROTOCOL_VERSION 1
|
||||
|
||||
@@ -173,6 +174,11 @@ struct __attribute__((__packed__)) osd_op_sec_list_t
|
||||
uint64_t pg_stripe_size;
|
||||
// inode range (used to select pools)
|
||||
uint64_t min_inode, max_inode;
|
||||
// min/max oid stripe, added after inodes for backwards compatibility
|
||||
// also for backwards compatibility, max_stripe=UINT64_MAX means 0 and 0 means UINT64_MAX O_o
|
||||
uint64_t min_stripe, max_stripe;
|
||||
// max stable object count
|
||||
uint32_t stable_limit;
|
||||
};
|
||||
|
||||
struct __attribute__((__packed__)) osd_reply_sec_list_t
|
||||
|
@@ -24,6 +24,7 @@ void osd_t::handle_peers()
|
||||
if (!p.second.peering_state->list_ops.size())
|
||||
{
|
||||
p.second.calc_object_states(log_level);
|
||||
schedule_scrub(p.second);
|
||||
report_pg_state(p.second);
|
||||
incomplete_objects += p.second.incomplete_objects.size();
|
||||
misplaced_objects += p.second.misplaced_objects.size();
|
||||
@@ -76,13 +77,20 @@ void osd_t::handle_peers()
|
||||
peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
|
||||
}
|
||||
}
|
||||
if (!(peering_state & OSD_FLUSHING_PGS) && (peering_state & OSD_RECOVERING) && !readonly)
|
||||
if ((peering_state & OSD_RECOVERING) && !readonly)
|
||||
{
|
||||
if (!continue_recovery())
|
||||
{
|
||||
peering_state = peering_state & ~OSD_RECOVERING;
|
||||
}
|
||||
}
|
||||
if (peering_state & OSD_SCRUBBING)
|
||||
{
|
||||
if (!continue_scrub())
|
||||
{
|
||||
peering_state = peering_state & ~OSD_SCRUBBING;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::repeer_pgs(osd_num_t peer_osd)
|
||||
@@ -128,9 +136,11 @@ void osd_t::reset_pg(pg_t & pg)
|
||||
pg.state_dict.clear();
|
||||
copies_to_delete_after_sync_count -= pg.copies_to_delete_after_sync.size();
|
||||
pg.copies_to_delete_after_sync.clear();
|
||||
corrupted_objects -= pg.corrupted_count;
|
||||
incomplete_objects -= pg.incomplete_objects.size();
|
||||
misplaced_objects -= pg.misplaced_objects.size();
|
||||
degraded_objects -= pg.degraded_objects.size();
|
||||
pg.corrupted_count = 0;
|
||||
pg.incomplete_objects.clear();
|
||||
pg.misplaced_objects.clear();
|
||||
pg.degraded_objects.clear();
|
||||
@@ -206,7 +216,7 @@ void osd_t::start_pg_peering(pg_t & pg)
|
||||
pg.cur_loc_set.push_back({
|
||||
.role = (uint64_t)role,
|
||||
.osd_num = pg.cur_set[role],
|
||||
.outdated = false,
|
||||
.loc_bad = 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -319,11 +329,12 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
|
||||
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
|
||||
op->bs_op = new blockstore_op_t();
|
||||
op->bs_op->opcode = BS_OP_LIST;
|
||||
op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size;
|
||||
op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
|
||||
op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
|
||||
op->bs_op->len = pg_counts[ps->pool_id];
|
||||
op->bs_op->offset = ps->pg_num-1;
|
||||
op->bs_op->pg_alignment = st_cli.pool_config[ps->pool_id].pg_stripe_size;
|
||||
op->bs_op->min_oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
|
||||
op->bs_op->max_oid.inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
|
||||
op->bs_op->max_oid.stripe = UINT64_MAX;
|
||||
op->bs_op->pg_count = pg_counts[ps->pool_id];
|
||||
op->bs_op->pg_number = ps->pg_num-1;
|
||||
op->bs_op->callback = [this, ps, op, role_osd](blockstore_op_t *bs_op)
|
||||
{
|
||||
if (op->bs_op->retval < 0)
|
||||
@@ -483,10 +494,6 @@ void osd_t::report_pg_state(pg_t & pg)
|
||||
pg.all_peers = pg.target_set;
|
||||
std::sort(pg.all_peers.begin(), pg.all_peers.end());
|
||||
pg.cur_peers = pg.target_set;
|
||||
// Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata
|
||||
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
|
||||
pg_cfg.target_history = pg.target_history;
|
||||
pg_cfg.all_peers = pg.all_peers;
|
||||
}
|
||||
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
|
||||
{
|
||||
@@ -526,9 +533,6 @@ void osd_t::report_pg_state(pg_t & pg)
|
||||
pg.cur_peers.push_back(pg_osd);
|
||||
}
|
||||
}
|
||||
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
|
||||
pg_cfg.target_history = pg.target_history;
|
||||
pg_cfg.all_peers = pg.all_peers;
|
||||
}
|
||||
if (pg.state == PG_OFFLINE && !this->pg_config_applied)
|
||||
{
|
||||
|
@@ -91,7 +91,7 @@ void pg_obj_state_check_t::walk()
|
||||
pg->state |= PG_DEGRADED;
|
||||
}
|
||||
pg->state |= PG_ACTIVE;
|
||||
if (pg->cur_peers.size() < pg->all_peers.size())
|
||||
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
|
||||
{
|
||||
pg->state |= PG_LEFT_ON_DEAD;
|
||||
}
|
||||
@@ -280,7 +280,7 @@ void pg_obj_state_check_t::finish_object()
|
||||
osd_set.push_back((pg_obj_loc_t){
|
||||
.role = (list[i].oid.stripe & STRIPE_MASK),
|
||||
.osd_num = list[i].osd_num,
|
||||
.outdated = false,
|
||||
.loc_bad = 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -302,7 +302,7 @@ void pg_obj_state_check_t::finish_object()
|
||||
osd_set.push_back((pg_obj_loc_t){
|
||||
.role = (list[i].oid.stripe & STRIPE_MASK),
|
||||
.osd_num = list[i].osd_num,
|
||||
.outdated = true,
|
||||
.loc_bad = LOC_OUTDATED,
|
||||
});
|
||||
if (!(state & (OBJ_INCOMPLETE | OBJ_DEGRADED)))
|
||||
{
|
||||
@@ -322,67 +322,73 @@ void pg_obj_state_check_t::finish_object()
|
||||
}
|
||||
else
|
||||
{
|
||||
auto it = pg->state_dict.find(osd_set);
|
||||
if (it == pg->state_dict.end())
|
||||
{
|
||||
std::vector<uint64_t> read_target;
|
||||
if (replicated)
|
||||
{
|
||||
for (auto & o: osd_set)
|
||||
{
|
||||
if (!o.outdated)
|
||||
{
|
||||
read_target.push_back(o.osd_num);
|
||||
}
|
||||
}
|
||||
while (read_target.size() < pg->pg_size)
|
||||
{
|
||||
// FIXME: This is because we then use .data() and assume it's at least <pg_size> long
|
||||
read_target.push_back(0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
read_target.resize(pg->pg_size);
|
||||
for (int i = 0; i < pg->pg_size; i++)
|
||||
{
|
||||
read_target[i] = 0;
|
||||
}
|
||||
for (auto & o: osd_set)
|
||||
{
|
||||
if (!o.outdated)
|
||||
{
|
||||
read_target[o.role] = o.osd_num;
|
||||
}
|
||||
}
|
||||
}
|
||||
pg->state_dict[osd_set] = {
|
||||
.read_target = read_target,
|
||||
.osd_set = osd_set,
|
||||
.state = state,
|
||||
.object_count = 1,
|
||||
};
|
||||
it = pg->state_dict.find(osd_set);
|
||||
}
|
||||
else
|
||||
{
|
||||
it->second.object_count++;
|
||||
}
|
||||
if (state & OBJ_INCOMPLETE)
|
||||
{
|
||||
pg->incomplete_objects[oid] = &it->second;
|
||||
}
|
||||
else if (state & OBJ_DEGRADED)
|
||||
{
|
||||
pg->degraded_objects[oid] = &it->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
pg->misplaced_objects[oid] = &it->second;
|
||||
}
|
||||
pg->add_object_to_state(oid, state, osd_set);
|
||||
}
|
||||
}
|
||||
|
||||
pg_osd_set_state_t* pg_t::add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set)
|
||||
{
|
||||
auto it = state_dict.find(osd_set);
|
||||
if (it == state_dict.end())
|
||||
{
|
||||
std::vector<osd_num_t> read_target;
|
||||
if (scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
for (auto & o: osd_set)
|
||||
{
|
||||
if (!o.loc_bad)
|
||||
{
|
||||
read_target.push_back(o.osd_num);
|
||||
}
|
||||
}
|
||||
while (read_target.size() < pg_size)
|
||||
{
|
||||
// FIXME: This is because we then use .data() and assume it's at least <pg_size> long
|
||||
read_target.push_back(0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
read_target.resize(pg_size);
|
||||
for (int i = 0; i < pg_size; i++)
|
||||
{
|
||||
read_target[i] = 0;
|
||||
}
|
||||
for (auto & o: osd_set)
|
||||
{
|
||||
if (!o.loc_bad)
|
||||
{
|
||||
read_target[o.role] = o.osd_num;
|
||||
}
|
||||
}
|
||||
}
|
||||
state_dict[osd_set] = {
|
||||
.read_target = read_target,
|
||||
.osd_set = osd_set,
|
||||
.state = state,
|
||||
.object_count = 1,
|
||||
};
|
||||
it = state_dict.find(osd_set);
|
||||
}
|
||||
else
|
||||
{
|
||||
it->second.object_count++;
|
||||
}
|
||||
if (state & OBJ_INCOMPLETE)
|
||||
{
|
||||
incomplete_objects[oid] = &it->second;
|
||||
}
|
||||
else if (state & OBJ_DEGRADED)
|
||||
{
|
||||
degraded_objects[oid] = &it->second;
|
||||
}
|
||||
else
|
||||
{
|
||||
misplaced_objects[oid] = &it->second;
|
||||
}
|
||||
return &it->second;
|
||||
}
|
||||
|
||||
// FIXME: Write at least some tests for this function
|
||||
void pg_t::calc_object_states(int log_level)
|
||||
{
|
||||
@@ -446,7 +452,8 @@ void pg_t::calc_object_states(int log_level)
|
||||
osd_set_desc += (osd_set_desc == "" ? "" : ", ")+
|
||||
std::to_string(loc.osd_num)+
|
||||
(st.replicated ? "" : "("+std::to_string(loc.role)+")")+
|
||||
(loc.outdated ? "(old)" : "");
|
||||
(loc.loc_bad & LOC_OUTDATED ? "(old)" : "")+
|
||||
(loc.loc_bad & LOC_CORRUPTED ? "(corrupted)" : "");
|
||||
}
|
||||
printf("[PG %u/%u] %lu objects on OSD set %s\n", pool_id, pg_num, stp.second.object_count, osd_set_desc.c_str());
|
||||
}
|
||||
@@ -456,7 +463,7 @@ void pg_t::calc_object_states(int log_level)
|
||||
void pg_t::print_state()
|
||||
{
|
||||
printf(
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
|
||||
(state & PG_STARTING) ? "starting" : "",
|
||||
(state & PG_OFFLINE) ? "offline" : "",
|
||||
(state & PG_PEERING) ? "peering" : "",
|
||||
@@ -465,12 +472,14 @@ void pg_t::print_state()
|
||||
(state & PG_REPEERING) ? "repeering" : "",
|
||||
(state & PG_STOPPING) ? "stopping" : "",
|
||||
(state & PG_DEGRADED) ? " + degraded" : "",
|
||||
(state & PG_HAS_CORRUPTED) ? " + has_corrupted" : "",
|
||||
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",
|
||||
(state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
|
||||
(state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
|
||||
(state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
|
||||
(state & PG_HAS_INVALID) ? " + has_invalid" : "",
|
||||
(state & PG_LEFT_ON_DEAD) ? " + left_on_dead" : "",
|
||||
(state & PG_SCRUBBING) ? " + scrubbing" : "",
|
||||
total_count
|
||||
);
|
||||
}
|
||||
|
@@ -13,11 +13,14 @@
|
||||
|
||||
#define PG_EPOCH_BITS 48
|
||||
|
||||
#define LOC_OUTDATED 1
|
||||
#define LOC_CORRUPTED 2
|
||||
|
||||
struct pg_obj_loc_t
|
||||
{
|
||||
uint64_t role;
|
||||
osd_num_t osd_num;
|
||||
bool outdated;
|
||||
uint32_t loc_bad; // LOC_OUTDATED / LOC_CORRUPTED
|
||||
};
|
||||
|
||||
typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
|
||||
@@ -30,6 +33,7 @@ struct pg_osd_set_state_t
|
||||
pg_osd_set_t osd_set;
|
||||
uint64_t state = 0;
|
||||
uint64_t object_count = 0;
|
||||
uint64_t ref_count = 0;
|
||||
};
|
||||
|
||||
struct pg_list_result_t
|
||||
@@ -91,6 +95,8 @@ struct pg_t
|
||||
// target history and all potential peers
|
||||
std::vector<std::vector<osd_num_t>> target_history;
|
||||
std::vector<osd_num_t> all_peers;
|
||||
// last scrub time
|
||||
uint64_t scrub_ts = 0;
|
||||
bool history_changed = false;
|
||||
// peer list from the last peering event
|
||||
std::vector<osd_num_t> cur_peers;
|
||||
@@ -106,6 +112,7 @@ struct pg_t
|
||||
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
|
||||
// which is up to ~192 MB per 1 TB in the worst case scenario
|
||||
std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
|
||||
uint64_t corrupted_count;
|
||||
btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
|
||||
std::map<obj_piece_id_t, flush_action_t> flush_actions;
|
||||
std::vector<obj_ver_osd_t> copies_to_delete_after_sync;
|
||||
@@ -116,15 +123,16 @@ struct pg_t
|
||||
int inflight = 0; // including write_queue
|
||||
std::multimap<object_id, osd_op_t*> write_queue;
|
||||
|
||||
pg_osd_set_state_t* add_object_to_state(const object_id oid, const uint64_t state, const pg_osd_set_t & osd_set);
|
||||
void calc_object_states(int log_level);
|
||||
void print_state();
|
||||
};
|
||||
|
||||
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
|
||||
{
|
||||
return a.outdated < b.outdated ||
|
||||
a.outdated == b.outdated && a.role < b.role ||
|
||||
a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
|
||||
return a.loc_bad < b.loc_bad ||
|
||||
a.loc_bad == b.loc_bad && a.role < b.role ||
|
||||
a.loc_bad == b.loc_bad && a.role == b.role && a.osd_num < b.osd_num;
|
||||
}
|
||||
|
||||
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
|
||||
|
@@ -52,7 +52,9 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
finish_op(cur_op, -EINVAL);
|
||||
return false;
|
||||
}
|
||||
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
|
||||
// Scrub is similar to r/w, so it's also handled here
|
||||
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED
|
||||
&& cur_op->req.hdr.opcode != OSD_OP_SCRUB ? 1 : pg_it->second.pg_size);
|
||||
int chain_size = 0;
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_READ && cur_op->req.rw.meta_revision > 0)
|
||||
{
|
||||
@@ -90,6 +92,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
chain_size * (
|
||||
// - copy of the chain
|
||||
sizeof(inode_t) +
|
||||
// - object states for every chain item
|
||||
sizeof(void*) +
|
||||
// - bitmap buffers for chained read
|
||||
stripe_count * clean_entry_bitmap_size +
|
||||
// - 'missing' flags for chained reads
|
||||
@@ -117,6 +121,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
{
|
||||
op_data->read_chain = (inode_t*)data_buf;
|
||||
data_buf = (uint8_t*)data_buf + sizeof(inode_t) * chain_size;
|
||||
op_data->chain_states = (pg_osd_set_state_t**)data_buf;
|
||||
data_buf = (uint8_t*)data_buf + sizeof(pg_osd_set_state_t*) * chain_size;
|
||||
op_data->snapshot_bitmaps = data_buf;
|
||||
data_buf = (uint8_t*)data_buf + chain_size * stripe_count * clean_entry_bitmap_size;
|
||||
op_data->missing_flags = (uint8_t*)data_buf;
|
||||
@@ -131,6 +137,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
inode_it->second.parent_id != cur_op->req.rw.inode)
|
||||
{
|
||||
op_data->read_chain[chain_num++] = inode_it->second.parent_id;
|
||||
op_data->chain_states[chain_num++] = NULL;
|
||||
inode_it = st_cli.inode_config.find(inode_it->second.parent_id);
|
||||
}
|
||||
}
|
||||
@@ -138,12 +145,12 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
|
||||
return true;
|
||||
}
|
||||
|
||||
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_osd_set_state_t **object_state)
|
||||
uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, pg_osd_set_state_t **object_state)
|
||||
{
|
||||
if (!(pg.state & (PG_HAS_INCOMPLETE | PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
|
||||
{
|
||||
*object_state = NULL;
|
||||
return def;
|
||||
return pg.cur_set.data();
|
||||
}
|
||||
auto st_it = pg.incomplete_objects.find(oid);
|
||||
if (st_it != pg.incomplete_objects.end())
|
||||
@@ -164,7 +171,7 @@ uint64_t* osd_t::get_object_osd_set(pg_t &pg, object_id &oid, uint64_t *def, pg_
|
||||
return st_it->second->read_target.data();
|
||||
}
|
||||
*object_state = NULL;
|
||||
return def;
|
||||
return pg.cur_set.data();
|
||||
}
|
||||
|
||||
void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||
@@ -183,6 +190,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||
goto resume_1;
|
||||
else if (op_data->st == 2)
|
||||
goto resume_2;
|
||||
resume_0:
|
||||
cur_op->reply.rw.bitmap_len = 0;
|
||||
{
|
||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||
@@ -194,15 +202,17 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
|
||||
// Determine version
|
||||
auto vo_it = pg.ver_override.find(op_data->oid);
|
||||
op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||
op_data->prev_set = pg.cur_set.data();
|
||||
if (pg.state != PG_ACTIVE)
|
||||
{
|
||||
// PG may be degraded or have misplaced objects
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||
}
|
||||
// PG may have degraded or misplaced objects
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
||||
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Fast happy-path
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED &&
|
||||
op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
|
||||
{
|
||||
finish_op(cur_op, -EIO);
|
||||
return;
|
||||
}
|
||||
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0);
|
||||
submit_primary_subops(SUBMIT_RMW_READ, op_data->target_ver, op_data->prev_set, cur_op);
|
||||
op_data->st = 1;
|
||||
@@ -228,6 +238,14 @@ resume_1:
|
||||
resume_2:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||
{
|
||||
// I/O or checksum error
|
||||
auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
|
||||
// FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
|
||||
goto resume_0;
|
||||
}
|
||||
finish_op(cur_op, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
@@ -266,10 +284,144 @@ resume_2:
|
||||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
}
|
||||
|
||||
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
|
||||
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t & pg)
|
||||
pg_osd_set_state_t *osd_t::mark_object_corrupted(pg_t & pg, object_id oid, pg_osd_set_state_t *prev_object_state, osd_rmw_stripe_t *stripes, bool ref)
|
||||
{
|
||||
if (object_state->state & OBJ_INCOMPLETE)
|
||||
pg_osd_set_state_t *object_state = NULL;
|
||||
get_object_osd_set(pg, oid, &object_state);
|
||||
if (prev_object_state != object_state)
|
||||
{
|
||||
// Object state changed in between by a parallel I/O operation, skip marking as failed
|
||||
if (ref)
|
||||
{
|
||||
deref_object_state(pg, &prev_object_state, ref);
|
||||
if (object_state)
|
||||
object_state->ref_count++;
|
||||
}
|
||||
return object_state;
|
||||
}
|
||||
pg_osd_set_t corrupted_set;
|
||||
if (object_state)
|
||||
{
|
||||
corrupted_set = object_state->osd_set;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < pg.cur_set.size(); i++)
|
||||
{
|
||||
corrupted_set.push_back((pg_obj_loc_t){
|
||||
.role = (pg.scheme == POOL_SCHEME_REPLICATED ? 0 : (uint64_t)i),
|
||||
.osd_num = pg.cur_set[i],
|
||||
});
|
||||
}
|
||||
}
|
||||
// Mark object chunk(s) as corrupted
|
||||
uint64_t has_roles = 0, n_roles = 0, n_copies = 0, n_corrupted = 0;
|
||||
for (auto & chunk: corrupted_set)
|
||||
{
|
||||
bool corrupted = stripes[chunk.role].osd_num == chunk.osd_num && stripes[chunk.role].read_error;
|
||||
if (corrupted && !(chunk.loc_bad & LOC_CORRUPTED))
|
||||
n_corrupted++;
|
||||
chunk.loc_bad = chunk.loc_bad | (corrupted ? LOC_CORRUPTED : 0);
|
||||
if (!chunk.loc_bad)
|
||||
{
|
||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||
n_roles = 1;
|
||||
else if (!(has_roles & (1 << chunk.role)))
|
||||
{
|
||||
n_roles++;
|
||||
has_roles |= (1 << chunk.role);
|
||||
}
|
||||
n_copies++;
|
||||
}
|
||||
}
|
||||
if (!n_corrupted)
|
||||
{
|
||||
// No chunks newly marked as corrupted - object is already marked or moved
|
||||
return object_state;
|
||||
}
|
||||
int old_pg_state = pg.state;
|
||||
if (object_state)
|
||||
{
|
||||
remove_object_from_state(oid, &object_state, pg, false);
|
||||
deref_object_state(pg, &object_state, ref);
|
||||
}
|
||||
// Calculate object state
|
||||
uint64_t obj_state = OBJ_CORRUPTED;
|
||||
int pg_state_bits = PG_HAS_CORRUPTED;
|
||||
this->corrupted_objects++;
|
||||
pg.corrupted_count++;
|
||||
if (log_level > 1)
|
||||
{
|
||||
printf("Marking object %lx:%lx corrupted: %lu chunks / %lu copies available, %lu corrupted\n",
|
||||
oid.inode, oid.stripe, n_roles, n_copies, n_corrupted);
|
||||
}
|
||||
if (n_roles < pg.pg_data_size)
|
||||
{
|
||||
this->incomplete_objects++;
|
||||
obj_state |= OBJ_INCOMPLETE;
|
||||
pg_state_bits = PG_HAS_INCOMPLETE;
|
||||
}
|
||||
else if (n_roles < pg.pg_cursize)
|
||||
{
|
||||
this->degraded_objects++;
|
||||
obj_state |= OBJ_DEGRADED;
|
||||
pg_state_bits = PG_HAS_DEGRADED;
|
||||
}
|
||||
else
|
||||
{
|
||||
this->misplaced_objects++;
|
||||
obj_state |= OBJ_MISPLACED;
|
||||
pg_state_bits = PG_HAS_MISPLACED;
|
||||
}
|
||||
pg.state |= pg_state_bits;
|
||||
if (pg.state != old_pg_state)
|
||||
{
|
||||
report_pg_state(pg);
|
||||
if ((pg.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) !=
|
||||
(old_pg_state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)))
|
||||
{
|
||||
peering_state = peering_state | OSD_RECOVERING;
|
||||
if ((pg.state & PG_HAS_DEGRADED) != (old_pg_state & PG_HAS_DEGRADED))
|
||||
{
|
||||
// Restart recovery from degraded objects
|
||||
recovery_last_degraded = true;
|
||||
recovery_last_pg = {};
|
||||
recovery_last_oid = {};
|
||||
}
|
||||
ringloop->wakeup();
|
||||
}
|
||||
}
|
||||
// Insert object into the new state and retry
|
||||
object_state = pg.add_object_to_state(oid, obj_state, corrupted_set);
|
||||
if (ref)
|
||||
object_state->ref_count++;
|
||||
return object_state;
|
||||
}
|
||||
|
||||
// Decrement pg_osd_set_state_t's object_count and change PG state accordingly
|
||||
void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **object_state, pg_t & pg, bool report)
|
||||
{
|
||||
if (!*object_state)
|
||||
{
|
||||
return;
|
||||
}
|
||||
pg_osd_set_state_t *recheck_state = NULL;
|
||||
get_object_osd_set(pg, oid, &recheck_state);
|
||||
if (recheck_state != *object_state)
|
||||
{
|
||||
recheck_state->ref_count++;
|
||||
(*object_state)->ref_count--;
|
||||
*object_state = recheck_state;
|
||||
return;
|
||||
}
|
||||
(*object_state)->object_count--;
|
||||
if ((*object_state)->state & OBJ_CORRUPTED)
|
||||
{
|
||||
this->corrupted_objects--;
|
||||
pg.corrupted_count--;
|
||||
}
|
||||
bool changed = false;
|
||||
if ((*object_state)->state & OBJ_INCOMPLETE)
|
||||
{
|
||||
// Successful write means that object is not incomplete anymore
|
||||
this->incomplete_objects--;
|
||||
@@ -277,41 +429,52 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
|
||||
if (!pg.incomplete_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_INCOMPLETE;
|
||||
report_pg_state(pg);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
else if (object_state->state & OBJ_DEGRADED)
|
||||
else if ((*object_state)->state & OBJ_DEGRADED)
|
||||
{
|
||||
this->degraded_objects--;
|
||||
pg.degraded_objects.erase(oid);
|
||||
if (!pg.degraded_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_DEGRADED;
|
||||
report_pg_state(pg);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
else if (object_state->state & OBJ_MISPLACED)
|
||||
else if ((*object_state)->state & OBJ_MISPLACED)
|
||||
{
|
||||
this->misplaced_objects--;
|
||||
pg.misplaced_objects.erase(oid);
|
||||
if (!pg.misplaced_objects.size())
|
||||
{
|
||||
pg.state = pg.state & ~PG_HAS_MISPLACED;
|
||||
report_pg_state(pg);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("BUG: Invalid object state: "+std::to_string(object_state->state));
|
||||
throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
|
||||
}
|
||||
if (changed && report)
|
||||
{
|
||||
report_pg_state(pg);
|
||||
}
|
||||
}
|
||||
|
||||
void osd_t::free_object_state(pg_t & pg, pg_osd_set_state_t **object_state)
|
||||
void osd_t::deref_object_state(pg_t & pg, pg_osd_set_state_t **object_state, bool deref)
|
||||
{
|
||||
if (*object_state && !(--(*object_state)->object_count))
|
||||
if (*object_state)
|
||||
{
|
||||
pg.state_dict.erase((*object_state)->osd_set);
|
||||
*object_state = NULL;
|
||||
if (deref)
|
||||
{
|
||||
(*object_state)->ref_count--;
|
||||
}
|
||||
if (!(*object_state)->object_count && !(*object_state)->ref_count)
|
||||
{
|
||||
pg.state_dict.erase((*object_state)->osd_set);
|
||||
*object_state = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -341,21 +504,28 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
|
||||
}
|
||||
resume_1:
|
||||
// Determine which OSDs contain this object and delete it
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
||||
if (op_data->object_state)
|
||||
{
|
||||
op_data->object_state->ref_count++;
|
||||
}
|
||||
// Submit 1 read to determine the actual version number
|
||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
||||
op_data->prev_set = NULL;
|
||||
resume_2:
|
||||
op_data->st = 2;
|
||||
return;
|
||||
resume_3:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
// Check CAS version
|
||||
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
cur_op->reply.hdr.retval = -EINTR;
|
||||
cur_op->reply.rw.version = op_data->fact_ver;
|
||||
goto continue_others;
|
||||
@@ -371,6 +541,7 @@ resume_4:
|
||||
resume_5:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
@@ -383,8 +554,8 @@ resume_5:
|
||||
}
|
||||
else
|
||||
{
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
remove_object_from_state(op_data->oid, &op_data->object_state, pg);
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
}
|
||||
pg.total_count--;
|
||||
cur_op->reply.hdr.retval = 0;
|
||||
|
@@ -9,6 +9,7 @@
|
||||
#define SUBMIT_READ 0
|
||||
#define SUBMIT_RMW_READ 1
|
||||
#define SUBMIT_WRITE 2
|
||||
#define SUBMIT_SCRUB_READ 3
|
||||
|
||||
struct unstable_osd_num_t
|
||||
{
|
||||
@@ -50,6 +51,7 @@ struct osd_primary_op_data_t
|
||||
// for read_bitmaps
|
||||
void *snapshot_bitmaps;
|
||||
inode_t *read_chain;
|
||||
pg_osd_set_state_t **chain_states;
|
||||
uint8_t *missing_flags;
|
||||
int chain_size;
|
||||
osd_chain_read_t *chain_reads;
|
||||
|
@@ -40,10 +40,24 @@ resume_3:
|
||||
resume_4:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
finish_op(cur_op, op_data->errcode);
|
||||
return;
|
||||
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||
{
|
||||
// Handle corrupted reads and retry...
|
||||
check_corrupted_chained(pg, cur_op);
|
||||
free(cur_op->buf);
|
||||
cur_op->buf = NULL;
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
// FIXME: We can in theory retry only specific parts instead of the whole operation
|
||||
goto resume_1;
|
||||
}
|
||||
else
|
||||
{
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
finish_op(cur_op, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
}
|
||||
send_chained_read_results(pg, cur_op);
|
||||
finish_op(cur_op, cur_op->req.rw.len);
|
||||
@@ -131,8 +145,7 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
|
||||
object_id cur_oid = { .inode = op_data->read_chain[chain_num], .stripe = op_data->oid.stripe };
|
||||
auto vo_it = pg.ver_override.find(cur_oid);
|
||||
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||
pg_osd_set_state_t *object_state;
|
||||
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
||||
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_num]);
|
||||
if (pg.scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
osd_num_t read_target = 0;
|
||||
@@ -219,7 +232,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
op_data->n_subops++;
|
||||
}
|
||||
}
|
||||
if (op_data->n_subops > 0)
|
||||
if (op_data->n_subops)
|
||||
{
|
||||
op_data->fact_ver = 0;
|
||||
op_data->done = op_data->errors = 0;
|
||||
@@ -247,6 +260,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
|
||||
osd_op_t *subop = op_data->subops+subop_idx;
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
// FIXME: Use the pre-allocated buffer
|
||||
assert(!subop->buf);
|
||||
subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev));
|
||||
subop->req = (osd_any_op_t){
|
||||
.sec_read_bmp = {
|
||||
@@ -375,6 +389,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
op_data->chain_read_count = chain_reads.size();
|
||||
op_data->chain_reads = (osd_chain_read_t*)calloc_or_die(
|
||||
1, sizeof(osd_chain_read_t) * chain_reads.size()
|
||||
// FIXME: Allocate only <chain_reads.size()> instead of <chain_size> stripes
|
||||
// (but it's slightly harder to handle in send_chained_read_results())
|
||||
+ sizeof(osd_rmw_stripe_t) * stripe_count * op_data->chain_size
|
||||
);
|
||||
osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)(
|
||||
@@ -403,8 +419,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
uint64_t *cur_set = pg.cur_set.data();
|
||||
if (pg.state != PG_ACTIVE)
|
||||
{
|
||||
pg_osd_set_state_t *object_state;
|
||||
cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
||||
cur_set = get_object_osd_set(pg, cur_oid, &op_data->chain_states[chain_reads[cri].chain_pos]);
|
||||
if (op_data->scheme != POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
if (extend_missing_stripes(stripes, cur_set, pg.pg_data_size, pg.pg_size) < 0)
|
||||
@@ -416,6 +431,17 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
}
|
||||
op_data->degraded = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
|
||||
if (cur_state && (cur_state->state & OBJ_INCOMPLETE))
|
||||
{
|
||||
free(op_data->chain_reads);
|
||||
op_data->chain_reads = NULL;
|
||||
finish_op(cur_op, -EIO);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
@@ -433,6 +459,7 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
}
|
||||
}
|
||||
}
|
||||
assert(!cur_op->buf);
|
||||
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, read_buffer_size);
|
||||
void *cur_buf = cur_op->buf;
|
||||
for (int cri = 0; cri < chain_reads.size(); cri++)
|
||||
@@ -468,12 +495,8 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
object_id cur_oid = { .inode = chain_reads[cri].inode, .stripe = op_data->oid.stripe };
|
||||
auto vo_it = pg.ver_override.find(cur_oid);
|
||||
uint64_t target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
|
||||
uint64_t *cur_set = pg.cur_set.data();
|
||||
if (pg.state != PG_ACTIVE)
|
||||
{
|
||||
pg_osd_set_state_t *object_state;
|
||||
cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
|
||||
}
|
||||
auto cur_state = op_data->chain_states[chain_reads[cri].chain_pos];
|
||||
uint64_t *cur_set = (pg.state != PG_ACTIVE && cur_state ? cur_state->read_target.data() : pg.cur_set.data());
|
||||
int zero_read = -1;
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
@@ -487,6 +510,33 @@ int osd_t::submit_chained_read_requests(pg_t & pg, osd_op_t *cur_op)
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Walks over all chained reads of <cur_op> and calls mark_object_corrupted()
// for every object that has at least one stripe with read_error set.
void osd_t::check_corrupted_chained(pg_t & pg, osd_op_t *cur_op)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
    // Replicated pools use a single stripe per object, other schemes use pg_size stripes
    int stripe_count = 1;
    if (pg.scheme != POOL_SCHEME_REPLICATED)
    {
        stripe_count = pg.pg_size;
    }
    // The stripe array is located right after the chain_reads array in the same buffer
    uint8_t *stripes_start = (uint8_t*)op_data->chain_reads + sizeof(osd_chain_read_t) * op_data->chain_read_count;
    osd_rmw_stripe_t *chain_stripes = (osd_rmw_stripe_t*)stripes_start;
    for (int cri = 0; cri < op_data->chain_read_count; cri++)
    {
        auto & chain_read = op_data->chain_reads[cri];
        object_id cur_oid = { .inode = chain_read.inode, .stripe = op_data->oid.stripe };
        osd_rmw_stripe_t *stripes = chain_stripes + chain_read.chain_pos*stripe_count;
        // Find the first stripe with a read error, if any
        int bad = 0;
        while (bad < stripe_count && !stripes[bad].read_error)
        {
            bad++;
        }
        if (bad < stripe_count)
        {
            mark_object_corrupted(pg, cur_oid, op_data->chain_states[chain_read.chain_pos], stripes, false);
        }
    }
}
|
||||
|
||||
void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
|
||||
{
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
|
@@ -9,6 +9,7 @@ void osd_t::autosync()
|
||||
{
|
||||
autosync_op = new osd_op_t();
|
||||
autosync_op->op_type = OSD_OP_IN;
|
||||
autosync_op->peer_fd = -1;
|
||||
autosync_op->req = (osd_any_op_t){
|
||||
.sync = {
|
||||
.header = {
|
||||
@@ -53,10 +54,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
|
||||
inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++;
|
||||
inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec;
|
||||
if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
|
||||
{
|
||||
if (cur_op->op_data)
|
||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
|
||||
}
|
||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
|
||||
else
|
||||
inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len;
|
||||
}
|
||||
@@ -142,34 +140,40 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
// We always submit zero-length writes to all replicas, even if the stripe is not modified
|
||||
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role))
|
||||
if (!(wr || !rep && stripes[role].read_end != 0 || zero_read == role || submit_type == SUBMIT_SCRUB_READ))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
osd_num_t role_osd_num = osd_set[role];
|
||||
int stripe_num = rep ? 0 : role;
|
||||
if (role_osd_num != 0)
|
||||
{
|
||||
int stripe_num = rep ? 0 : role;
|
||||
osd_op_t *subop = op_data->subops + i;
|
||||
stripes[stripe_num].osd_num = role_osd_num;
|
||||
stripes[stripe_num].read_error = false;
|
||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||
subop->bitmap_len = clean_entry_bitmap_size;
|
||||
// Using rmw_buf to pass pointer to stripes. Dirty but should work
|
||||
subop->rmw_buf = stripes+stripe_num;
|
||||
if (role_osd_num == this->osd_num)
|
||||
{
|
||||
clock_gettime(CLOCK_REALTIME, &subop->tv_begin);
|
||||
subop->op_type = (uint64_t)cur_op;
|
||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||
subop->bitmap_len = clean_entry_bitmap_size;
|
||||
subop->bs_op = new blockstore_op_t({
|
||||
subop->bs_op = new blockstore_op_t((blockstore_op_t){
|
||||
.opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
|
||||
.callback = [subop, this](blockstore_op_t *bs_subop)
|
||||
{
|
||||
handle_primary_bs_subop(subop);
|
||||
},
|
||||
.oid = {
|
||||
.inode = inode,
|
||||
.stripe = op_data->oid.stripe | stripe_num,
|
||||
{
|
||||
.oid = (object_id){
|
||||
.inode = inode,
|
||||
.stripe = op_data->oid.stripe | stripe_num,
|
||||
},
|
||||
.version = op_version,
|
||||
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
|
||||
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
|
||||
},
|
||||
.version = op_version,
|
||||
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
|
||||
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
|
||||
.buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
|
||||
.bitmap = stripes[stripe_num].bmp_buf,
|
||||
});
|
||||
@@ -185,8 +189,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
else
|
||||
{
|
||||
subop->op_type = OSD_OP_OUT;
|
||||
subop->bitmap = stripes[stripe_num].bmp_buf;
|
||||
subop->bitmap_len = clean_entry_bitmap_size;
|
||||
subop->req.sec_rw = {
|
||||
.header = {
|
||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||
@@ -243,6 +245,10 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
|
||||
}
|
||||
i++;
|
||||
}
|
||||
else
|
||||
{
|
||||
stripes[stripe_num].osd_num = 0;
|
||||
}
|
||||
}
|
||||
return i-subop_idx;
|
||||
}
|
||||
@@ -332,9 +338,11 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
|
||||
{
|
||||
printf(
|
||||
"%s subop to %lx:%lx v%lu failed on peer %d: retval = %d (expected %d)\n",
|
||||
subop->peer_fd >= 0
|
||||
? "%1$s subop to %2$lx:%3$lx v%4$lu failed on peer %7$d: retval = %5$d (expected %6$d)\n"
|
||||
: "%1$s subop to %2$lx:%3$lx v%4$lu failed locally: retval = %5$d (expected %6$d)\n",
|
||||
osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
|
||||
subop->peer_fd, retval, expected
|
||||
retval, expected, subop->peer_fd
|
||||
);
|
||||
}
|
||||
else
|
||||
@@ -344,22 +352,32 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||
osd_op_names[opcode], subop->peer_fd, retval, expected
|
||||
);
|
||||
}
|
||||
// Error priority: EIO > ENOSPC > EPIPE
|
||||
if (op_data->errcode == 0 || retval == -EIO ||
|
||||
if (opcode == OSD_OP_SEC_READ && (retval == -EIO || retval == -EDOM))
|
||||
{
|
||||
// We'll retry reads from other replica(s) on EIO/EDOM and mark object as corrupted
|
||||
((osd_rmw_stripe_t*)subop->rmw_buf)->read_error = true;
|
||||
}
|
||||
subop->rmw_buf = NULL;
|
||||
// Error priority: EIO > EDOM > ENOSPC > EPIPE
|
||||
if (op_data->errcode == 0 ||
|
||||
retval == -EIO ||
|
||||
retval == -EDOM && (op_data->errcode == -ENOSPC || op_data->errcode == -EPIPE) ||
|
||||
retval == -ENOSPC && op_data->errcode == -EPIPE)
|
||||
{
|
||||
op_data->errcode = retval;
|
||||
}
|
||||
op_data->errors++;
|
||||
if (subop->peer_fd >= 0 && (opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE ||
|
||||
retval != -ENOSPC))
|
||||
if (subop->peer_fd >= 0 && retval != -EDOM &&
|
||||
(retval != -ENOSPC || opcode != OSD_OP_SEC_WRITE && opcode != OSD_OP_SEC_WRITE_STABLE) &&
|
||||
(retval != -EIO || opcode != OSD_OP_SEC_READ))
|
||||
{
|
||||
// Drop connection on any error except ENOSPC
|
||||
// Drop connection on unexpected errors
|
||||
msgr.stop_client(subop->peer_fd);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
subop->rmw_buf = NULL;
|
||||
op_data->done++;
|
||||
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
|
||||
{
|
||||
@@ -403,6 +421,10 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
|
||||
{
|
||||
continue_primary_del(cur_op);
|
||||
}
|
||||
else if (cur_op->req.hdr.opcode == OSD_OP_SCRUB)
|
||||
{
|
||||
continue_primary_scrub(cur_op);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("BUG: unknown opcode");
|
||||
@@ -472,7 +494,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
|
||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||
op_data->n_subops = chunks_to_delete_count;
|
||||
op_data->done = op_data->errors = op_data->errcode = 0;
|
||||
if (op_data->n_subops <= 0)
|
||||
if (!op_data->n_subops)
|
||||
{
|
||||
return;
|
||||
}
|
||||
@@ -606,7 +628,9 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||
{
|
||||
handle_primary_bs_subop(subop);
|
||||
},
|
||||
.len = (uint32_t)stab_osd.len,
|
||||
{
|
||||
.len = (uint32_t)stab_osd.len,
|
||||
},
|
||||
.buf = (void*)(op_data->unstable_writes + stab_osd.start),
|
||||
});
|
||||
bs->enqueue_op(subops[i].bs_op);
|
||||
|
@@ -166,7 +166,7 @@ resume_6:
|
||||
for (int i = 0; i < unstable_osd.len; i++)
|
||||
{
|
||||
// Except those from peered PGs
|
||||
auto & w = op_data->unstable_writes[unstable_osd.start + i];
|
||||
auto & w = op_data->unstable_writes[i];
|
||||
pool_pg_num_t wpg = {
|
||||
.pool_id = INODE_POOL(w.oid.inode),
|
||||
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
|
||||
|
@@ -12,7 +12,6 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
|
||||
.oid = op_data->oid,
|
||||
.osd_num = 0,
|
||||
});
|
||||
op_data->st = 1;
|
||||
if (act_it != pg.flush_actions.end() &&
|
||||
act_it->first.oid.inode == op_data->oid.inode &&
|
||||
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
|
||||
@@ -24,6 +23,7 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
|
||||
auto vo_it = pg.write_queue.find(op_data->oid);
|
||||
if (vo_it != pg.write_queue.end())
|
||||
{
|
||||
op_data->st = 1;
|
||||
pg.write_queue.emplace(op_data->oid, cur_op);
|
||||
return false;
|
||||
}
|
||||
@@ -58,7 +58,13 @@ resume_1:
|
||||
// Determine blocks to read and write
|
||||
// Missing chunks are allowed to be overwritten even in incomplete objects
|
||||
// FIXME: Allow to do small writes to the old (degraded/misplaced) OSD set for lower performance impact
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
|
||||
op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
|
||||
if (op_data->object_state)
|
||||
{
|
||||
// Protect object_state from being freed by a parallel read operation changing it
|
||||
op_data->object_state->ref_count++;
|
||||
}
|
||||
retry_1:
|
||||
if (op_data->scheme == POOL_SCHEME_REPLICATED)
|
||||
{
|
||||
// Simplified algorithm
|
||||
@@ -68,6 +74,12 @@ resume_1:
|
||||
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
|
||||
op_data->stripes[0].write_end != bs_block_size))
|
||||
{
|
||||
if (op_data->object_state->state & OBJ_INCOMPLETE)
|
||||
{
|
||||
// Refuse partial overwrite of an incomplete (corrupted) object
|
||||
cur_op->reply.hdr.retval = -EIO;
|
||||
goto continue_others;
|
||||
}
|
||||
// Object is degraded/misplaced and will be moved to <write_osd_set>
|
||||
op_data->stripes[0].read_start = 0;
|
||||
op_data->stripes[0].read_end = bs_block_size;
|
||||
@@ -81,24 +93,66 @@ resume_1:
|
||||
if (!cur_op->rmw_buf)
|
||||
{
|
||||
// Refuse partial overwrite of an incomplete object
|
||||
cur_op->reply.hdr.retval = -EINVAL;
|
||||
cur_op->reply.hdr.retval = -EIO;
|
||||
goto continue_others;
|
||||
}
|
||||
}
|
||||
// Read required blocks
|
||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
||||
{
|
||||
if (op_data->object_state && (op_data->object_state->state & OBJ_INCOMPLETE))
|
||||
{
|
||||
// Allow to read version number (just version number!) from corrupted chunks
|
||||
// to allow full overwrite of a corrupted object
|
||||
bool found = false;
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
if (op_data->prev_set[role] != 0 || op_data->stripes[role].read_end > op_data->stripes[role].read_start)
|
||||
{
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
osd_num_t corrupted_target[op_data->pg_size];
|
||||
for (int role = 0; role < op_data->pg_size; role++)
|
||||
{
|
||||
corrupted_target[role] = 0;
|
||||
}
|
||||
for (auto & loc: op_data->object_state->osd_set)
|
||||
{
|
||||
if (!(loc.loc_bad & LOC_OUTDATED) && !corrupted_target[loc.role])
|
||||
{
|
||||
corrupted_target[loc.role] = loc.osd_num;
|
||||
}
|
||||
}
|
||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, corrupted_target, cur_op);
|
||||
goto resume_2;
|
||||
}
|
||||
}
|
||||
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
|
||||
}
|
||||
resume_2:
|
||||
op_data->st = 2;
|
||||
return;
|
||||
resume_3:
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
|
||||
{
|
||||
// Mark object corrupted and retry
|
||||
op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, true);
|
||||
op_data->prev_set = op_data->object_state ? op_data->object_state->read_target.data() : pg.cur_set.data();
|
||||
goto retry_1;
|
||||
}
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
// Check CAS version
|
||||
if (cur_op->req.rw.version && op_data->fact_ver != (cur_op->req.rw.version-1))
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
cur_op->reply.hdr.retval = -EINTR;
|
||||
cur_op->reply.rw.version = op_data->fact_ver;
|
||||
goto continue_others;
|
||||
@@ -182,6 +236,7 @@ resume_10:
|
||||
// Recheck PG state after reporting history - maybe it's already stopping/restarting
|
||||
if (pg.state & (PG_STOPPING|PG_REPEERING))
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
|
||||
return;
|
||||
}
|
||||
@@ -197,6 +252,7 @@ resume_5:
|
||||
}
|
||||
if (op_data->errors > 0)
|
||||
{
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
pg_cancel_write_queue(pg, cur_op, op_data->oid, op_data->errcode);
|
||||
return;
|
||||
}
|
||||
@@ -205,7 +261,7 @@ resume_5:
|
||||
// We must forget the unclean state of the object before deleting it
|
||||
// so the next reads don't accidentally read a deleted version
|
||||
// And it should be done at the same time as the removal of the version override
|
||||
remove_object_from_state(op_data->oid, op_data->object_state, pg);
|
||||
remove_object_from_state(op_data->oid, &op_data->object_state, pg);
|
||||
pg.clean_count++;
|
||||
}
|
||||
resume_6:
|
||||
@@ -260,12 +316,12 @@ resume_7:
|
||||
copies_to_delete_after_sync_count++;
|
||||
}
|
||||
}
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
submit_primary_del_subops(cur_op, pg.cur_set.data(), pg.pg_size, op_data->object_state->osd_set);
|
||||
free_object_state(pg, &op_data->object_state);
|
||||
deref_object_state(pg, &op_data->object_state, true);
|
||||
if (op_data->n_subops > 0)
|
||||
{
|
||||
resume_8:
|
||||
|
@@ -142,11 +142,11 @@ inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
|
||||
for (int i = 0; i < a.size && i < b.size; i++)
|
||||
{
|
||||
if (a.data[i] < b.data[i])
|
||||
return true;
|
||||
return -1;
|
||||
else if (a.data[i] > b.data[i])
|
||||
return false;
|
||||
return 1;
|
||||
}
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct reed_sol_matrix_t
|
||||
@@ -677,11 +677,11 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
|
||||
static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, uint32_t wr_end, buf_len_t *bufs, int & nbufs)
|
||||
{
|
||||
uint32_t ns = 0, ne = 0, os = 0, oe = 0;
|
||||
if (stripe.write_end > wr_start &&
|
||||
stripe.write_start < wr_end)
|
||||
if (stripe.req_end > wr_start &&
|
||||
stripe.req_start < wr_end)
|
||||
{
|
||||
ns = std::max(stripe.write_start, wr_start);
|
||||
ne = std::min(stripe.write_end, wr_end);
|
||||
ns = std::max(stripe.req_start, wr_start);
|
||||
ne = std::min(stripe.req_end, wr_end);
|
||||
}
|
||||
if (stripe.read_end > wr_start &&
|
||||
stripe.read_start < wr_end)
|
||||
@@ -692,7 +692,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
|
||||
if (ne && (!oe || ns <= os))
|
||||
{
|
||||
// NEW or NEW->OLD
|
||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
|
||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
|
||||
if (os < ne)
|
||||
os = ne;
|
||||
if (oe > os)
|
||||
@@ -708,7 +708,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
|
||||
{
|
||||
// OLD->NEW or OLD->NEW->OLD
|
||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.read_buf + os - stripe.read_start, .len = ns-os };
|
||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
|
||||
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
|
||||
if (oe > ne)
|
||||
{
|
||||
// OLD->NEW->OLD
|
||||
|
@@ -25,7 +25,9 @@ struct osd_rmw_stripe_t
|
||||
uint32_t req_start, req_end;
|
||||
uint32_t read_start, read_end;
|
||||
uint32_t write_start, write_end;
|
||||
bool missing;
|
||||
osd_num_t osd_num;
|
||||
bool missing: 1;
|
||||
bool read_error: 1;
|
||||
};
|
||||
|
||||
// Here pg_minsize is the number of data chunks, not the minimum number of alive OSDs for the PG to operate
|
||||
|
@@ -17,7 +17,6 @@ void test4();
|
||||
void test5();
|
||||
void test6();
|
||||
void test7();
|
||||
void test_rmw_4k_degraded_into_lost_to_normal(bool ec);
|
||||
void test8();
|
||||
void test9();
|
||||
void test10();
|
||||
@@ -40,8 +39,6 @@ int main(int narg, char *args[])
|
||||
test6();
|
||||
// Test 7
|
||||
test7();
|
||||
test_rmw_4k_degraded_into_lost_to_normal(false);
|
||||
test_rmw_4k_degraded_into_lost_to_normal(true);
|
||||
// Test 8
|
||||
test8();
|
||||
// Test 9
|
||||
@@ -319,69 +316,6 @@ void test7()
|
||||
|
||||
/***
|
||||
|
||||
7/2. calc_rmw(offset=48K, len=4K, osd_set=[0,2,3], write_set=[1,2,3])
|
||||
= {
|
||||
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
|
||||
write: [ [ 48K, 52K ], [ 0, 0 ], [ 48K, 52K ] ],
|
||||
input buffer: [ write0 ],
|
||||
rmw buffer: [ write2, read0, read1, read2 ],
|
||||
}
|
||||
then, after calc_rmw_parity_xor/ec(): {
|
||||
write: [ [ 0, 128K ], [ 0, 0 ], [ 48K, 52K ] ],
|
||||
write0==read0,
|
||||
}
|
||||
+ check write0, write2 buffers
|
||||
|
||||
***/
|
||||
|
||||
void test_rmw_4k_degraded_into_lost_to_normal(bool ec)
|
||||
{
|
||||
osd_num_t osd_set[3] = { 0, 2, 3 };
|
||||
osd_num_t write_osd_set[3] = { 1, 2, 3 };
|
||||
osd_rmw_stripe_t stripes[3] = {};
|
||||
// Subtest 1
|
||||
split_stripes(2, 128*1024, 48*1024, 4096, stripes);
|
||||
void *write_buf = malloc(4096);
|
||||
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, 0);
|
||||
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
|
||||
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
|
||||
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
|
||||
assert(stripes[0].write_start == 48*1024 && stripes[0].write_end == 52*1024);
|
||||
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
|
||||
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
|
||||
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
|
||||
assert(stripes[1].read_buf == (uint8_t*)rmw_buf+4*1024+128*1024);
|
||||
assert(stripes[2].read_buf == (uint8_t*)rmw_buf+4*1024+2*128*1024);
|
||||
assert(stripes[0].write_buf == write_buf);
|
||||
assert(stripes[1].write_buf == NULL);
|
||||
assert(stripes[2].write_buf == rmw_buf);
|
||||
// Subtest 2
|
||||
set_pattern(write_buf, 4096, PATTERN2);
|
||||
set_pattern(stripes[1].read_buf, 128*1024, PATTERN1);
|
||||
set_pattern(stripes[2].read_buf, 128*1024, PATTERN0^PATTERN1);
|
||||
if (!ec)
|
||||
calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
|
||||
else
|
||||
{
|
||||
use_ec(3, 2, true);
|
||||
calc_rmw_parity_ec(stripes, 3, 2, osd_set, write_osd_set, 128*1024, 0);
|
||||
use_ec(3, 2, false);
|
||||
}
|
||||
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
|
||||
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
|
||||
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
|
||||
assert(stripes[0].write_buf == stripes[0].read_buf);
|
||||
assert(stripes[1].write_buf == NULL);
|
||||
assert(stripes[2].write_buf == rmw_buf);
|
||||
check_pattern(stripes[0].write_buf, 4096, PATTERN0);
|
||||
check_pattern(stripes[0].write_buf+48*1024, 4096, PATTERN2);
|
||||
check_pattern(stripes[2].write_buf, 4096, PATTERN2^PATTERN1); // new parity
|
||||
free(rmw_buf);
|
||||
free(write_buf);
|
||||
}
|
||||
|
||||
/***
|
||||
|
||||
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
|
||||
= {
|
||||
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],
|
||||
|
531
src/osd_scrub.cpp
Normal file
531
src/osd_scrub.cpp
Normal file
@@ -0,0 +1,531 @@
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1 (see README.md for details)
|
||||
|
||||
#include "osd_primary.h"
|
||||
|
||||
#define SELF_FD -1
|
||||
|
||||
// Requests the list of objects of PG <pg_id>, starting from <min_oid>, either from
// the local blockstore (when <role_osd> is this OSD) or from the peer OSD <role_osd>.
// The request is remembered in scrub_list_op while it is in flight. On success the
// callback stores the result in scrub_cur_list and calls continue_scrub().
void osd_t::scrub_list(pool_pg_num_t pg_id, osd_num_t role_osd, object_id min_oid)
{
    pool_id_t pool_id = pg_id.pool_id;
    pg_num_t pg_num = pg_id.pg_num;
    // Only one listing may be in progress at a time
    assert(!scrub_list_op);
    if (role_osd != this->osd_num)
    {
        // Peer
        osd_op_t *peer_op = new osd_op_t();
        peer_op->op_type = OSD_OP_OUT;
        peer_op->peer_fd = msgr.osd_peer_fds.at(role_osd);
        peer_op->req = (osd_any_op_t){
            .sec_list = {
                .header = {
                    .magic = SECONDARY_OSD_OP_MAGIC,
                    .id = msgr.next_subop_id++,
                    .opcode = OSD_OP_SEC_LIST,
                },
                .list_pg = pg_num,
                .pg_count = pg_counts[pool_id],
                .pg_stripe_size = st_cli.pool_config[pool_id].pg_stripe_size,
                .min_inode = min_oid.inode ? min_oid.inode : ((uint64_t)(pool_id) << (64 - POOL_ID_BITS)),
                .max_inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1,
                .min_stripe = min_oid.stripe,
                .stable_limit = scrub_list_limit,
            },
        };
        // NOTE(review): the callback deletes the op it is stored in and keeps using
        // captured state afterwards - presumably the invoker tolerates that; confirm
        peer_op->callback = [this, role_osd](osd_op_t *op)
        {
            scrub_list_op = NULL;
            auto retval = op->reply.hdr.retval;
            if (retval < 0)
            {
                printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, retval);
                int fail_fd = op->peer_fd;
                delete op;
                msgr.stop_client(fail_fd);
                return;
            }
            scrub_cur_list = {
                .buf = (obj_ver_id*)op->buf,
                .total_count = (uint64_t)retval,
                .stable_count = op->reply.sec_list.stable_count,
            };
            // set op->buf to NULL so it doesn't get freed
            op->buf = NULL;
            delete op;
            continue_scrub();
        };
        scrub_list_op = peer_op;
        msgr.outbox_push(peer_op);
        return;
    }
    // Self
    osd_op_t *local_op = new osd_op_t();
    local_op->op_type = 0;
    local_op->peer_fd = SELF_FD;
    clock_gettime(CLOCK_REALTIME, &local_op->tv_begin);
    local_op->bs_op = new blockstore_op_t();
    auto list_bs = local_op->bs_op;
    list_bs->opcode = BS_OP_LIST;
    list_bs->pg_alignment = st_cli.pool_config[pool_id].pg_stripe_size;
    if (min_oid.inode == 0 && min_oid.stripe == 0)
    {
        // No start position given - begin from the first inode of the pool
        list_bs->min_oid.inode = ((uint64_t)pool_id << (64 - POOL_ID_BITS));
    }
    else
    {
        list_bs->min_oid = min_oid;
    }
    list_bs->max_oid.inode = ((uint64_t)(pool_id+1) << (64 - POOL_ID_BITS)) - 1;
    list_bs->max_oid.stripe = UINT64_MAX;
    list_bs->list_stable_limit = scrub_list_limit;
    list_bs->pg_count = pg_counts[pool_id];
    list_bs->pg_number = pg_num-1;
    list_bs->callback = [this, local_op](blockstore_op_t *bs_op)
    {
        scrub_list_op = NULL;
        auto retval = local_op->bs_op->retval;
        if (retval < 0)
        {
            printf("Local OP_LIST failed: retval=%d\n", retval);
            force_stop(1);
            return;
        }
        add_bs_subop_stats(local_op);
        // The number of stable entries is taken from the <version> field of the finished op
        scrub_cur_list = {
            .buf = (obj_ver_id*)local_op->bs_op->buf,
            .total_count = (uint64_t)retval,
            .stable_count = local_op->bs_op->version,
        };
        delete local_op->bs_op;
        local_op->bs_op = NULL;
        delete local_op;
        continue_scrub();
    };
    scrub_list_op = local_op;
    bs->enqueue_op(local_op->bs_op);
}
|
||||
|
||||
// Selects the next piece of scrub work.
//
// Returns false when there is nothing to scrub. Returns true either after storing
// the next object to scrub in <next_oid>, or after starting an asynchronous object
// listing with scrub_list() - in the latter case <next_oid> is left untouched and
// scrub_list() leaves the listing operation in scrub_list_op.
bool osd_t::pick_next_scrub(object_id & next_oid)
{
    if (!pgs.size())
    {
        // No PGs at all - drop the cached listing and forget the last scrubbed PG
        if (scrub_cur_list.buf)
        {
            free(scrub_cur_list.buf);
            scrub_cur_list = {};
            scrub_last_pg = {};
        }
        return false;
    }
    timespec tv_now;
    clock_gettime(CLOCK_REALTIME, &tv_now);
    // If we don't start from the very beginning then PGs before scrub_last_pg
    // have to be checked in a second pass
    bool rescan = scrub_last_pg.pool_id != 0 || scrub_last_pg.pg_num != 0;
    // Restart scanning from the same PG as the last time
    auto pg_it = pgs.lower_bound(scrub_last_pg);
    if (pg_it == pgs.end() && rescan)
    {
        // There are no PGs at or after scrub_last_pg anymore. Wrap around right away,
        // otherwise the loop below would not run at all and earlier PGs would be skipped
        pg_it = pgs.begin();
        rescan = false;
    }
    while (pg_it != pgs.end())
    {
        if (pg_it->second.state & PG_ACTIVE)
        {
            auto & pool_cfg = st_cli.pool_config.at(pg_it->first.pool_id);
            auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
            if (pg_it->second.scrub_ts < tv_now.tv_sec-interval)
            {
                // Continue scrubbing from the next object
                if (scrub_last_pg == pg_it->first)
                {
                    while (scrub_list_pos < scrub_cur_list.total_count)
                    {
                        auto oid = scrub_cur_list.buf[scrub_list_pos].oid;
                        oid.stripe &= ~STRIPE_MASK;
                        scrub_list_pos++;
                        // Skip objects which are already being recovered or scrubbed
                        if (recovery_ops.find(oid) == recovery_ops.end() &&
                            scrub_ops.find(oid) == scrub_ops.end())
                        {
                            next_oid = oid;
                            if (!(pg_it->second.state & PG_SCRUBBING))
                            {
                                // Currently scrubbing this PG
                                pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
                                report_pg_state(pg_it->second);
                            }
                            return true;
                        }
                    }
                }
                // NOTE(review): an empty listing (total_count == 0) is not treated as
                // the end of the list here, so an empty PG looks like it is listed again
                // on every call - confirm whether that can happen in practice
                if (scrub_last_pg == pg_it->first &&
                    scrub_cur_list.total_count && scrub_list_pos >= scrub_cur_list.total_count &&
                    scrub_cur_list.stable_count < scrub_list_limit)
                {
                    // End of the list, mark this PG as scrubbed and go to the next PG
                }
                else
                {
                    // Continue listing. Start from the beginning by default and continue
                    // after the last stable entry when the previous listing of the same PG
                    // returned any. This position must be taken BEFORE the list is freed
                    object_id scrub_last_oid = {};
                    if (scrub_last_pg == pg_it->first && scrub_cur_list.stable_count > 0)
                    {
                        scrub_last_oid = scrub_cur_list.buf[scrub_cur_list.stable_count-1].oid;
                        scrub_last_oid.stripe++;
                    }
                    // Prefer listing from ourselves, otherwise use the first OSD of the set
                    osd_num_t scrub_osd = 0;
                    for (osd_num_t pg_osd: pg_it->second.cur_set)
                    {
                        if (pg_osd == this->osd_num || scrub_osd == 0)
                            scrub_osd = pg_osd;
                    }
                    if (!(pg_it->second.state & PG_SCRUBBING))
                    {
                        // Currently scrubbing this PG
                        pg_it->second.state = pg_it->second.state | PG_SCRUBBING;
                        report_pg_state(pg_it->second);
                    }
                    if (scrub_cur_list.buf)
                    {
                        free(scrub_cur_list.buf);
                        scrub_cur_list = {};
                    }
                    // The new listing has to be consumed from its first entry
                    scrub_list_pos = 0;
                    scrub_last_pg = pg_it->first;
                    scrub_list(pg_it->first, scrub_osd, scrub_last_oid);
                    return true;
                }
            }
            if (pg_it->second.state & PG_SCRUBBING)
            {
                // This PG is done: remember the scrub time and clear the scrubbing flag
                pg_it->second.scrub_ts = tv_now.tv_sec;
                pg_it->second.state = pg_it->second.state & ~PG_SCRUBBING;
                pg_it->second.history_changed = true;
                report_pg_state(pg_it->second);
                schedule_scrub(pg_it->second);
            }
            // The list is definitely not needed anymore
            if (scrub_cur_list.buf)
            {
                free(scrub_cur_list.buf);
                scrub_cur_list = {};
            }
        }
        pg_it++;
        if (pg_it == pgs.end() && rescan)
        {
            // Scan one more time to guarantee that there are no PGs to scrub
            pg_it = pgs.begin();
            rescan = false;
        }
    }
    // Scanned all PGs - no more scrubs to do
    return false;
}
|
||||
|
||||
// Creates an OSD_OP_SCRUB operation for object <oid>, registers it in scrub_ops
// and passes it to exec_op(). When the operation finishes, its result is logged,
// the object is removed from scrub_ops and continue_scrub() is called - through
// a timer when scrub_sleep_ms is non-zero, immediately otherwise.
void osd_t::submit_scrub_op(object_id oid)
{
    osd_op_t *scrub_op = new osd_op_t();
    scrub_op->op_type = OSD_OP_OUT;
    scrub_op->req = (osd_any_op_t){
        .rw = {
            .header = {
                .magic = SECONDARY_OSD_OP_MAGIC,
                .id = 1,
                .opcode = OSD_OP_SCRUB,
            },
            .inode = oid.inode,
            .offset = oid.stripe,
            .len = 0,
        },
    };
    if (log_level > 2)
    {
        printf("Submitting scrub for %lx:%lx\n", oid.inode, oid.stripe);
    }
    scrub_op->callback = [this](osd_op_t *done_op)
    {
        // Restore the object ID from the request because the op is deleted below
        object_id oid = { .inode = done_op->req.rw.inode, .stripe = done_op->req.rw.offset };
        auto retval = done_op->reply.hdr.retval;
        bool failed = retval < 0 && retval != -ENOENT;
        if (failed)
        {
            // Scrub error
            printf(
                "Scrub failed with object %lx:%lx (PG %u/%u): error %ld\n",
                oid.inode, oid.stripe, INODE_POOL(oid.inode),
                map_to_pg(oid, st_cli.pool_config.at(INODE_POOL(oid.inode)).pg_stripe_size),
                retval
            );
        }
        else if (log_level > 2)
        {
            printf("Scrubbed %lx:%lx OK\n", oid.inode, oid.stripe);
        }
        delete done_op;
        if (!scrub_sleep_ms)
        {
            scrub_ops.erase(oid);
            continue_scrub();
            return;
        }
        // Pause before picking up the next object
        this->tfd->set_timer(scrub_sleep_ms, false, [this, oid](int timer_id)
        {
            scrub_ops.erase(oid);
            continue_scrub();
        });
    };
    scrub_ops[oid] = scrub_op;
    exec_op(scrub_op);
}
|
||||
|
||||
// Triggers scrub requests
|
||||
// Scrub reads data from all replicas and compares it
|
||||
// To scrub first we need to read objects listings
|
||||
bool osd_t::continue_scrub()
|
||||
{
|
||||
if (scrub_list_op)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
while (scrub_ops.size() < scrub_queue_depth)
|
||||
{
|
||||
object_id oid;
|
||||
if (pick_next_scrub(oid))
|
||||
submit_scrub_op(oid);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Schedule the next scrub of <pg>.
// The per-pool scrub_interval is used when it is non-zero, global_scrub_interval otherwise.
// If the resulting time (pg.scrub_ts + interval) is earlier than the currently scheduled
// nearest scrub, or nothing is scheduled, the scrub timer is re-armed. If that time has
// already passed, OSD_SCRUBBING is set in peering_state right away and the ring loop is woken up.
void osd_t::schedule_scrub(pg_t & pg)
{
    auto & pool_cfg = st_cli.pool_config.at(pg.pool_id);
    auto interval = pool_cfg.scrub_interval ? pool_cfg.scrub_interval : global_scrub_interval;
    if (scrub_nearest_ts && scrub_nearest_ts <= pg.scrub_ts+interval)
    {
        // An earlier (or equal) scrub is already scheduled, keep it
        return;
    }
    scrub_nearest_ts = pg.scrub_ts+interval;
    timespec now_ts;
    clock_gettime(CLOCK_REALTIME, &now_ts);
    // Drop the previously armed timer, if any
    if (scrub_timer_id >= 0)
    {
        tfd->clear_timer(scrub_timer_id);
        scrub_timer_id = -1;
    }
    if (now_ts.tv_sec > scrub_nearest_ts)
    {
        // Scrub is already overdue - request it immediately
        scrub_nearest_ts = 0;
        peering_state = peering_state | OSD_SCRUBBING;
        ringloop->wakeup();
        return;
    }
    // Otherwise wait until the scheduled second arrives
    scrub_timer_id = tfd->set_timer((scrub_nearest_ts-now_ts.tv_sec)*1000, false, [this](int timer_id)
    {
        scrub_timer_id = -1;
        scrub_nearest_ts = 0;
        peering_state = peering_state | OSD_SCRUBBING;
        ringloop->wakeup();
    });
}
|
||||
|
||||
// Primary-side handler of a scrub operation for a single object.
//
// The function is resumable and dispatches on op_data->st:
//   st == 0: first entry - determine the object's OSD set, allocate buffers and
//            submit reads of all available chunks, then set st = 1
//   st == 1: nothing to do, just return
//   st == 2: reads are done - handle read errors, then compare the copies
//            (replicated pools) or recompute parity and compare it with the
//            stored parity chunks (EC/XOR pools); chunks that don't match are
//            flagged with read_error and the object is marked as corrupted
// NOTE(review): the 1 -> 2 transition is not visible in this function; presumably
// it is performed by the sub-operation completion handler - confirm.
void osd_t::continue_primary_scrub(osd_op_t *cur_op)
{
    if (!cur_op->op_data && !prepare_primary_rw(cur_op))
        return;
    osd_primary_op_data_t *op_data = cur_op->op_data;
    if (op_data->st == 1)
        goto resume_1;
    else if (op_data->st == 2)
        goto resume_2;
    // The initial stage lives in its own scope so that the gotos above don't
    // jump over initialisations of its local variables
    {
        auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
        // Determine version
        auto vo_it = pg.ver_override.find(op_data->oid);
        op_data->target_ver = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
        // PG may have degraded or misplaced objects
        op_data->prev_set = get_object_osd_set(pg, op_data->oid, &op_data->object_state);
        // Read all available chunks
        int n_copies = 0;
        op_data->degraded = false;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            op_data->stripes[role].read_start = 0;
            op_data->stripes[role].read_end = bs_block_size;
            if (op_data->prev_set[role] != 0)
            {
                n_copies++;
            }
            else if (op_data->scheme != POOL_SCHEME_REPLICATED && role < op_data->pg_data_size)
            {
                // A data chunk is missing - it will have to be reconstructed before comparison
                op_data->degraded = true;
            }
        }
        if (n_copies <= op_data->pg_data_size)
        {
            // Nothing to compare, even if we'd like to
            finish_op(cur_op, 0);
            return;
        }
        cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_size,
            op_data->scheme != POOL_SCHEME_REPLICATED ? bs_block_size*(op_data->pg_size-op_data->pg_data_size) : 0);
        // Submit reads
        osd_op_t *subops = new osd_op_t[n_copies];
        op_data->fact_ver = 0;
        op_data->done = op_data->errors = op_data->errcode = 0;
        op_data->n_subops = n_copies;
        op_data->subops = subops;
        int sent = submit_primary_subop_batch(SUBMIT_SCRUB_READ, op_data->oid.inode, op_data->target_ver,
            op_data->stripes, op_data->prev_set, cur_op, 0, -1);
        assert(sent == n_copies);
        op_data->st = 1;
    }
resume_1:
    return;
resume_2:
    if (op_data->errors > 0)
    {
        if (op_data->errcode == -EIO || op_data->errcode == -EDOM)
        {
            // I/O or checksum error
            int n_copies = 0;
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (op_data->stripes[role].read_end != 0 &&
                    !op_data->stripes[role].read_error)
                {
                    n_copies++;
                }
            }
            if (n_copies <= op_data->pg_data_size)
            {
                // Nothing to compare, just mark the object as corrupted
                auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
                // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
                op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
                // Operation is treated as unsuccessful only if the object becomes unreadable
                finish_op(cur_op, n_copies < op_data->pg_data_size ? op_data->errcode : 0);
                return;
            }
            // Proceed, we can still compare chunks that were successfully read
        }
        else
        {
            finish_op(cur_op, op_data->errcode);
            return;
        }
    }
    if (op_data->scheme == POOL_SCHEME_REPLICATED)
    {
        // Check that all chunks have returned the same data
        int total = 0;
        // eq_to[role] is the index of the first copy identical to <role>, or -1 if <role> wasn't read
        int eq_to[op_data->pg_size];
        for (int role = 0; role < op_data->pg_size; role++)
        {
            eq_to[role] = -1;
            if (op_data->stripes[role].read_end != 0 && !op_data->stripes[role].read_error)
            {
                total++;
                eq_to[role] = role;
                for (int other = 0; other < role; other++)
                {
                    // Only compare with unique chunks (eq_to[other] == other)
                    if (eq_to[other] == other && memcmp(op_data->stripes[role].read_buf, op_data->stripes[other].read_buf, bs_block_size) == 0)
                    {
                        eq_to[role] = eq_to[other];
                        break;
                    }
                }
            }
        }
        // votes[role] is the number of copies identical to the unique copy <role>
        int votes[op_data->pg_size];
        for (int role = 0; role < op_data->pg_size; role++)
            votes[role] = 0;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            if (eq_to[role] != -1)
                votes[eq_to[role]]++;
        }
        // Pick the first group with the maximum number of votes.
        // votes[best] must not be evaluated while best is still -1.
        int best = -1;
        for (int role = 0; role < op_data->pg_size; role++)
        {
            if (votes[role] > (best >= 0 ? votes[best] : 0))
                best = role;
        }
        // best == 0 is a valid winner too, so the check has to be >= 0
        if (best >= 0 && votes[best] < total)
        {
            // FIXME Add a flag to allow to skip such objects and not recover them automatically
            bool unknown = false;
            for (int role = 0; role < op_data->pg_size; role++)
            {
                if (role != best && votes[role] == votes[best])
                    unknown = true;
                if (votes[role] > 0 && votes[role] < votes[best])
                {
                    printf(
                        "[PG %u/%u] Object %lx:%lx copy on OSD %lu doesn't match %d other copies, marking it as corrupted\n",
                        INODE_POOL(op_data->oid.inode), op_data->pg_num,
                        op_data->oid.inode, op_data->oid.stripe, op_data->stripes[role].osd_num, votes[best]
                    );
                    op_data->stripes[role].read_error = true;
                }
            }
            if (unknown)
            {
                // It's unknown which replica is good. There are multiple versions with no majority
                best = -1;
            }
        }
    }
    else
    {
        assert(op_data->scheme == POOL_SCHEME_EC || op_data->scheme == POOL_SCHEME_XOR);
        if (op_data->degraded)
        {
            // Reconstruct missing stripes
            // XOR shouldn't come here as it only has 1 parity chunk
            assert(op_data->scheme == POOL_SCHEME_EC);
            reconstruct_stripes_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
        }
        // Generate parity chunks and compare them with actual data
        osd_num_t fake_osd_set[op_data->pg_size];
        for (int i = 0; i < op_data->pg_size; i++)
        {
            fake_osd_set[i] = 1;
            // Parity is generated into the extra space of cur_op->buf, data chunks are used as is
            op_data->stripes[i].write_buf = i >= op_data->pg_data_size
                ? ((uint8_t*)cur_op->buf + (i-op_data->pg_data_size)*bs_block_size)
                : op_data->stripes[i].read_buf;
        }
        if (op_data->scheme == POOL_SCHEME_XOR)
        {
            calc_rmw_parity_xor(op_data->stripes, op_data->pg_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
        }
        else if (op_data->scheme == POOL_SCHEME_EC)
        {
            calc_rmw_parity_ec(op_data->stripes, op_data->pg_size, op_data->pg_data_size, fake_osd_set, fake_osd_set, bs_block_size, clean_entry_bitmap_size);
        }
        // Now compare that write_buf == read_buf
        for (int role = op_data->pg_data_size; role < op_data->pg_size; role++)
        {
            if (op_data->stripes[role].osd_num != 0 && !op_data->stripes[role].read_error &&
                memcmp(op_data->stripes[role].read_buf, op_data->stripes[role].write_buf, bs_block_size) != 0)
            {
                // Chunks don't match - something's wrong... but we don't know what :D
                // FIXME: Try to locate errors (may be possible with >= 2 parity chunks)
                printf(
                    "[PG %u/%u] Object %lx:%lx parity chunk %d on OSD %lu doesn't match data, marking it as corrupted\n",
                    INODE_POOL(op_data->oid.inode), op_data->pg_num,
                    op_data->oid.inode, op_data->oid.stripe,
                    role-op_data->pg_data_size, op_data->stripes[role].osd_num
                );
                op_data->stripes[role].read_error = true;
            }
        }
    }
    for (int role = 0; role < op_data->pg_size; role++)
    {
        // Trigger on a chunk that actually failed (read error or mismatch detected above)
        if (op_data->stripes[role].osd_num != 0 && op_data->stripes[role].read_error)
        {
            // Got at least 1 read error or mismatch, mark the object as corrupted
            auto & pg = pgs.at({ .pool_id = INODE_POOL(op_data->oid.inode), .pg_num = op_data->pg_num });
            // FIXME: ref = true ideally... because new_state != state is not necessarily true if it's freed and recreated
            op_data->object_state = mark_object_corrupted(pg, op_data->oid, op_data->object_state, op_data->stripes, false);
            break;
        }
    }
    finish_op(cur_op, 0);
}
|
@@ -125,11 +125,18 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
|
||||
secondary_op_callback(cur_op);
|
||||
return;
|
||||
}
|
||||
cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
|
||||
cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
|
||||
cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
|
||||
cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode;
|
||||
cur_op->bs_op->version = cur_op->req.sec_list.max_inode;
|
||||
cur_op->bs_op->pg_alignment = cur_op->req.sec_list.pg_stripe_size;
|
||||
cur_op->bs_op->pg_count = cur_op->req.sec_list.pg_count;
|
||||
cur_op->bs_op->pg_number = cur_op->req.sec_list.list_pg - 1;
|
||||
cur_op->bs_op->min_oid.inode = cur_op->req.sec_list.min_inode;
|
||||
cur_op->bs_op->min_oid.stripe = cur_op->req.sec_list.min_stripe;
|
||||
cur_op->bs_op->max_oid.inode = cur_op->req.sec_list.max_inode;
|
||||
if (cur_op->req.sec_list.max_inode && cur_op->req.sec_list.max_stripe != UINT64_MAX)
|
||||
{
|
||||
cur_op->bs_op->max_oid.stripe = cur_op->req.sec_list.max_stripe
|
||||
? cur_op->req.sec_list.max_stripe : UINT64_MAX;
|
||||
}
|
||||
cur_op->bs_op->list_stable_limit = cur_op->req.sec_list.stable_limit;
|
||||
#ifdef OSD_STUB
|
||||
cur_op->bs_op->retval = 0;
|
||||
cur_op->bs_op->buf = NULL;
|
||||
|
@@ -150,7 +150,6 @@ int connect_osd(const char *osd_address, int osd_port)
|
||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||
{
|
||||
perror("connect");
|
||||
close(connect_fd);
|
||||
return -1;
|
||||
}
|
||||
int one = 1;
|
||||
|
@@ -3,9 +3,9 @@
|
||||
|
||||
#include "pg_states.h"
|
||||
|
||||
const int pg_state_bit_count = 14;
|
||||
const int pg_state_bit_count = 16;
|
||||
|
||||
const int pg_state_bits[14] = {
|
||||
const int pg_state_bits[16] = {
|
||||
PG_STARTING,
|
||||
PG_PEERING,
|
||||
PG_INCOMPLETE,
|
||||
@@ -14,15 +14,17 @@ const int pg_state_bits[14] = {
|
||||
PG_STOPPING,
|
||||
PG_OFFLINE,
|
||||
PG_DEGRADED,
|
||||
PG_HAS_CORRUPTED,
|
||||
PG_HAS_INCOMPLETE,
|
||||
PG_HAS_DEGRADED,
|
||||
PG_HAS_MISPLACED,
|
||||
PG_HAS_UNCLEAN,
|
||||
PG_HAS_INVALID,
|
||||
PG_LEFT_ON_DEAD,
|
||||
PG_SCRUBBING,
|
||||
};
|
||||
|
||||
const char *pg_state_names[14] = {
|
||||
const char *pg_state_names[16] = {
|
||||
"starting",
|
||||
"peering",
|
||||
"incomplete",
|
||||
@@ -31,10 +33,12 @@ const char *pg_state_names[14] = {
|
||||
"stopping",
|
||||
"offline",
|
||||
"degraded",
|
||||
"has_corrupted",
|
||||
"has_incomplete",
|
||||
"has_degraded",
|
||||
"has_misplaced",
|
||||
"has_unclean",
|
||||
"has_invalid",
|
||||
"left_on_dead",
|
||||
"scrubbing",
|
||||
};
|
||||
|
@@ -22,7 +22,9 @@
|
||||
#define PG_HAS_MISPLACED (1<<10)
|
||||
#define PG_HAS_UNCLEAN (1<<11)
|
||||
#define PG_HAS_INVALID (1<<12)
|
||||
#define PG_LEFT_ON_DEAD (1<<13)
|
||||
#define PG_HAS_CORRUPTED (1<<13)
|
||||
#define PG_LEFT_ON_DEAD (1<<14)
|
||||
#define PG_SCRUBBING (1<<15)
|
||||
|
||||
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
|
||||
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
|
||||
@@ -32,6 +34,8 @@
|
||||
#define OBJ_DEGRADED 0x02
|
||||
#define OBJ_INCOMPLETE 0x04
|
||||
#define OBJ_MISPLACED 0x08
|
||||
// OBJ_CORRUPTED is always set with one of OBJ_INCOMPLETE/OBJ_DEGRADED/OBJ_MISPLACED
|
||||
#define OBJ_CORRUPTED 0x10
|
||||
#define OBJ_NEEDS_STABLE 0x10000
|
||||
#define OBJ_NEEDS_ROLLBACK 0x20000
|
||||
|
||||
|
@@ -9,9 +9,6 @@
|
||||
#endif
|
||||
#include "qemu/osdep.h"
|
||||
#include "qemu/main-loop.h"
|
||||
#if QEMU_VERSION_MAJOR >= 8
|
||||
#include "block/block-io.h"
|
||||
#endif
|
||||
#include "block/block_int.h"
|
||||
#include "qapi/error.h"
|
||||
#include "qapi/qmp/qdict.h"
|
||||
@@ -271,13 +268,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
|
||||
}
|
||||
else
|
||||
{
|
||||
#if QEMU_VERSION_MAJOR >= 8
|
||||
aio_co_enter(bdrv_get_aio_context(bs), qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
||||
#elif QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
|
||||
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
||||
#else
|
||||
qemu_coroutine_enter(qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
|
||||
#endif
|
||||
BDRV_POLL_WHILE(bs, !task.complete);
|
||||
}
|
||||
client->image = image;
|
||||
@@ -741,13 +732,8 @@ static BlockDriver bdrv_vitastor = {
|
||||
.bdrv_parse_filename = vitastor_parse_filename,
|
||||
|
||||
.bdrv_has_zero_init = bdrv_has_zero_init_1,
|
||||
#if QEMU_VERSION_MAJOR >= 8
|
||||
.bdrv_co_get_info = vitastor_get_info,
|
||||
.bdrv_co_getlength = vitastor_getlength,
|
||||
#else
|
||||
.bdrv_get_info = vitastor_get_info,
|
||||
.bdrv_getlength = vitastor_getlength,
|
||||
#endif
|
||||
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
|
||||
.bdrv_probe_blocksizes = vitastor_probe_blocksizes,
|
||||
#endif
|
||||
|
@@ -15,7 +15,7 @@ int read_blocking(int fd, void *read_buf, size_t remaining)
|
||||
size_t done = 0;
|
||||
while (done < remaining)
|
||||
{
|
||||
ssize_t r = read(fd, read_buf, remaining-done);
|
||||
size_t r = read(fd, read_buf, remaining-done);
|
||||
if (r <= 0)
|
||||
{
|
||||
if (!errno)
|
||||
@@ -41,7 +41,7 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
|
||||
size_t done = 0;
|
||||
while (done < remaining)
|
||||
{
|
||||
ssize_t r = write(fd, write_buf, remaining-done);
|
||||
size_t r = write(fd, write_buf, remaining-done);
|
||||
if (r < 0)
|
||||
{
|
||||
if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
|
||||
|
@@ -249,3 +249,35 @@ void print_help(const char *help_text, std::string exe_name, std::string cmd, bo
|
||||
fwrite(filtered_text.data(), filtered_text.size(), 1, stdout);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
uint64_t parse_time(std::string time_str, bool *ok)
|
||||
{
|
||||
if (!time_str.length())
|
||||
{
|
||||
if (ok)
|
||||
*ok = false;
|
||||
return 0;
|
||||
}
|
||||
uint64_t mul = 1;
|
||||
char type_char = tolower(time_str[time_str.length()-1]);
|
||||
if (type_char == 's' || type_char == 'm' || type_char == 'h' || type_char == 'd' || type_char == 'y')
|
||||
{
|
||||
if (type_char == 's')
|
||||
mul = 1;
|
||||
else if (time_str[time_str.length()-1] == 'M')
|
||||
mul = 30*86400;
|
||||
else if (type_char == 'm')
|
||||
mul = 60;
|
||||
else if (type_char == 'h')
|
||||
mul = 3600;
|
||||
else if (type_char == 'd')
|
||||
mul = 86400;
|
||||
else /*if (type_char == 'y')*/
|
||||
mul = 86400*365;
|
||||
time_str = time_str.substr(0, time_str.length()-1);
|
||||
}
|
||||
uint64_t ts = stoull_full(time_str, 0) * mul;
|
||||
if (ok)
|
||||
*ok = !(ts == 0 && time_str != "0" && (time_str != "" || mul != 1));
|
||||
return ts;
|
||||
}
|
||||
|
@@ -15,3 +15,4 @@ std::string str_replace(const std::string & in, const std::string & needle, cons
|
||||
uint64_t stoull_full(const std::string & str, int base = 0);
|
||||
std::string format_size(uint64_t size, bool nobytes = false);
|
||||
void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
|
||||
uint64_t parse_time(std::string time_str, bool *ok = NULL);
|
||||
|
@@ -83,7 +83,6 @@ int connect_stub(const char *server_address, int server_port)
|
||||
if (connect(connect_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
|
||||
{
|
||||
perror("connect");
|
||||
close(connect_fd);
|
||||
return -1;
|
||||
}
|
||||
int one = 1;
|
||||
|
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
|
||||
Name: Vitastor
|
||||
Description: Vitastor client library
|
||||
Version: 0.8.8
|
||||
Version: 0.8.5
|
||||
Libs: -L${libdir} -lvitastor_client
|
||||
Cflags: -I${includedir}
|
||||
|
||||
|
@@ -64,4 +64,4 @@ echo leak:librbd >> testdata/lsan-suppress.txt
|
||||
echo leak:_M_mutate >> testdata/lsan-suppress.txt
|
||||
echo leak:_M_assign >> testdata/lsan-suppress.txt
|
||||
export LSAN_OPTIONS=report_objects=true:suppressions=`pwd`/testdata/lsan-suppress.txt
|
||||
export ASAN_OPTIONS=verify_asan_link_order=false:abort_on_error=1
|
||||
export ASAN_OPTIONS=verify_asan_link_order=false
|
||||
|
@@ -17,17 +17,17 @@ else
|
||||
fi
|
||||
|
||||
if [ "$IMMEDIATE_COMMIT" != "" ]; then
|
||||
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10"
|
||||
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
|
||||
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
|
||||
else
|
||||
NO_SAME="--journal_sector_buffer_count 1024 --log_level 10"
|
||||
NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
|
||||
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
|
||||
fi
|
||||
|
||||
start_osd()
|
||||
{
|
||||
local i=$1
|
||||
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
|
||||
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
|
||||
eval OSD${i}_PID=$!
|
||||
}
|
||||
|
||||
|
@@ -43,6 +43,3 @@ SCHEME=ec ./test_snapshot.sh
|
||||
SCHEME=xor ./test_write.sh
|
||||
|
||||
./test_write_no_same.sh
|
||||
|
||||
./test_heal.sh
|
||||
SCHEME=ec PG_MINSIZE=2 ./test_heal.sh
|
||||
|
@@ -28,7 +28,9 @@ if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only |\
|
||||
format_error "FAILED: OSD NOT ADDED INTO DISTRIBUTION"
|
||||
fi
|
||||
|
||||
wait_finish_rebalance 10
|
||||
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == '$PG_COUNT''); then
|
||||
format_error "FAILED: $PG_COUNT PGS NOT ACTIVE"
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
kill -9 $OSD4_PID
|
||||
@@ -50,6 +52,8 @@ if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only |\
|
||||
format_error "FAILED: OSD NOT REMOVED FROM DISTRIBUTION"
|
||||
fi
|
||||
|
||||
wait_finish_rebalance 10
|
||||
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"] or .state == ["active", "left_on_dead"]) ] | length) == '$PG_COUNT''); then
|
||||
format_error "FAILED: $PG_COUNT PGS NOT ACTIVE"
|
||||
fi
|
||||
|
||||
format_green OK
|
||||
|
@@ -43,7 +43,7 @@ kill_osds &
|
||||
|
||||
LD_PRELOAD="build/src/libfio_vitastor.so" \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 -fsync=256 -rw=randwrite \
|
||||
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120
|
||||
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120 2>/dev/null
|
||||
|
||||
qemu-img convert -S 4096 -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \
|
||||
|
@@ -10,7 +10,7 @@ kill -INT $OSD2_PID
|
||||
|
||||
sleep 5
|
||||
|
||||
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active", "degraded", "left_on_dead"]) ] | length == '$PG_COUNT); then
|
||||
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active", "degraded"]) ] | length == '$PG_COUNT); then
|
||||
format_error "FAILED: $PG_COUNT PG(s) NOT ACTIVE+DEGRADED"
|
||||
fi
|
||||
|
||||
|
@@ -7,7 +7,7 @@ OSD_COUNT=5
|
||||
OSD_ARGS=
|
||||
for i in $(seq 1 $OSD_COUNT); do
|
||||
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
|
||||
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
|
||||
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
|
||||
eval OSD${i}_PID=$!
|
||||
done
|
||||
|
||||
|
@@ -53,7 +53,7 @@ for i in $(seq 1 $OSD_COUNT); do
|
||||
--data_device ./testdata/test_osd$i.bin \
|
||||
--meta_offset 0 \
|
||||
--journal_offset $((1024*1024)) \
|
||||
--data_offset $((128*1024*1024)) >>./testdata/osd$i.log 2>&1 &
|
||||
--data_offset $((128*1024*1024)) &>./testdata/osd$i.log &
|
||||
eval OSD${i}_PID=$!
|
||||
done
|
||||
|
||||
|
@@ -30,7 +30,7 @@ qemu-img create -f qcow2 ./testdata/empty.qcow2 32M
|
||||
|
||||
qemu-img convert -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024)):skip-parents=1" \
|
||||
-O qcow2 -o 'cluster_size=4k,backing_fmt=qcow2' -B empty.qcow2 ./testdata/layer1.qcow2
|
||||
-O qcow2 -o 'cluster_size=4k' -B empty.qcow2 ./testdata/layer1.qcow2
|
||||
|
||||
qemu-img convert -S 4096 -p \
|
||||
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
|
||||
@@ -64,7 +64,7 @@ cmp ./testdata/merged.bin ./testdata/merged-by-tool.bin
|
||||
|
||||
# Test merge by qemu-img
|
||||
|
||||
qemu-img rebase -u -b layer0.qcow2 -F qcow2 ./testdata/layer1.qcow2
|
||||
qemu-img rebase -u -b layer0.qcow2 ./testdata/layer1.qcow2
|
||||
|
||||
qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin
|
||||
|
||||
|
@@ -21,8 +21,7 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
|
||||
# Kill OSD 2, start OSD 1
|
||||
|
||||
kill $OSD2_PID
|
||||
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL \
|
||||
$(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
|
||||
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
|
||||
sleep 2
|
||||
|
||||
# Check PG state - it should NOT become active
|
||||
|
@@ -10,7 +10,7 @@ etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/osd/state
|
||||
OSD_COUNT=3
|
||||
OSD_ARGS=
|
||||
for i in $(seq 1 $OSD_COUNT); do
|
||||
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
|
||||
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
|
||||
eval OSD${i}_PID=$!
|
||||
done
|
||||
|
||||
|
@@ -12,6 +12,6 @@ GLOBAL_CONF='{"immediate_commit":"all"}'
|
||||
# Test basic write
|
||||
|
||||
LD_PRELOAD="build/src/libfio_vitastor.so" \
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10
|
||||
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=1G -cluster_log_level=10
|
||||
|
||||
format_green OK
|
||||
|
Reference in New Issue
Block a user