Compare commits

..

1 Commits

Author SHA1 Message Date
Vitaliy Filippov 29498c9a9e WIP NFS RDMA support
Test / test_dd (push) Successful in 13s Details
Test / test_root_node (push) Successful in 8s Details
Test / test_rebalance_verify_ec (push) Successful in 1m40s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m42s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m16s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m19s Details
Test / test_heal_csum_32k (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_imm_xor (push) Successful in 12s Details
Test / test_scrub (push) Successful in 13s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m11s Details
2024-11-04 18:58:46 +03:00
60 changed files with 350 additions and 1478 deletions

View File

@ -22,7 +22,7 @@ RUN apt-get update
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \ RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'` RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN apt-get update && apt-get -y install jq lp-solve sudo nfs-common fdisk parted RUN apt-get -y install jq lp-solve sudo nfs-common
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'` RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN set -ex; \ RUN set -ex; \

View File

@ -828,42 +828,6 @@ jobs:
echo "" echo ""
done done
test_resize:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_resize.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_resize_auto:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_resize_auto.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_snapshot_pool2: test_snapshot_pool2:
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: build needs: build

View File

@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor) project(vitastor)
set(VITASTOR_VERSION "1.9.3") set(VITASTOR_VERSION "1.9.2")
add_subdirectory(src) add_subdirectory(src)

View File

@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v1.9.3 VITASTOR_VERSION ?= v1.9.2
all: build push all: build push

View File

@ -49,7 +49,7 @@ spec:
capabilities: capabilities:
add: ["SYS_ADMIN"] add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.9.3 image: vitalif/vitastor-csi:v1.9.2
args: args:
- "--node=$(NODE_ID)" - "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)" - "--endpoint=$(CSI_ENDPOINT)"

View File

@ -121,7 +121,7 @@ spec:
privileged: true privileged: true
capabilities: capabilities:
add: ["SYS_ADMIN"] add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.9.3 image: vitalif/vitastor-csi:v1.9.2
args: args:
- "--node=$(NODE_ID)" - "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)" - "--endpoint=$(CSI_ENDPOINT)"

View File

@ -5,7 +5,7 @@ package vitastor
const ( const (
vitastorCSIDriverName = "csi.vitastor.io" vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.9.3" vitastorCSIDriverVersion = "1.9.2"
) )
// Config struct fills the parameters of request or user input // Config struct fills the parameters of request or user input

2
debian/changelog vendored
View File

@ -1,4 +1,4 @@
vitastor (1.9.3-1) unstable; urgency=medium vitastor (1.9.2-1) unstable; urgency=medium
* Bugfixes * Bugfixes

View File

@ -6,150 +6,19 @@
# Architecture # Architecture
- [Server-side components](#server-side-components)
- [Basic concepts](#basic-concepts) - [Basic concepts](#basic-concepts)
- [Client-side components](#client-side-components)
- [Additional utilities](#additional-utilities)
- [Overall read/write process](#overall-read-write-process)
- [Nuances of request handling](#nuances-of-request-handling)
- [Similarities to Ceph](#similarities-to-ceph) - [Similarities to Ceph](#similarities-to-ceph)
- [Differences from Ceph](#differences-from-ceph) - [Differences from Ceph](#differences-from-ceph)
- [Implementation Principles](#implementation-principles) - [Implementation Principles](#implementation-principles)
## Server-side components
- **OSD** (Object Storage Daemon) is a process that directly works with the disk, stores data
and serves read/write requests. One OSD serves one disk (or one partition). OSDs talk to etcd
and to each other — they receive cluster state from etcd, and send read/write requests for
secondary copies of data to other OSDs.
- **etcd** — clustered key/value database, used as a reliable storage for configuration
and high-level cluster state. Etcd is the component that prevents splitbrain in the cluster.
Data blocks are not stored in etcd, etcd doesn't participate in data write or read path.
- **Monitor** — a separate node.js based daemon which monitors the cluster, calculates
required configuration changes and saves them to etcd, thus commanding OSDs to apply these
changes. Monitor also aggregates cluster statistics. OSDs don't talk to the monitor; the monitor
only sends data to and receives data from etcd.
## Basic concepts ## Basic concepts
- **Pool** is a container for data that has equal redundancy scheme and disk placement rules. - OSD (Object Storage Daemon) is a process that stores data and serves read/write requests.
- **PG (Placement Group)** is a "shard" of the cluster, subdivision unit that has its own - PG (Placement Group) is a "shard" of the cluster, group of data stored on one set of replicas.
set of OSDs for data storage. - Pool is a container for data that has equal redundancy scheme and placement rules.
- **Failure Domain** is a group of OSDs, from the simultaneous failure of which you are - Monitor is a separate daemon that watches cluster state and handles failures.
protected by Vitastor. Default failure domain is "host" (server), but you can choose a - Failure Domain is a group of OSDs that you allow to fail. It's "host" by default.
larger (for example, a rack of servers) or smaller (a single drive) failure domain - Placement Tree groups OSDs in a hierarchy to later split them into Failure Domains.
for every pool.
- **Placement Tree** (similar to Ceph CRUSH Tree) groups OSDs in a hierarchy to later
split them into Failure Domains.
## Client-side components
- **Client library** encapsulates client I/O logic. Client library connects to etcd and to all OSDs,
receives cluster state from etcd, sends read and write requests directly to all OSDs. Due
to the symmetric distributed architecture, all data blocks (each 128 KB by default) are placed
to different OSDs, but clients always know where each data block is stored and connect directly
to the right OSD.
All other client-side components are based on the client library:
- **[vitastor-cli](../usage/cli.en.md)** — command-line utility for cluster management.
Allows to view cluster state, manage pools and images, i.e. create, modify and remove
virtual disks, their snapshots and clones.
- **[QEMU driver](../usage/qemu.en.md)** — pluggable QEMU module allowing QEMU/KVM virtual
machines to work with virtual Vitastor disks directly from userspace through the client library,
without the need to attach disks as kernel block devices. However, if you want to attach
disks, you can also do that with the same driver and [VDUSE](../usage/qemu.en.md#vduse).
- **[vitastor-nbd](../usage/nbd.en.md)** — utility that allows to attach Vitastor disks as
kernel block devices using NBD (Network Block Device), which works more like "BUSE"
(Block Device In Userspace). Vitastor doesn't have Linux kernel modules for the same task
(at least by now). NBD is an older, non-recommended way to attach disks — you should use
VDUSE whenever you can.
- **[CSI driver](../installation/kubernetes.en.md)** — driver for attaching Vitastor images
as Kubernetes persistent volumes. Works through VDUSE (when available) or NBD — images are
attached as kernel block devices and mounted into containers.
- **Drivers for Proxmox, OpenStack and so on** — pluggable modules for corresponding systems,
allowing to use Vitastor as storage in them.
- **[vitastor-nfs](../usage/nfs.en.md)** — NFS 3.0 server allowing export of two file system variants:
the first is a simplified pseudo-FS for file-based access to Vitastor block images (for non-QEMU
hypervisors with NFS support), the second is **VitastorFS**, full-featured clustered POSIX FS.
Both variants support parallel access from multiple vitastor-nfs servers. In fact, you are
not required to setup separate NFS servers at all and use vitastor-nfs mount command on every
client node — it starts the NFS server and mounts the FS locally.
- **[fio driver](../usage/fio.en.md)** — pluggable module for fio disk benchmarking tool for
running performance tests on your Vitastor cluster.
- **vitastor-kv** — client for a key-value DB working over shared block volumes (usual
vitastor images). VitastorFS metadata is stored in vitastor-kv.
## Additional utilities
- **vitastor-disk** — a Vitastor OSD disk management tool. You can create, remove,
resize and move OSD partitions with it.
## Overall read/write process
- Vitastor stores virtual disks, also named "images" or "inodes".
- Each image is stored in some pool. Pool specifies storage parameters such as redundancy
scheme (replication or EC — erasure codes, i.e. error correction codes), failure domain
and restrictions on OSD selection for image data placement. See [Pool configuration](../config/pool.en.md) for details.
- Each image is split into objects/blocks of fixed size, equal to [block_size](../config/layout-cluster.en.md#block_size)
(128 KB by default), multiplied by data part count for EC or 1 for replicas. That is,
if a pool uses EC 4+2 coding scheme (4 data parts + 2 parity parts), then, with the
default block_size, images are split into 512 KB objects.
- Client read/write requests are split into parts at object boundaries.
- Each object is mapped to a PG number it belongs to, by simply taking a remainder of
division of its offset by PG count of the image's pool.
- Client reads primary OSD for all PGs from etcd. Primary OSD for each PG is assigned
by the monitor during cluster operation, along with the full PG OSD set.
- If not already connected, client connects to primary OSDs of all PGs involved in a
read/write request and sends parts of the request to them.
- If a primary OSD is unavailable, client retries connection attempts indefinitely
either until it becomes available or until the monitor assigns another OSD as primary
for that PG.
- Client also retries requests if the primary OSD replies with error code EPIPE, meaning
that the PG is inactive at this OSD at the moment - for example, when the primary OSD
is switched, or if the primary OSD itself loses connection to replicas during request
handling.
- Primary OSD determines where the parts of the object are stored. By default, all objects
are assumed to be stored at the target OSD set of a PG, but some of them may be present
at a different OSD set if they are degraded or moved, or if the data rebalancing process
is active. The primary OSD doesn't do any network requests for this: it calculates the locations of all objects
during PG activation and stores them in memory.
- Primary OSD handles the request locally when it can - for example, when it's a read
from a replicated pool or when it's a read from an EC pool involving only one data part
stored on the OSD's local disk.
- When a request requires reads or writes to additional OSDs, primary OSD uses already
established connections to secondary OSDs of the PG to execute these requests. This happens
in parallel to local disk operations. All such connections are guaranteed to be already
established when the PG is active, and if any of them is dropped, PG is restarted and
all current read/write operations to it fail with EPIPE error and are retried by clients.
- After completing all secondary read/write requests, primary OSD sends the response to
the client.
### Nuances of request handling
- If a pool uses erasure codes and some of the OSDs are unavailable, primary OSDs recover
data from the remaining parts during read.
- Each object has a version number. During write, primary OSD first determines the current
version of the object. As primary OSD usually stores the object or its part itself, most
of the time version is read from the memory of the OSD itself. However, if primary OSD
doesn't contain parts of the object, it requests the version number from a secondary OSD
which has that part. Such request still doesn't involve reading from the disk though,
because object metadata, including version number, is always stored in OSD memory.
- If a pool uses erasure codes, partial writes of an object require reading other parts of
it from secondary OSDs or from the local disk of the primary OSD itself. This is called
"read-modify-write" process.
- If a pool uses erasure codes, two-phase write process is used to get rid of the Write Hole
problem: first a new version of object parts is written to all secondary OSDs without
removing the previous version, and then, after receiving successful write confirmations
from all OSDs, new version is committed and the old one is allowed to be removed.
- If a pool doesn't use immediate_commit mode, then write requests sent by clients aren't
treated as committed to physical media instantly. Clients have to send a separate type of
request (SYNC) to commit changes, and until it is sent, new versions of data are
allowed to be lost if some OSDs die. Thus, when immediate_commit is disabled, clients
store copies of all write requests in memory and repeat them from there when the
connection to primary OSD is lost. This in-memory copy is removed after a successful
SYNC, and to prevent excessive memory usage, clients also do an automatic SYNC
every [client_dirty_limit](../config/network.en.md#client_dirty_limit) written bytes.
## Similarities to Ceph ## Similarities to Ceph

View File

@ -11,7 +11,6 @@
- [Серверные компоненты](#серверные-компоненты) - [Серверные компоненты](#серверные-компоненты)
- [Базовые понятия](#базовые-понятия) - [Базовые понятия](#базовые-понятия)
- [Клиентские компоненты](#клиентские-компоненты) - [Клиентские компоненты](#клиентские-компоненты)
- [Дополнительные утилиты](#дополнительные-утилиты)
- [Общий процесс записи и чтения](#общий-процесс-записи-и-чтения) - [Общий процесс записи и чтения](#общий-процесс-записи-и-чтения)
- [Особенности обработки запросов](#особенности-обработки-запросов) - [Особенности обработки запросов](#особенности-обработки-запросов)
- [Схожесть с Ceph](#схожесть-с-ceph) - [Схожесть с Ceph](#схожесть-с-ceph)
@ -35,9 +34,8 @@
- **Пул (Pool)** — контейнер для данных, имеющих одну и ту же схему избыточности и правила распределения по OSD. - **Пул (Pool)** — контейнер для данных, имеющих одну и ту же схему избыточности и правила распределения по OSD.
- **PG (Placement Group)** — "шард", единица деления пулов в кластере, которой назначается свой набор - **PG (Placement Group)** — "шард", единица деления пулов в кластере, которой назначается свой набор
OSD для хранения данных (копий или частей объектов). OSD для хранения данных (копий или частей объектов).
- **Домен отказа (Failure Domain)** — группа OSD, от одновременного падения которых должен защищать - **Домен отказа (Failure Domain)** — группа OSD, одновременное падение которых рассматривается
Vitastor. По умолчанию домен отказа — "host" (сервер), но вы можете установить для пула как больший как вероятное. По умолчанию это "host" (сервер).
домен отказа (например, стойку серверов), так и меньший (например, отдельный диск).
- **Дерево распределения** (Placement Tree, в Ceph CRUSH Tree) — иерархическая группировка OSD - **Дерево распределения** (Placement Tree, в Ceph CRUSH Tree) — иерархическая группировка OSD
в узлы, которые далее можно использовать как домены отказа. в узлы, которые далее можно использовать как домены отказа.
@ -51,39 +49,25 @@
На базе клиентской библиотеки реализованы все остальные клиенты: На базе клиентской библиотеки реализованы все остальные клиенты:
- **[vitastor-cli](../usage/cli.ru.md)** — утилита командной строки для управления кластером. - **vitastor-cli** — утилита командной строки для управления кластером. В данный момент позволяет
Позволяет просматривать общее состояние кластера, управлять пулами и образами — то есть просматривать общее состояние кластера и управлять образами — т.е. создавать, менять и удалять
создавать, менять и удалять виртуальные диски, их снимки и клоны. виртуальные диски, их снимки и клоны.
- **[Драйвер QEMU](../usage/qemu.ru.md)** — подключаемый модуль QEMU, позволяющий QEMU/KVM - **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
виртуальным машинам работать с виртуальными дисками Vitastor напрямую из пространства пользователя с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
с помощью клиентской библиотеки, без необходимости подключения дисков в виде блочных устройств библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
Linux. Если, однако, вы хотите подключать диски в виде блочных устройств, то вы тоже можете позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
сделать это с помощью того же самого драйвера и [VDUSE](../usage/qemu.ru.md#vduse). - **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
- **[vitastor-nbd](../usage/nbd.ru.md)** — утилита, позволяющая монтировать образы Vitastor с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
в виде блочных устройств с помощью NBD (Network Block Device), на самом деле скорее работающего (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
как "BUSE" (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в (по крайней мере, пока).
Vitastor нет (по крайней мере, пока). NBD — более старый и нерекомендуемый способ подключения - **CSI драйвер** — драйвер для подключения Vitastor-образов в виде персистентных томов (PV) Kubernetes.
дисков — вам следует использовать VDUSE всегда, когда это возможно. Работает через vitastor-nbd — образы отражаются в виде блочных устройств и монтируются
- **[CSI драйвер](../installation/kubernetes.ru.md)** — драйвер для подключения Vitastor-образов в контейнеры.
в виде персистентных томов (PV) Kubernetes. Работает через VDUSE (если доступно) или через
NBD — образы отражаются в виде блочных устройств и монтируются в контейнеры.
- **Драйвера Proxmox, OpenStack и т.п.** — подключаемые модули для соответствующих систем, - **Драйвера Proxmox, OpenStack и т.п.** — подключаемые модули для соответствующих систем,
позволяющие использовать Vitastor как хранилище в оных. позволяющие использовать Vitastor как хранилище в оных.
- **[vitastor-nfs](../usage/nfs.ru.md)** — NFS 3.0 сервер, предоставляющий два варианта файловой системы: - **vitastor-nfs** — утилита, предоставляющая файловый доступ к образам в кластере Vitastor
первая — упрощённая для файлового доступа к блочным образам (для не-QEMU гипервизоров, поддерживающих NFS), по протоколу NFS 3.0. Предназначена для гипервизоров, не основанных на QEMU и Linux, но при
вторая — VitastorFS, полноценная кластерная POSIX ФС. Оба варианта поддерживают параллельный этом поддерживающих NFS.
доступ с нескольких vitastor-nfs серверов. На самом деле можно вообще не выделять
отдельные NFS-серверы, а вместо этого использовать команду vitastor-nfs mount, запускающую
NFS-сервер прямо на клиентской машине и монтирующую ФС локально.
- **[Драйвер fio](../usage/fio.ru.md)** — подключаемый модуль для утилиты тестирования
производительности дисков fio, позволяющий тестировать Vitastor-кластеры.
- **vitastor-kv** — клиент для key-value базы данных, работающей поверх разделяемого блочного
образа (обычного блочного образа vitastor). Метаданные VitastorFS хранятся именно в vitastor-kv.
## Дополнительные утилиты
- **vitastor-disk** — утилита для разметки дисков под Vitastor OSD. С её помощью можно
создавать, удалять, менять размеры или перемещать разделы OSD.
## Общий процесс записи и чтения ## Общий процесс записи и чтения
@ -114,22 +98,16 @@
находиться на других OSD, если эти объекты деградированы или перемещены, или идёт процесс находиться на других OSD, если эти объекты деградированы или перемещены, или идёт процесс
ребаланса. Запросы для проверки по сети не отправляются, информация о местоположении всех ребаланса. Запросы для проверки по сети не отправляются, информация о местоположении всех
объектов рассчитывается первичным OSD при активации PG и хранится в памяти. объектов рассчитывается первичным OSD при активации PG и хранится в памяти.
- Когда это возможно, первичный OSD обрабатывает запрос локально. Например, так происходит - Первичный OSD соединяется (если ещё не соединён) с вторичными OSD, на которых располагаются
при чтениях объектов из пулов с репликацией или при чтении из EC пула, затрагивающего части объекта, и отправляет им запросы чтения/записи, а также читает/пишет из/в своё локальное
только часть, хранимую на диске самого первичного OSD. хранилище, если сам входит в набор.
- Когда запрос требует записи или чтения с вторичных OSD, первичный OSD использует заранее
установленные соединения с ними для выполнения этих запросов. Это происходит параллельно
локальным операциям чтения/записи с диска самого OSD. Так как соединения к вторичным OSD PG
устанавливаются при её запуске, то они уже гарантированно установлены, когда PG активна,
и если любое из этих соединений отключается, PG перезапускается, а все текущие запросы чтения
и записи в неё завершаются с ошибкой EPIPE, после чего повторяются клиентами.
- После завершения всех вторичных операций чтения/записи первичный OSD отправляет ответ клиенту. - После завершения всех вторичных операций чтения/записи первичный OSD отправляет ответ клиенту.
### Особенности обработки запросов ### Особенности обработки запросов
- Если в пуле используются коды коррекции ошибок и при этом часть OSD недоступна, первичный - Если в пуле используются коды коррекции ошибок и при этом часть OSD недоступна, первичный
OSD при чтении восстанавливает данные из оставшихся частей. OSD при чтении восстанавливает данные из оставшихся частей.
- Каждый объект имеет номер версии. При записи объекта первичный OSD сначала получает номер - Каждый объект имеет номер версии. При записи объекта первичный OSD сначала читает номер
версии объекта. Так как первичный OSD обычно сам хранит копию или часть объекта, номер версии объекта. Так как первичный OSD обычно сам хранит копию или часть объекта, номер
версии обычно читается из памяти самого OSD. Однако, если ни одна часть обновляемого объекта версии обычно читается из памяти самого OSD. Однако, если ни одна часть обновляемого объекта
не находится на первичном OSD, для получения номера версии он обращается к одному из вторичных не находится на первичном OSD, для получения номера версии он обращается к одному из вторичных
@ -137,20 +115,20 @@
так как метаданные объектов, включая номер версии, все OSD хранят в памяти. так как метаданные объектов, включая номер версии, все OSD хранят в памяти.
- Если в пуле используются коды коррекции ошибок, перед частичной записью объекта для вычисления - Если в пуле используются коды коррекции ошибок, перед частичной записью объекта для вычисления
чётности зачастую требуется чтение частей объекта с вторичных OSD или с локального диска чётности зачастую требуется чтение частей объекта с вторичных OSD или с локального диска
самого первичного OSD. Это называется процессом "чтение-модификация-запись" (read-modify-write). самого первичного OSD.
- Если в пуле используются коды коррекции ошибок, для закрытия Write Hole применяется - Также, если в пуле используются коды коррекции ошибок, для закрытия Write Hole применяется
двухфазный алгоритм записи: сначала на все вторичные OSD записывается новая версия частей двухфазный алгоритм записи: сначала на все вторичные OSD записывается новая версия частей
объекта, но при этом старая версия не удаляется, а потом, после получения подтверждения объекта, но при этом старая версия не удаляется, а потом, после получения подтверждения
успешной записи от всех вторичных OSD, новая версия фиксируется и разрешается удаление старой. успешной записи от всех вторичных OSD, новая версия фиксируется и разрешается удаление старой.
- Если в пуле не включён режим immediate_commit, то запросы записи, отправляемые клиентами, - Если в кластере не включён режим immediate_commit, то запросы записи, отправляемые клиентами,
не считаются зафиксированными на физических накопителях сразу. Для фиксации данных клиенты не считаются зафиксированными на физических накопителях сразу. Для фиксации данных клиенты
должны отдельно отправлять запросы SYNC (отдельный от чтения и записи вид запроса), должны отдельно отправлять запросы SYNC (отдельный от чтения и записи вид запроса),
а пока такой запрос не отправлен, считается, что записанные данные могут исчезнуть, а пока такой запрос не отправлен, считается, что записанные данные могут исчезнуть,
если соответствующий OSD упадёт. Поэтому, когда режим immediate_commit отключён, все если соответствующий OSD упадёт. Поэтому, когда режим immediate_commit отключён, все
запросы записи клиенты копируют в памяти и при потере соединения и повторном соединении запросы записи клиенты копируют в памяти и при потере соединения и повторном соединении
с OSD повторяют из памяти. Скопированные в память данные удаляются при успешном SYNC, с OSD повторяют из памяти. Скопированные в память данные удаляются при успешном fsync,
а чтобы хранение этих данных не приводило к чрезмерному потреблению памяти, клиенты а чтобы хранение этих данных не приводило к чрезмерному потреблению памяти, клиенты
автоматически выполняют SYNC каждые [client_dirty_limit](../config/network.ru.md#client_dirty_limit) автоматически выполняют fsync каждые [client_dirty_limit](../config/network.ru.md#client_dirty_limit)
записанных байт. записанных байт.
## Схожесть с Ceph ## Схожесть с Ceph

View File

@ -171,14 +171,7 @@ to make them use the new version of the client library.
### 1.7.x to 1.8.0 ### 1.7.x to 1.8.0
It's recommended to upgrade from version <= 1.7.x to version >= 1.8.0 with full downtime, After upgrading version <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients
i.e. you should first stop clients and then the cluster (OSDs and monitor), because 1.8.0
includes a fix for etcd event stream inconsistency which could lead to "incomplete" objects
appearing in EC pools, and in rare cases, probably, even to data corruption during mass OSD
restarts. It doesn't mean that you WILL hit this problem if you upgrade without full downtime,
but it's better to secure yourself against it.
Also, if you upgrade from version <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients
(VMs and so on), otherwise they will hang when monitor clears old PG configuration key, (VMs and so on), otherwise they will hang when monitor clears old PG configuration key,
which happens 24 hours after upgrade. which happens 24 hours after upgrade.

View File

@ -168,14 +168,7 @@ done
### 1.7.x -> 1.8.0 ### 1.7.x -> 1.8.0
Обновляться с версий <= 1.7.x до версий >= 1.8.0 рекомендуется с полной остановкой После обновления с версий <= 1.7.x до версий >= 1.8.0, НО <= 1.9.0: перезапустите всех
сначала клиентов, а затем кластера, так как в 1.8.0 исправлена проблема (неконсистентность
потоков событий от etcd), способная приводить к появлению incomplete объектов в EC-пулах
и, хоть и редко, но даже к повреждению данных при массовых перезапусках OSD. Если вы
обновляетесь без полной остановки - это не значит, что вы обязательно столкнётесь с этой
проблемой, но лучше подстраховаться.
Также, если вы обновляетесь с версии <= 1.7.x до версии >= 1.8.0, НО <= 1.9.0: перезапустите всех
клиентов (процессы виртуальных машин можно перезапустить путём миграции на другой сервер), клиентов (процессы виртуальных машин можно перезапустить путём миграции на другой сервер),
иначе они зависнут, когда монитор удалит старый ключ конфигурации PG, что происходит через иначе они зависнут, когда монитор удалит старый ключ конфигурации PG, что происходит через
24 часа после обновления. 24 часа после обновления.

View File

@ -51,16 +51,12 @@ Options (automatic mode):
--osd_per_disk <N> --osd_per_disk <N>
Create <N> OSDs on each disk (default 1) Create <N> OSDs on each disk (default 1)
--hybrid --hybrid
Prepare hybrid (HDD+SSD, NVMe+SATA or etc) OSDs using provided devices. By default, Prepare hybrid (HDD+SSD) OSDs using provided devices. SSDs will be used for
any passed SSDs will be used for journals and metadata, HDDs will be used for data, journals and metadata, HDDs will be used for data. Partitions for journals and
but you can override this behaviour with --fast-devices option. Journal and metadata metadata will be created automatically. Whether disks are SSD or HDD is decided
partitions will be created automatically. In the default mode, SSD and HDD disks by the `/sys/block/.../queue/rotational` flag. In hybrid mode, default object
are distinguished by the `/sys/block/.../queue/rotational` flag. When HDDs are used size is 1 MB instead of 128 KB, default journal size is 1 GB instead of 32 MB,
for data in hybrid mode, default block_size is 1 MB instead of 128 KB, default journal and throttle_small_writes is enabled by default.
size is 1 GB instead of 32 MB, and throttle_small_writes is enabled by default.
--fast-devices /dev/nvmeX,/dev/nvmeY
In --hybrid mode, use these devices for journal and metadata instead of auto-detecting
and extracting them from the main [devices...] list.
--disable_data_fsync auto --disable_data_fsync auto
Disable data device cache and fsync (1/yes/true = on, default auto) Disable data device cache and fsync (1/yes/true = on, default auto)
--disable_meta_fsync auto --disable_meta_fsync auto

View File

@ -51,17 +51,12 @@ vitastor-disk - инструмент командной строки для уп
--osd_per_disk <N> --osd_per_disk <N>
Создавать по несколько (<N>) OSD на каждом диске (по умолчанию 1) Создавать по несколько (<N>) OSD на каждом диске (по умолчанию 1)
--hybrid --hybrid
Инициализировать гибридные (HDD+SSD, NVMe+SATA и т.п.) OSD на указанных дисках. Инициализировать гибридные (HDD+SSD) OSD на указанных дисках. SSD будут
По умолчанию, SSD будут использованы для журналов и метаданных, а HDD - для данных, использованы для журналов и метаданных, а HDD - для данных. Разделы для журналов
но вы можете поменять это поведение опцией --fast-devices. Разделы для журналов и метаданных будут созданы автоматически. Является ли диск SSD или HDD, определяется
и метаданных будут созданы автоматически. В режиме по умолчанию SSD и HDD-диски по флагу `/sys/block/.../queue/rotational`. В гибридном режиме по умолчанию
различаются по флагу `/sys/block/.../queue/rotational`. Когда в гибридном режиме используется размер объекта 1 МБ вместо 128 КБ, размер журнала 1 ГБ вместо 32 МБ
для данных используются HDD, по умолчанию размер блока устанавливается 1 МБ вместо и включённый throttle_small_writes.
128 КБ, размер журнала 1 ГБ вместо 32 МБ, и throttle_small_writes включается по
умолчанию.
--fast-devices /dev/nvmeX,/dev/nvmeY
Использовать данные диски для журналов и метаданных в гибридном режиме вместо их
автоопределения и извлечения из основного списка [devices...].
--disable_data_fsync auto --disable_data_fsync auto
Отключать кэш и fsync-и для устройств данных. (1/yes/true = да, по умолчанию автоопределение) Отключать кэш и fsync-и для устройств данных. (1/yes/true = да, по умолчанию автоопределение)
--disable_meta_fsync auto --disable_meta_fsync auto

View File

@ -1,6 +1,6 @@
{ {
"name": "vitastor-mon", "name": "vitastor-mon",
"version": "1.9.3", "version": "1.9.2",
"description": "Vitastor SDS monitor service", "description": "Vitastor SDS monitor service",
"main": "mon-main.js", "main": "mon-main.js",
"scripts": { "scripts": {

View File

@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver from cinder.volume import driver
from cinder.volume import volume_utils from cinder.volume import volume_utils
VITASTOR_VERSION = '1.9.3' VITASTOR_VERSION = '1.9.2'
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)

View File

@ -306,12 +306,12 @@ index e5ff653a60..884ecc79ea 100644
+ etcd = virBufferContentAndReset(&buf); + etcd = virBufferContentAndReset(&buf);
+ } + }
+ +
+ if (virJSONValueObjectAdd(&ret, + if (virJSONValueObjectCreate(&ret,
+ "S:etcd-host", etcd, + "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query, + "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile, + "S:config-path", src->configFile,
+ "s:image", src->path, + "s:image", src->path,
+ NULL) < 0) + NULL) < 0)
+ return NULL; + return NULL;
+ +
+ return ret; + return ret;

View File

@ -1,193 +0,0 @@
Index: pve-qemu-kvm-9.0.0/block/meson.build
===================================================================
--- pve-qemu-kvm-9.0.0.orig/block/meson.build
+++ pve-qemu-kvm-9.0.0/block/meson.build
@@ -126,6 +126,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: pve-qemu-kvm-9.0.0/meson.build
===================================================================
--- pve-qemu-kvm-9.0.0.orig/meson.build
+++ pve-qemu-kvm-9.0.0/meson.build
@@ -1452,6 +1452,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2254,6 +2274,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4454,6 +4475,7 @@ summary_info += {'fdt support': fd
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
Index: pve-qemu-kvm-9.0.0/meson_options.txt
===================================================================
--- pve-qemu-kvm-9.0.0.orig/meson_options.txt
+++ pve-qemu-kvm-9.0.0/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
Index: pve-qemu-kvm-9.0.0/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-9.0.0.orig/qapi/block-core.json
+++ pve-qemu-kvm-9.0.0/qapi/block-core.json
@@ -3481,7 +3481,7 @@
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4591,6 +4591,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -5053,6 +5075,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5498,6 +5521,20 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5719,6 +5753,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: pve-qemu-kvm-9.0.0/scripts/ci/org.centos/stream/8/x86_64/configure
===================================================================
--- pve-qemu-kvm-9.0.0.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ pve-qemu-kvm-9.0.0/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -30,7 +30,7 @@
--with-suffix="qemu-kvm" \
--firmwarepath=/usr/share/qemu-firmware \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -176,6 +176,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
Index: pve-qemu-kvm-9.0.0/scripts/meson-buildoptions.sh
===================================================================
--- pve-qemu-kvm-9.0.0.orig/scripts/meson-buildoptions.sh
+++ pve-qemu-kvm-9.0.0/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -445,6 +446,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@ -1,172 +0,0 @@
diff --git a/block/meson.build b/block/meson.build
index f1262ec2ba..3cf3e23f16 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -114,6 +114,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index fbda17c987..3edac22aff 100644
--- a/meson.build
+++ b/meson.build
@@ -1510,6 +1510,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2351,6 +2371,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4510,6 +4531,7 @@ summary_info += {'fdt support': fdt_opt == 'internal' ? 'internal' : fdt}
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 0269fa0f16..4740ffdc27 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index aa40d44f1d..bbee6a0e9c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3203,7 +3203,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4286,6 +4286,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4742,6 +4764,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5183,6 +5206,20 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5405,6 +5442,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index c97079a38c..4623f552ec 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' qpl Query Processing Library support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -444,6 +445,8 @@ _meson_option_parse() {
--disable-qpl) printf "%s" -Dqpl=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 1.9.3 Version: 1.9.2
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-1.9.3.el7.tar.gz Source0: vitastor-1.9.2.el7.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel

View File

@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 1.9.3 Version: 1.9.2
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-1.9.3.el8.tar.gz Source0: vitastor-1.9.2.el8.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel

View File

@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 1.9.3 Version: 1.9.2
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-1.9.3.el9.tar.gz Source0: vitastor-1.9.2.el9.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel

View File

@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif() endif()
add_definitions(-DVITASTOR_VERSION="1.9.3") add_definitions(-DVITASTOR_VERSION="1.9.2")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src) add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer) add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN}) if (${WITH_ASAN})

View File

@ -176,7 +176,7 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
exit(1); exit(1);
} }
if (!local_ips.size()) if (!local_ips.size())
local_ips = getifaddr_list(std::vector<std::string>(), true); local_ips = getifaddr_list();
std::string check_addr; std::string check_addr;
int pos = addr.find('/'); int pos = addr.find('/');
int pos2 = addr.find(':'); int pos2 = addr.find(':');

View File

@ -62,7 +62,6 @@ struct http_co_t
inline void end() { ended = true; if (!onstack) { delete this; } } inline void end() { ended = true; if (!onstack) { delete this; } }
void run_cb_and_clear(); void run_cb_and_clear();
void start_connection(); void start_connection();
void start_ws_connection();
void close_connection(); void close_connection();
void next_request(); void next_request();
void handle_events(); void handle_events();
@ -113,7 +112,7 @@ http_co_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, cons
handler->keepalive = false; handler->keepalive = false;
handler->request = request; handler->request = request;
handler->response_callback = response_callback; handler->response_callback = response_callback;
handler->start_ws_connection(); handler->start_connection();
return handler; return handler;
} }
@ -283,27 +282,6 @@ void http_co_t::close_connection()
epoll_events = 0; epoll_events = 0;
} }
// Begin a websocket connection: start the underlying connection and, when
// request_timeout is positive, arm a timer that fails the request if the
// handler has not reached the HTTP_CO_WEBSOCKET state by the time it fires.
void http_co_t::start_ws_connection()
{
// NOTE(review): stackin()/stackout() bracket every entry point here; presumably they
// keep the handler from being deleted while it is still executing - confirm against their definition
stackin();
start_connection();
if (request_timeout > 0)
{
// The timer id is remembered in timeout_id.
// NOTE(review): the `false` argument presumably means "do not repeat" - confirm against timerfd_manager_t
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
{
stackin();
// Only report a timeout when the websocket state was not reached in time
if (state != HTTP_CO_WEBSOCKET)
{
close_connection();
// Hand the error to the response callback through the parsed response
parsed = { .error = "Websocket connection timed out" };
run_cb_and_clear();
}
stackout();
});
}
stackout();
}
void http_co_t::start_connection() void http_co_t::start_connection()
{ {
stackin(); stackin();

View File

@ -121,7 +121,7 @@ void osd_messenger_t::init()
if (use_rdma) if (use_rdma)
{ {
rdma_context = msgr_rdma_context_t::create( rdma_context = msgr_rdma_context_t::create(
osd_networks, rdma_device != "" ? rdma_device.c_str() : NULL, rdma_device != "" ? rdma_device.c_str() : NULL,
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
); );
if (!rdma_context) if (!rdma_context)
@ -266,8 +266,7 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value(); this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
if (!this->rdma_port_num) if (!this->rdma_port_num)
this->rdma_port_num = 1; this->rdma_port_num = 1;
if (!config["rdma_gid_index"].is_null()) this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value(); this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
this->rdma_max_sge = config["rdma_max_sge"].uint64_value(); this->rdma_max_sge = config["rdma_max_sge"].uint64_value();
if (!this->rdma_max_sge) if (!this->rdma_max_sge)
@ -282,15 +281,6 @@ void osd_messenger_t::parse_config(const json11::Json & config)
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024) if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024; this->rdma_max_msg = 129*1024;
this->rdma_odp = config["rdma_odp"].bool_value(); this->rdma_odp = config["rdma_odp"].bool_value();
std::vector<std::string> mask;
if (config["bind_address"].is_string())
mask.push_back(config["bind_address"].string_value());
else if (config["osd_network"].is_string())
mask.push_back(config["osd_network"].string_value());
else
for (auto v: config["osd_network"].array_items())
mask.push_back(v.string_value());
this->osd_networks = mask;
#endif #endif
if (!osd_num) if (!osd_num)
this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value(); this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value();

View File

@ -165,9 +165,8 @@ protected:
#ifdef WITH_RDMA #ifdef WITH_RDMA
bool use_rdma = true; bool use_rdma = true;
std::vector<std::string> osd_networks;
std::string rdma_device; std::string rdma_device;
uint64_t rdma_port_num = 1, rdma_gid_index = -1, rdma_mtu = 0; uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
msgr_rdma_context_t *rdma_context = NULL; msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0; uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0; uint64_t rdma_max_msg = 0;
@ -178,7 +177,7 @@ protected:
std::vector<int> read_ready_clients; std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients; std::vector<int> write_ready_clients;
// We don't use ringloop->set_immediate here because we may have no ringloop in client :) // We don't use ringloop->set_immediate here because we may have no ringloop in client :)
std::vector<osd_op_t*> set_immediate_ops; std::vector<std::function<void()>> set_immediate;
public: public:
timerfd_manager_t *tfd; timerfd_manager_t *tfd;
@ -238,8 +237,6 @@ protected:
void handle_op_hdr(osd_client_t *cl); void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl); bool handle_reply_hdr(osd_client_t *cl);
void handle_reply_ready(osd_op_t *op); void handle_reply_ready(osd_op_t *op);
void handle_immediate_ops();
void clear_immediate_ops(int peer_fd);
#ifdef WITH_RDMA #ifdef WITH_RDMA
void try_send_rdma(osd_client_t *cl); void try_send_rdma(osd_client_t *cl);

View File

@ -3,7 +3,6 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include "addr_util.h"
#include "msgr_rdma.h" #include "msgr_rdma.h"
#include "messenger.h" #include "messenger.h"
@ -70,126 +69,7 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
send_out_size = 0; send_out_size = 0;
} }
static bool is_ipv4_gid(ibv_gid_entry *gidx) msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
{
return (((uint64_t*)gidx->gid.raw)[0] == 0 &&
((uint32_t*)gidx->gid.raw)[2] == 0xffff0000);
}
// Check whether a GID table entry belongs to one of the given networks.
//
// gidx     - GID entry to test
// networks - array of parsed network masks
// nnet     - number of elements in networks
//
// Returns true only for a non-empty RoCE v1/v2 GID whose address matches a
// network of the same family: IPv4-mapped GIDs are compared against AF_INET
// masks using the last 4 bytes of the GID, all other GIDs are compared against
// AF_INET6 masks using the whole 16 bytes.
static bool match_gid(ibv_gid_entry *gidx, addr_mask_t *networks, int nnet)
{
    const bool roce_gid =
        gidx->gid_type == IBV_GID_TYPE_ROCE_V1 ||
        gidx->gid_type == IBV_GID_TYPE_ROCE_V2;
    const bool zero_gid =
        ((uint64_t*)gidx->gid.raw)[0] == 0 &&
        ((uint64_t*)gidx->gid.raw)[1] == 0;
    // Entries of other types and all-zero (unused) entries never match
    if (!roce_gid || zero_gid)
    {
        return false;
    }
    const bool v4 = is_ipv4_gid(gidx);
    for (int n = 0; n < nnet; n++)
    {
        addr_mask_t & net = networks[n];
        if (v4)
        {
            // The IPv4 address occupies the last 4 bytes of an IPv4-mapped GID
            if (net.family == AF_INET && cidr_match(*(in_addr*)(gidx->gid.raw+12), net.ipv4, net.bits))
                return true;
        }
        else if (net.family == AF_INET6 && cidr6_match(*(in6_addr*)gidx->gid.raw, net.ipv6, net.bits))
        {
            return true;
        }
    }
    return false;
}
// Result of the RDMA device auto-selection (see match_device()).
// A negative dev means that no matching device was found.
struct matched_dev
{
// Index of the selected device in the device list
int dev = -1;
// Selected port number (ports are enumerated starting from 1)
int port = -1;
// Index of the selected entry in the port's GID table
int gid = -1;
// true when the selected GID has the IBV_GID_TYPE_ROCE_V2 type
bool rocev2 = false;
};
// Print the automatically selected RDMA device, port and GID to stderr.
//
// dev       - selected device (only its name is printed)
// ib_port   - selected port number
// gid_index - selected GID table index
// gidx      - GID entry itself; its type and address are printed
//
// The address is printed as IPv4 when the GID is IPv4-mapped (first 8 bytes
// zero, followed by the 0xffff0000 word), otherwise as IPv6.
static void log_rdma_dev_port_gid(ibv_device *dev, int ib_port, int gid_index, ibv_gid_entry & gidx)
{
    auto raw = gidx.gid.raw;
    const bool v4_mapped = ((uint64_t*)raw)[0] == 0 && ((uint32_t*)raw)[2] == 0xffff0000;
    // For IPv4-mapped GIDs only the trailing 4 bytes hold the address
    const int family = v4_mapped ? AF_INET : AF_INET6;
    const void *addr = v4_mapped ? raw+12 : raw;
    char text[256];
    inet_ntop(family, addr, text, sizeof(text));
    const int roce_version = (gidx.gid_type == IBV_GID_TYPE_ROCE_V2 ? 2 : 1);
    const int ip_version = (v4_mapped ? 4 : 6);
    fprintf(
        stderr, "Auto-selected RDMA device %s port %d GID %d - ROCEv%d IPv%d %s\n",
        ibv_get_device_name(dev), ib_port, gid_index, roce_version, ip_version, text
    );
}
// Find an RDMA device, port and GID whose address belongs to one of the given networks.
//
// dev_list  - NULL-terminated array of RDMA devices
// networks  - array of parsed network masks to match GIDs against
// nnet      - number of elements in networks
// log_level - when > 0, the selected device/port/GID is printed to stderr
//
// Returns the indexes of the selected device, port and GID; dev stays -1 when
// nothing matches. RoCEv2 GIDs are preferred: the scan stops after the first
// device that yields a RoCEv2 match, and a non-RoCEv2 match is replaced by any
// later match.
//
// A device which can't be opened is reported and skipped. A device whose
// query fails is reported and the rest of its ports/GIDs are skipped.
static matched_dev match_device(ibv_device **dev_list, addr_mask_t *networks, int nnet, int log_level)
{
    matched_dev best;
    ibv_device_attr attr;
    ibv_port_attr portinfo;
    // Zero-initialized so that it never holds indeterminate data
    ibv_gid_entry best_gidx = {};
    int res;
    for (int i = 0; dev_list[i]; ++i)
    {
        auto dev = dev_list[i];
        ibv_context *context = ibv_open_device(dev);
        if (!context)
        {
            // ibv_open_device() returns NULL on failure; a NULL context must not
            // be passed to ibv_query_device() or ibv_close_device()
            fprintf(stderr, "Couldn't open RDMA device %s\n", ibv_get_device_name(dev));
            continue;
        }
        if ((res = ibv_query_device(context, &attr)) != 0)
        {
            fprintf(stderr, "Couldn't query RDMA device %s for its features: %s\n", ibv_get_device_name(dev), strerror(res));
            goto cleanup;
        }
        // Port numbers start from 1
        for (int j = 1; j <= attr.phys_port_cnt; j++)
        {
            // Try to find a port with matching address
            if ((res = ibv_query_port(context, j, &portinfo)) != 0)
            {
                fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(dev), j, strerror(res));
                goto cleanup;
            }
            for (int k = 0; k < portinfo.gid_tbl_len; k++)
            {
                ibv_gid_entry gidx;
                if ((res = ibv_query_gid_ex(context, j, k, &gidx, 0)) != 0)
                {
                    if (res != ENODATA)
                    {
                        fprintf(stderr, "Couldn't read RDMA device %s GID index %d: %s\n", ibv_get_device_name(dev), k, strerror(res));
                        goto cleanup;
                    }
                    else
                    {
                        // ENODATA: stop scanning this port's GID table
                        break;
                    }
                }
                if (match_gid(&gidx, networks, nnet))
                {
                    // Prefer RoCEv2
                    if (!best.rocev2)
                    {
                        best.dev = i;
                        best.port = j;
                        best.gid = k;
                        best.rocev2 = (gidx.gid_type == IBV_GID_TYPE_ROCE_V2);
                        best_gidx = gidx;
                    }
                }
            }
        }
cleanup:
        ibv_close_device(context);
        if (best.rocev2)
        {
            // A RoCEv2 match can't be improved, stop looking at other devices
            break;
        }
    }
    if (best.dev >= 0 && log_level > 0)
    {
        log_rdma_dev_port_gid(dev_list[best.dev], best.port, best.gid, best_gidx);
    }
    return best;
}
msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
{ {
int res; int res;
ibv_device **dev_list = NULL; ibv_device **dev_list = NULL;
@ -200,23 +80,28 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_ne
clock_gettime(CLOCK_REALTIME, &tv); clock_gettime(CLOCK_REALTIME, &tv);
srand48(tv.tv_sec*1000000000 + tv.tv_nsec); srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
dev_list = ibv_get_device_list(NULL); dev_list = ibv_get_device_list(NULL);
if (!dev_list || !*dev_list) if (!dev_list)
{ {
if (errno == -ENOSYS || errno == ENOSYS) if (errno == -ENOSYS || errno == ENOSYS)
{ {
if (log_level > 0) if (log_level > 0)
fprintf(stderr, "No RDMA devices found (RDMA device list returned ENOSYS)\n"); fprintf(stderr, "No RDMA devices found (RDMA device list returned ENOSYS)\n");
} }
else if (!*dev_list)
{
if (log_level > 0)
fprintf(stderr, "No RDMA devices found\n");
}
else else
fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno)); fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
goto cleanup; goto cleanup;
} }
if (ib_devname) if (!ib_devname)
{
ctx->dev = *dev_list;
if (!ctx->dev)
{
if (log_level > 0)
fprintf(stderr, "No RDMA devices found\n");
goto cleanup;
}
}
else
{ {
int i; int i;
for (i = 0; dev_list[i]; ++i) for (i = 0; dev_list[i]; ++i)
@ -229,31 +114,6 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_ne
goto cleanup; goto cleanup;
} }
} }
else if (osd_networks.size())
{
std::vector<addr_mask_t> nets;
for (auto & netstr: osd_networks)
{
nets.push_back(cidr_parse(netstr));
}
auto best = match_device(dev_list, nets.data(), nets.size(), log_level);
if (best.dev < 0)
{
if (log_level > 0)
fprintf(stderr, "RDMA device matching osd_network is not found, using first available device\n");
best.dev = 0;
}
else
{
ib_port = best.port;
gid_index = best.gid;
}
ctx->dev = dev_list[best.dev];
}
else
{
ctx->dev = *dev_list;
}
ctx->context = ibv_open_device(ctx->dev); ctx->context = ibv_open_device(ctx->dev);
if (!ctx->context) if (!ctx->context)
@ -263,6 +123,7 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_ne
} }
ctx->ib_port = ib_port; ctx->ib_port = ib_port;
ctx->gid_index = gid_index;
if ((res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo)) != 0) if ((res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo)) != 0)
{ {
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res)); fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res));
@ -274,47 +135,10 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_ne
fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev)); fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
goto cleanup; goto cleanup;
} }
if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
if (gid_index != -1)
{ {
ctx->gid_index = gid_index; fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
if (ibv_query_gid_ex(ctx->context, ib_port, gid_index, &ctx->my_gid, 0)) goto cleanup;
{
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
goto cleanup;
}
}
else
{
// Auto-guess GID
for (int k = 0; k < ctx->portinfo.gid_tbl_len; k++)
{
ibv_gid_entry gidx;
if (ibv_query_gid_ex(ctx->context, ib_port, k, &gidx, 0) != 0)
{
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), k);
goto cleanup;
}
// Skip empty GID
if (((uint64_t*)gidx.gid.raw)[0] == 0 &&
((uint64_t*)gidx.gid.raw)[1] == 0)
{
continue;
}
// Prefer IPv4 RoCEv2 GID by default
if (gid_index == -1 ||
gidx.gid_type == IBV_GID_TYPE_ROCE_V2 &&
(ctx->my_gid.gid_type != IBV_GID_TYPE_ROCE_V2 || is_ipv4_gid(&gidx)))
{
gid_index = k;
ctx->my_gid = gidx;
}
}
ctx->gid_index = gid_index = (gid_index == -1 ? 0 : gid_index);
if (log_level > 0)
{
log_rdma_dev_port_gid(ctx->dev, ctx->ib_port, ctx->gid_index, ctx->my_gid);
}
} }
ctx->pd = ibv_alloc_pd(ctx->context); ctx->pd = ibv_alloc_pd(ctx->context);
@ -431,7 +255,7 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
} }
conn->addr.lid = ctx->my_lid; conn->addr.lid = ctx->my_lid;
conn->addr.gid = ctx->my_gid.gid; conn->addr.gid = ctx->my_gid;
conn->addr.qpn = conn->qp->qp_num; conn->addr.qpn = conn->qp->qp_num;
conn->addr.psn = lrand48() & 0xffffff; conn->addr.psn = lrand48() & 0xffffff;
@ -774,7 +598,6 @@ void osd_messenger_t::handle_rdma_events()
} }
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status)); fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
stop_client(client_id); stop_client(client_id);
clear_immediate_ops(client_id);
continue; continue;
} }
if (!is_send) if (!is_send)
@ -783,7 +606,6 @@ void osd_messenger_t::handle_rdma_events()
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len)) if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
{ {
// handle_read_buffer may stop the client // handle_read_buffer may stop the client
clear_immediate_ops(client_id);
continue; continue;
} }
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]); try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
@ -844,5 +666,9 @@ void osd_messenger_t::handle_rdma_events()
} }
} }
} while (event_count > 0); } while (event_count > 0);
handle_immediate_ops(); for (auto cb: set_immediate)
{
cb();
}
set_immediate.clear();
} }

View File

@ -31,12 +31,12 @@ struct msgr_rdma_context_t
uint8_t ib_port; uint8_t ib_port;
uint8_t gid_index; uint8_t gid_index;
uint16_t my_lid; uint16_t my_lid;
ibv_gid_entry my_gid; ibv_gid my_gid;
uint32_t mtu; uint32_t mtu;
int max_cqe = 0; int max_cqe = 0;
int used_max_cqe = 0; int used_max_cqe = 0;
static msgr_rdma_context_t *create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level); static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
~msgr_rdma_context_t(); ~msgr_rdma_context_t();
}; };

View File

@ -65,7 +65,6 @@ void osd_messenger_t::read_requests()
bool osd_messenger_t::handle_read(int result, osd_client_t *cl) bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{ {
bool ret = false; bool ret = false;
int peer_fd = cl->peer_fd;
cl->read_msg.msg_iovlen = 0; cl->read_msg.msg_iovlen = 0;
cl->refs--; cl->refs--;
if (cl->peer_state == PEER_STOPPED) if (cl->peer_state == PEER_STOPPED)
@ -102,8 +101,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{ {
if (!handle_read_buffer(cl, cl->in_buf, result)) if (!handle_read_buffer(cl, cl->in_buf, result))
{ {
clear_immediate_ops(peer_fd); goto fin;
return false;
} }
} }
else else
@ -115,8 +113,7 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{ {
if (!handle_finished_read(cl)) if (!handle_finished_read(cl))
{ {
clear_immediate_ops(peer_fd); goto fin;
return false;
} }
} }
} }
@ -125,47 +122,15 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
ret = true; ret = true;
} }
} }
handle_immediate_ops(); fin:
for (auto cb: set_immediate)
{
cb();
}
set_immediate.clear();
return ret; return ret;
} }
void osd_messenger_t::clear_immediate_ops(int peer_fd)
{
size_t i = 0, j = 0;
while (i < set_immediate_ops.size())
{
if (set_immediate_ops[i]->peer_fd == peer_fd)
{
delete set_immediate_ops[i];
}
else
{
if (i != j)
set_immediate_ops[j] = set_immediate_ops[i];
j++;
}
i++;
}
set_immediate_ops.resize(j);
}
void osd_messenger_t::handle_immediate_ops()
{
for (auto op: set_immediate_ops)
{
if (op->op_type == OSD_OP_IN)
{
exec_op(op);
}
else
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
}
}
set_immediate_ops.clear();
}
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain) bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
{ {
// Compose operation(s) from the buffer // Compose operation(s) from the buffer
@ -234,7 +199,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
{ {
// Operation is ready // Operation is ready
cl->received_ops.push_back(cl->read_op); cl->received_ops.push_back(cl->read_op);
set_immediate_ops.push_back(cl->read_op); set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
cl->read_op = NULL; cl->read_op = NULL;
cl->read_state = 0; cl->read_state = 0;
} }
@ -330,7 +295,7 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{ {
// Operation is ready // Operation is ready
cl->received_ops.push_back(cur_op); cl->received_ops.push_back(cur_op);
set_immediate_ops.push_back(cur_op); set_immediate.push_back([this, cur_op]() { exec_op(cur_op); });
cl->read_op = NULL; cl->read_op = NULL;
cl->read_state = 0; cl->read_state = 0;
} }
@ -451,5 +416,9 @@ void osd_messenger_t::handle_reply_ready(osd_op_t *op)
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 + (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000 (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
); );
set_immediate_ops.push_back(op); set_immediate.push_back([op]()
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
});
} }

View File

@ -16,6 +16,7 @@
#include "qapi/error.h" #include "qapi/error.h"
#include "qapi/qmp/qdict.h" #include "qapi/qmp/qdict.h"
#include "qapi/qmp/qerror.h" #include "qapi/qmp/qerror.h"
#include "qemu/uri.h"
#include "qemu/error-report.h" #include "qemu/error-report.h"
#include "qemu/module.h" #include "qemu/module.h"
#include "qemu/option.h" #include "qemu/option.h"
@ -1020,11 +1021,7 @@ static BlockDriver bdrv_vitastor = {
// FIXME: Implement it along with per-inode statistics // FIXME: Implement it along with per-inode statistics
//.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size, //.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size,
#if QEMU_VERSION_MAJOR > 9 || QEMU_VERSION_MAJOR == 9 && QEMU_VERSION_MINOR > 0
.bdrv_open = vitastor_file_open,
#else
.bdrv_file_open = vitastor_file_open, .bdrv_file_open = vitastor_file_open,
#endif
.bdrv_close = vitastor_close, .bdrv_close = vitastor_close,
// Option list for the create operation // Option list for the create operation

View File

@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor Name: Vitastor
Description: Vitastor client library Description: Vitastor client library
Version: 1.9.3 Version: 1.9.2
Libs: -L${libdir} -lvitastor_client Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir} Cflags: -I${includedir}

View File

@ -431,7 +431,7 @@ struct cli_dd_t
if (read_op->retval < 0) if (read_op->retval < 0)
{ {
fprintf( fprintf(
stderr, "Failed to read bitmap for %ju bytes from image %s at offset %ju: %s (code %d)\n", stderr, "Failed to read bitmap for %lu bytes from image %s at offset %lu: %s (code %d)\n",
read_op->len, iinfo.iimg.c_str(), read_op->offset, read_op->len, iinfo.iimg.c_str(), read_op->offset,
strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval
); );
@ -476,7 +476,7 @@ struct cli_dd_t
if (read_op->retval != read_op->len) if (read_op->retval != read_op->len)
{ {
fprintf( fprintf(
stderr, "Failed to read %ju bytes from image %s at offset %ju: %s (code %d)\n", stderr, "Failed to read %lu bytes from image %s at offset %lu: %s (code %d)\n",
read_op->len, iinfo.iimg.c_str(), read_op->offset, read_op->len, iinfo.iimg.c_str(), read_op->offset,
strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval
); );
@ -547,7 +547,7 @@ struct cli_dd_t
if (data->res < 0) if (data->res < 0)
{ {
fprintf( fprintf(
stderr, "Failed to read %ju bytes from %s at offset %ju: %s (code %d)\n", stderr, "Failed to read %lu bytes from %s at offset %lu: %s (code %d)\n",
data->iov.iov_len, iinfo.ifile == "" ? "stdin" : iinfo.ifile.c_str(), cur_read->offset, data->iov.iov_len, iinfo.ifile == "" ? "stdin" : iinfo.ifile.c_str(), cur_read->offset,
strerror(-data->res), data->res strerror(-data->res), data->res
); );
@ -644,7 +644,7 @@ struct cli_dd_t
if (write_op->retval != write_op->len) if (write_op->retval != write_op->len)
{ {
fprintf( fprintf(
stderr, "Failed to write %ju bytes to image %s at offset %ju: %s (code %d)\n", stderr, "Failed to write %lu bytes to image %s at offset %lu: %s (code %d)\n",
write_op->len, oinfo.oimg.c_str(), write_op->offset, write_op->len, oinfo.oimg.c_str(), write_op->offset,
strerror(write_op->retval < 0 ? -write_op->retval : EIO), write_op->retval strerror(write_op->retval < 0 ? -write_op->retval : EIO), write_op->retval
); );
@ -680,7 +680,7 @@ struct cli_dd_t
if (data->res < 0) if (data->res < 0)
{ {
fprintf( fprintf(
stderr, "Failed to write %ju bytes to %s at offset %ju: %s (code %d)\n", stderr, "Failed to write %lu bytes to %s at offset %lu: %s (code %d)\n",
data->iov.iov_len, oinfo.ofile == "" ? "stdout" : oinfo.ofile.c_str(), data->iov.iov_len, oinfo.ofile == "" ? "stdout" : oinfo.ofile.c_str(),
oinfo.out_seekable ? cur_read->offset+cur_read->len+oseek : 0, oinfo.out_seekable ? cur_read->offset+cur_read->len+oseek : 0,
strerror(-data->res), data->res strerror(-data->res), data->res
@ -727,7 +727,7 @@ struct cli_dd_t
{ {
char buf[256]; char buf[256];
snprintf( snprintf(
buf, sizeof(buf), "%ju bytes (%s) copied, %.1f s, %sB/s", buf, sizeof(buf), "%lu bytes (%s) copied, %.1f s, %sB/s",
written_size, format_size(written_size).c_str(), sec_total, written_size, format_size(written_size).c_str(), sec_total,
format_size((uint64_t)(written_size/sec_total), true).c_str() format_size((uint64_t)(written_size/sec_total), true).c_str()
); );
@ -749,7 +749,7 @@ struct cli_dd_t
else else
{ {
fprintf( fprintf(
stderr, "\r%ju bytes (%s) copied, %.1f s, %sB/s, avg %sB/s\033[K", stderr, "\r%lu bytes (%s) copied, %.1f s, %sB/s, avg %sB/s\033[K",
written_size, format_size(written_size).c_str(), sec_total, written_size, format_size(written_size).c_str(), sec_total,
format_size((uint64_t)(delta/sec_delta), true).c_str(), format_size((uint64_t)(delta/sec_delta), true).c_str(),
format_size((uint64_t)(written_size/sec_total), true).c_str() format_size((uint64_t)(written_size/sec_total), true).c_str()

View File

@ -216,7 +216,7 @@ resume_1:
for (uint64_t osd_num: node.child_osds) for (uint64_t osd_num: node.child_osds)
{ {
auto & osd = placement_tree->osds.at(osd_num); auto & osd = placement_tree->osds.at(osd_num);
auto json_osd = json11::Json::object{ fmt_items.push_back(json11::Json::object{
{ "type", "osd" }, { "type", "osd" },
{ "name", osd.num }, { "name", osd.num },
{ "parent", node.name }, { "parent", node.name },
@ -230,16 +230,7 @@ resume_1:
{ "bitmap", (uint64_t)osd.bitmap_granularity }, { "bitmap", (uint64_t)osd.bitmap_granularity },
{ "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") }, { "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") },
{ "op_stats", osd_stats[osd_num]["op_stats"] }, { "op_stats", osd_stats[osd_num]["op_stats"] },
}; });
if (osd_stats[osd_num]["slow_ops_primary"].uint64_value() > 0)
{
json_osd["slow_ops_primary"] = osd_stats[osd_num]["slow_ops_primary"];
}
if (osd_stats[osd_num]["slow_ops_secondary"].uint64_value() > 0)
{
json_osd["slow_ops_secondary"] = osd_stats[osd_num]["slow_ops_secondary"];
}
fmt_items.push_back(json_osd);
} }
} }
result.data = fmt_items; result.data = fmt_items;

View File

@ -134,7 +134,6 @@ resume_2:
} }
int osd_count = 0, osd_up = 0; int osd_count = 0, osd_up = 0;
uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0; uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
std::vector<uint64_t> slow_op_primary_osds, slow_op_secondary_osds;
parent->iterate_kvs_1(osd_stats, "/osd/stats/", [&](uint64_t stat_osd_num, json11::Json value) parent->iterate_kvs_1(osd_stats, "/osd/stats/", [&](uint64_t stat_osd_num, json11::Json value)
{ {
osd_count++; osd_count++;
@ -154,14 +153,6 @@ resume_2:
if (peer_it != parent->cli->st_cli.peer_states.end()) if (peer_it != parent->cli->st_cli.peer_states.end())
{ {
osd_up++; osd_up++;
if (value["slow_ops_primary"].uint64_value() > 0)
{
slow_op_primary_osds.push_back(stat_osd_num);
}
if (value["slow_ops_secondary"].uint64_value() > 0)
{
slow_op_secondary_osds.push_back(stat_osd_num);
}
} }
else else
{ {
@ -225,10 +216,6 @@ resume_2:
{ "mon_master", mon_master }, { "mon_master", mon_master },
{ "osd_up", osd_up }, { "osd_up", osd_up },
{ "osd_count", osd_count }, { "osd_count", osd_count },
{ "osds_full", osds_full },
{ "osds_nearfull", osds_nearfull },
{ "osds_primary_slow_ops", slow_op_primary_osds },
{ "osds_secondary_slow_ops", slow_op_secondary_osds },
{ "total_raw", total_raw }, { "total_raw", total_raw },
{ "free_raw", free_raw }, { "free_raw", free_raw },
{ "down_raw", down_raw }, { "down_raw", down_raw },
@ -313,26 +300,6 @@ resume_2:
warning_str += " "+std::to_string(osds_nearfull)+ warning_str += " "+std::to_string(osds_nearfull)+
(osds_nearfull > 1 ? " osds are almost full\n" : " osd is almost full\n"); (osds_nearfull > 1 ? " osds are almost full\n" : " osd is almost full\n");
} }
if (slow_op_primary_osds.size() > 0)
{
warning_str += " "+std::to_string(slow_op_primary_osds.size());
warning_str += (slow_op_primary_osds.size() > 1 ? " osds have" : " osd has");
warning_str += " slow client ops: ";
for (int i = 0; i < slow_op_primary_osds.size(); i++)
{
warning_str += (i > 0 ? ", " : "")+std::to_string(slow_op_primary_osds[i])+"\n";
}
}
if (slow_op_secondary_osds.size() > 0)
{
warning_str += " "+std::to_string(slow_op_secondary_osds.size());
warning_str += (slow_op_secondary_osds.size() > 1 ? " osds have" : " osd has");
warning_str += " slow replication ops: ";
for (int i = 0; i < slow_op_secondary_osds.size(); i++)
{
warning_str += (i > 0 ? ", " : "")+std::to_string(slow_op_secondary_osds[i])+"\n";
}
}
if (warning_str != "") if (warning_str != "")
{ {
warning_str = "\n warning:\n"+warning_str; warning_str = "\n warning:\n"+warning_str;

View File

@ -27,16 +27,12 @@ static const char *help_text =
" --osd_per_disk <N>\n" " --osd_per_disk <N>\n"
" Create <N> OSDs on each disk (default 1)\n" " Create <N> OSDs on each disk (default 1)\n"
" --hybrid\n" " --hybrid\n"
" Prepare hybrid (HDD+SSD, NVMe+SATA or etc) OSDs using provided devices. By default,\n" " Prepare hybrid (HDD+SSD) OSDs using provided devices. SSDs will be used for\n"
" any passed SSDs will be used for journals and metadata, HDDs will be used for data,\n" " journals and metadata, HDDs will be used for data. Partitions for journals and\n"
" but you can override this behaviour with --fast-devices option. Journal and metadata\n" " metadata will be created automatically. Whether disks are SSD or HDD is decided\n"
" partitions will be created automatically. In the default mode, SSD and HDD disks\n" " by the `/sys/block/.../queue/rotational` flag. In hybrid mode, default object\n"
" are distinguished by the `/sys/block/.../queue/rotational` flag. When HDDs are used\n" " size is 1 MB instead of 128 KB, default journal size is 1 GB instead of 32 MB,\n"
" for data in hybrid mode, default block_size is 1 MB instead of 128 KB, default journal\n" " and throttle_small_writes is enabled by default.\n"
" size is 1 GB instead of 32 MB, and throttle_small_writes is enabled by default.\n"
" --fast-devices /dev/nvmeX,/dev/nvmeY\n"
" In --hybrid mode, use these devices for journal and metadata instead of auto-detecting\n"
" and extracting them from the main [devices...] list.\n"
" --disable_data_fsync auto\n" " --disable_data_fsync auto\n"
" Disable data device cache and fsync (1/yes/true = on, default auto)\n" " Disable data device cache and fsync (1/yes/true = on, default auto)\n"
" --disable_meta_fsync auto\n" " --disable_meta_fsync auto\n"
@ -200,7 +196,6 @@ static const char *help_text =
" --device_size 0 Set device size\n" " --device_size 0 Set device size\n"
" --format text Result format: json, options, env, or text\n" " --format text Result format: json, options, env, or text\n"
"\n" "\n"
"Default I/O mode for commands involving disk I/O is O_DIRECT. If you don't want it, add --io cached.\n"
"Use vitastor-disk --help <command> for command details or vitastor-disk --help --all for all details.\n" "Use vitastor-disk --help <command> for command details or vitastor-disk --help --all for all details.\n"
; ;
@ -225,10 +220,6 @@ int main(int argc, char *argv[])
cmd.push_back((char*)"dump-journal"); cmd.push_back((char*)"dump-journal");
aliased = true; aliased = true;
} }
else if (!strcmp(exe_name, "vitastor-disk-test"))
{
self.test_mode = true;
}
for (int i = 1; i < argc; i++) for (int i = 1; i < argc; i++)
{ {
if (!strcmp(argv[i], "--all")) if (!strcmp(argv[i], "--all"))
@ -323,7 +314,6 @@ int main(int argc, char *argv[])
// First argument is an OSD device - take metadata layout parameters from it // First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.new_journal_device)) if (self.dump_load_check_superblock(self.new_journal_device))
return 1; return 1;
self.new_journal_device = self.dsk.journal_device;
self.new_journal_offset = self.dsk.journal_offset; self.new_journal_offset = self.dsk.journal_offset;
self.new_journal_len = self.dsk.journal_len; self.new_journal_len = self.dsk.journal_len;
} }
@ -389,7 +379,6 @@ int main(int argc, char *argv[])
// First argument is an OSD device - take metadata layout parameters from it // First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.new_meta_device)) if (self.dump_load_check_superblock(self.new_meta_device))
return 1; return 1;
self.new_meta_device = self.dsk.meta_device;
self.new_meta_offset = self.dsk.meta_offset; self.new_meta_offset = self.dsk.meta_offset;
self.new_meta_len = self.dsk.meta_len; self.new_meta_len = self.dsk.meta_len;
} }

View File

@ -22,7 +22,6 @@
#define VITASTOR_DISK_MAX_SB_SIZE 128*1024 #define VITASTOR_DISK_MAX_SB_SIZE 128*1024
#define VITASTOR_PART_TYPE "e7009fac-a5a1-4d72-af72-53de13059903" #define VITASTOR_PART_TYPE "e7009fac-a5a1-4d72-af72-53de13059903"
#define DEFAULT_HYBRID_JOURNAL "1G" #define DEFAULT_HYBRID_JOURNAL "1G"
#define DEFAULT_HYBRID_SSD_JOURNAL "128M"
struct resizer_data_moving_t; struct resizer_data_moving_t;
@ -41,7 +40,6 @@ struct disk_tool_t
/**** Parameters ****/ /**** Parameters ****/
std::map<std::string, std::string> options; std::map<std::string, std::string> options;
bool test_mode = false;
bool all, json, now; bool all, json, now;
bool dump_with_blocks, dump_with_data; bool dump_with_blocks, dump_with_data;
blockstore_disk_t dsk; blockstore_disk_t dsk;
@ -128,8 +126,7 @@ struct disk_tool_t
uint32_t write_osd_superblock(std::string device, json11::Json params); uint32_t write_osd_superblock(std::string device, json11::Json params);
int prepare_one(std::map<std::string, std::string> options, int is_hdd = -1); int prepare_one(std::map<std::string, std::string> options, int is_hdd = -1);
int check_existing_partition(std::string & dev_by_uuid); int check_existing_partition(const std::string & dev);
int fix_partition_type(std::string & dev_by_uuid);
int prepare(std::vector<std::string> devices); int prepare(std::vector<std::string> devices);
std::vector<vitastor_dev_info_t> collect_devices(const std::vector<std::string> & devices); std::vector<vitastor_dev_info_t> collect_devices(const std::vector<std::string> & devices);
json11::Json add_partitions(vitastor_dev_info_t & devinfo, std::vector<std::string> sizes); json11::Json add_partitions(vitastor_dev_info_t & devinfo, std::vector<std::string> sizes);
@ -151,6 +148,6 @@ int write_zero(int fd, uint64_t offset, uint64_t size);
json11::Json read_parttable(std::string dev); json11::Json read_parttable(std::string dev);
uint64_t dev_size_from_parttable(json11::Json pt); uint64_t dev_size_from_parttable(json11::Json pt);
uint64_t free_from_parttable(json11::Json pt); uint64_t free_from_parttable(json11::Json pt);
int fix_partition_type_uuid(std::string & dev_by_uuid, const std::string & type_uuid); int fix_partition_type(std::string dev_by_uuid);
std::string csum_type_str(uint32_t data_csum_type); std::string csum_type_str(uint32_t data_csum_type);
uint32_t csum_type_from_str(std::string data_csum_type); uint32_t csum_type_from_str(std::string data_csum_type);

View File

@ -18,7 +18,7 @@ int disk_tool_t::dump_journal()
printf("[\n"); printf("[\n");
if (all) if (all)
{ {
dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY); dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY);
if (dsk.journal_fd < 0) if (dsk.journal_fd < 0)
{ {
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
@ -121,7 +121,7 @@ int disk_tool_t::dump_journal()
int disk_tool_t::process_journal(std::function<int(void*)> block_fn) int disk_tool_t::process_journal(std::function<int(void*)> block_fn)
{ {
dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY); dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY);
if (dsk.journal_fd < 0) if (dsk.journal_fd < 0)
{ {
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));

View File

@ -14,7 +14,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT); fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT);
return 1; return 1;
} }
dsk.meta_fd = open(dsk.meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY); dsk.meta_fd = open(dsk.meta_device.c_str(), O_DIRECT|O_RDONLY);
if (dsk.meta_fd < 0) if (dsk.meta_fd < 0)
{ {
fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno));
@ -159,7 +159,7 @@ int disk_tool_t::dump_load_check_superblock(const std::string & device)
{ {
auto cfg = json_to_string_map(sb["params"].object_items()); auto cfg = json_to_string_map(sb["params"].object_items());
dsk.parse_config(cfg); dsk.parse_config(cfg);
dsk.data_io = dsk.meta_io = dsk.journal_io = "cached"; dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.open_data(); dsk.open_data();
dsk.open_meta(); dsk.open_meta();
dsk.open_journal(); dsk.open_journal();
@ -315,7 +315,8 @@ int disk_tool_t::write_json_meta(json11::Json meta)
fromhexstr(e["data_csum"].string_value(), new_data_csum_size, fromhexstr(e["data_csum"].string_value(), new_data_csum_size,
((uint8_t*)new_entry) + sizeof(clean_disk_entry) + 2*new_clean_entry_bitmap_size); ((uint8_t*)new_entry) + sizeof(clean_disk_entry) + 2*new_clean_entry_bitmap_size);
} }
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4); uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + sizeof(clean_disk_entry) +
2*new_clean_entry_bitmap_size + new_data_csum_size);
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4); *new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
} }
} }

View File

@ -29,12 +29,18 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
}; };
if (options.find("force") == options.end()) if (options.find("force") == options.end())
{ {
std::string* all_devs[] = { &options["data_device"], &options["meta_device"], &options["journal_device"] }; std::vector<std::string> all_devs = { options["data_device"], options["meta_device"], options["journal_device"] };
for (int i = 0; i < 3; i++) for (int i = 0; i < all_devs.size(); i++)
{ {
auto & dev = *all_devs[i]; const auto & dev = all_devs[i];
if (dev == "") if (dev == "")
continue; continue;
if (dev.substr(0, 22) != "/dev/disk/by-partuuid/")
{
// Partitions should be identified by GPT partition UUID
fprintf(stderr, "%s does not start with /dev/disk/by-partuuid/. Partitions should be identified by GPT partition UUIDs\n", dev.c_str());
return 1;
}
std::string real_dev = realpath_str(dev, false); std::string real_dev = realpath_str(dev, false);
if (real_dev == "") if (real_dev == "")
return 1; return 1;
@ -108,11 +114,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
try try
{ {
dsk.parse_config(options); dsk.parse_config(options);
// Set all offsets to 4096 to calculate metadata size with excess dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.journal_offset = 4096;
dsk.meta_offset = 4096;
dsk.data_offset = 4096;
dsk.data_io = dsk.meta_io = dsk.journal_io = (options["io"] == "cached" ? "cached" : "direct");
dsk.open_data(); dsk.open_data();
dsk.open_meta(); dsk.open_meta();
dsk.open_journal(); dsk.open_journal();
@ -157,11 +159,7 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
return 1; return 1;
} }
std::string osd_num_str; std::string osd_num_str;
if (test_mode && options.find("osd_num") != options.end()) if (shell_exec({ "vitastor-cli", "alloc-osd" }, "", &osd_num_str, NULL) != 0)
{
osd_num_str = options["osd_num"];
}
else if (shell_exec({ "vitastor-cli", "alloc-osd" }, "", &osd_num_str, NULL) != 0)
{ {
dsk.close_all(); dsk.close_all();
return 1; return 1;
@ -175,8 +173,8 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
} }
sb["osd_num"] = osd_num; sb["osd_num"] = osd_num;
// Zero out metadata and journal // Zero out metadata and journal
if (write_zero(dsk.meta_fd, sb["meta_offset"].uint64_value(), dsk.meta_len) != 0 || if (write_zero(dsk.meta_fd, dsk.meta_offset, dsk.meta_len) != 0 ||
write_zero(dsk.journal_fd, sb["journal_offset"].uint64_value(), dsk.journal_len) != 0) write_zero(dsk.journal_fd, dsk.journal_offset, dsk.journal_len) != 0)
{ {
fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno)); fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno));
dsk.close_all(); dsk.close_all();
@ -201,18 +199,15 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
if (sep_j) if (sep_j)
desc += (sep_m ? " and journal on " : " with journal on ") + realpath_str(options["journal_device"]); desc += (sep_m ? " and journal on " : " with journal on ") + realpath_str(options["journal_device"]);
fprintf(stderr, "Initialized OSD %ju on %s\n", osd_num, desc.c_str()); fprintf(stderr, "Initialized OSD %ju on %s\n", osd_num, desc.c_str());
if (!test_mode || options.find("no_init") == options.end()) if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0)
{ {
if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0) fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%ju\n", osd_num);
{ return 1;
fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%ju\n", osd_num);
return 1;
}
} }
return 0; return 0;
} }
int disk_tool_t::check_existing_partition(std::string & dev) int disk_tool_t::check_existing_partition(const std::string & dev)
{ {
std::string out; std::string out;
if (shell_exec({ "wipefs", dev }, "", &out, NULL) != 0 || out != "") if (shell_exec({ "wipefs", dev }, "", &out, NULL) != 0 || out != "")
@ -234,27 +229,11 @@ int disk_tool_t::check_existing_partition(std::string & dev)
return 0; return 0;
} }
int disk_tool_t::fix_partition_type(std::string & dev)
{
std::string type_uuid = VITASTOR_PART_TYPE;
if (test_mode && options.find("part_type_uuid") != options.end())
{
type_uuid = options["part_type_uuid"];
}
return fix_partition_type_uuid(dev, type_uuid);
}
std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<std::string> & devices) std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<std::string> & devices)
{ {
std::vector<vitastor_dev_info_t> devinfo; std::vector<vitastor_dev_info_t> devinfo;
std::set<std::string> seen;
for (auto & dev: devices) for (auto & dev: devices)
{ {
if (seen.find(dev) != seen.end())
{
fprintf(stderr, "%s is specified multiple times, ignoring\n", dev.c_str());
continue;
}
// Check if the device is a whole disk // Check if the device is a whole disk
if (dev.substr(0, 5) != "/dev/") if (dev.substr(0, 5) != "/dev/")
{ {
@ -315,6 +294,10 @@ std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<
.free = !pt.is_null() ? free_from_parttable(pt) : dev_size, .free = !pt.is_null() ? free_from_parttable(pt) : dev_size,
}); });
} }
if (!devinfo.size())
{
fprintf(stderr, "No suitable devices found\n");
}
return devinfo; return devinfo;
} }
@ -365,12 +348,47 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
fprintf(stderr, "Failed to add %zu partition(s) with sfdisk: new partitions not found in table\n", sizes.size()); fprintf(stderr, "Failed to add %zu partition(s) with sfdisk: new partitions not found in table\n", sizes.size());
return {}; return {};
} }
// Check if new devices exist, run partprobe if not, then wait until they appear // Check if new nodes exist and run partprobe if not
// FIXME: We could use parted instead of sfdisk because partprobe is already a part of parted // FIXME: We could use parted instead of sfdisk because partprobe is already a part of parted
int iter = 0, r;
while (true)
{
for (const auto & part: new_parts)
{
std::string link_path = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
struct stat st;
if (lstat(link_path.c_str(), &st) < 0)
{
if (errno == ENOENT)
{
iter++;
// Run partprobe
std::string out;
if (iter > 1 || (r = shell_exec({ "partprobe", devinfo.path }, "", &out, NULL)) != 0)
{
fprintf(
stderr, iter == 1 && r == 255
? "partprobe utility is required to reread partition table while disk %s is in use\n"
: "partprobe failed to re-read partition table while disk %s is in use\n",
devinfo.path.c_str()
);
return {};
}
break;
}
else
{
fprintf(stderr, "Failed to lstat %s: %s\n", link_path.c_str(), strerror(errno));
return {};
}
}
}
break;
}
// Wait until device symlinks in /dev/disk/by-partuuid/ appear
bool exists = false; bool exists = false;
const int max_iter = 300; // max 30 sec const int max_iter = 300; // max 30 sec
int iter = 0; iter = 0;
int r = 0;
while (!exists && iter < max_iter) while (!exists && iter < max_iter)
{ {
exists = true; exists = true;
@ -378,48 +396,28 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
{ {
std::string link_path = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value()); std::string link_path = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
struct stat st; struct stat st;
if (stat(part["node"].string_value().c_str(), &st) < 0 || if (lstat(link_path.c_str(), &st) < 0)
lstat(link_path.c_str(), &st) < 0)
{ {
if (errno == ENOENT) if (errno == ENOENT)
{ {
exists = false; exists = false;
if (iter == 4) if (iter == 4)
{ {
// Print message after 400ms
fprintf(stderr, "Waiting for %s to appear for up to %d sec...\n", link_path.c_str(), max_iter/10); fprintf(stderr, "Waiting for %s to appear for up to %d sec...\n", link_path.c_str(), max_iter/10);
} }
} }
else else
{ {
fprintf(stderr, "Failed to stat %s or lstat %s: %s\n", part["node"].string_value().c_str(), fprintf(stderr, "Failed to lstat %s: %s\n", link_path.c_str(), strerror(errno));
link_path.c_str(), strerror(errno));
return {}; return {};
} }
} }
} }
if (exists) if (!exists)
{ {
break; struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; // 100ms
iter += (nanosleep(&ts, NULL) == 0);
} }
if (!exists && iter == 0)
{
// Run partprobe
std::string out;
r = shell_exec({ "partprobe", devinfo.path }, "", &out, NULL);
if (r != 0)
{
fprintf(
stderr, r == 255
? "partprobe utility is required to reread partition table while disk %s is in use\n"
: "partprobe failed to re-read partition table while disk %s is in use\n",
devinfo.path.c_str()
);
return {};
}
}
struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; // 100ms
iter += (nanosleep(&ts, NULL) == 0 || !iter);
} }
devinfo.pt = newpt; devinfo.pt = newpt;
devinfo.osd_part_count += sizes.size(); devinfo.osd_part_count += sizes.size();
@ -502,10 +500,7 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
{ {
blockstore_disk_t dsk; blockstore_disk_t dsk;
dsk.parse_config(options); dsk.parse_config(options);
dsk.journal_offset = 4096; dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.meta_offset = 4096;
dsk.data_offset = 4096;
dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
dsk.open_data(); dsk.open_data();
dsk.open_meta(); dsk.open_meta();
dsk.open_journal(); dsk.open_journal();
@ -515,7 +510,6 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
} }
catch (std::exception & e) catch (std::exception & e)
{ {
dsk.close_all();
fprintf(stderr, "%s\n", e.what()); fprintf(stderr, "%s\n", e.what());
return 1; return 1;
} }
@ -570,12 +564,9 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
{ {
if (options.find("data_device") != options.end() && options["data_device"] != "") if (options.find("data_device") != options.end() && options["data_device"] != "")
{ {
if (options.find("hybrid") != options.end() || if (options.find("hybrid") != options.end() || options.find("osd_per_disk") != options.end() || devices.size())
options.find("fast_devices") != options.end() ||
options.find("osd_per_disk") != options.end() ||
devices.size())
{ {
fprintf(stderr, "Device list (positional arguments), --osd_per_disk, --hybrid and --fast-devices are incompatible with --data_device\n"); fprintf(stderr, "Device list (positional arguments) and --hybrid are incompatible with --data_device\n");
return 1; return 1;
} }
return prepare_one(options, options.find("hdd") != options.end() ? 1 : 0); return prepare_one(options, options.find("hdd") != options.end() ? 1 : 0);
@ -592,10 +583,8 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
auto devinfo = collect_devices(devices); auto devinfo = collect_devices(devices);
if (!devinfo.size()) if (!devinfo.size())
{ {
fprintf(stderr, "No suitable devices found\n");
return 1; return 1;
} }
bool explicit_fast = options.find("fast_devices") != options.end();
uint64_t osd_per_disk = stoull_full(options["osd_per_disk"]); uint64_t osd_per_disk = stoull_full(options["osd_per_disk"]);
if (!osd_per_disk) if (!osd_per_disk)
osd_per_disk = 1; osd_per_disk = 1;
@ -614,55 +603,21 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
if (options.find("disable_meta_fsync") == options.end()) if (options.find("disable_meta_fsync") == options.end())
options["disable_meta_fsync"] = "auto"; options["disable_meta_fsync"] = "auto";
options["disable_journal_fsync"] = options["disable_meta_fsync"]; options["disable_journal_fsync"] = options["disable_meta_fsync"];
if (explicit_fast) for (auto & dev: devinfo)
if (!dev.is_hdd)
ssds.push_back(dev);
if (!ssds.size())
{ {
auto fast = explode(",", options["fast_devices"], true); fprintf(stderr, "No SSDs found\n");
ssds = collect_devices(fast); return 1;
if (!ssds.size())
{
fprintf(stderr, "No fast devices found\n");
return 1;
}
if (options["journal_size"] == "")
{
auto auto_journal_size = DEFAULT_HYBRID_SSD_JOURNAL;
for (auto & dev: devinfo)
{
if (dev.is_hdd)
{
auto_journal_size = DEFAULT_HYBRID_JOURNAL;
break;
}
}
options["journal_size"] = auto_journal_size;
}
} }
else else if (ssds.size() == devinfo.size())
{ {
std::vector<vitastor_dev_info_t> hdds; fprintf(stderr, "No HDDs found\n");
for (auto & dev: devinfo) return 1;
{
if (!dev.is_hdd)
ssds.push_back(dev);
else
hdds.push_back(dev);
}
if (!ssds.size())
{
fprintf(stderr, "No SSDs found\n");
return 1;
}
if (!hdds.size())
{
fprintf(stderr, "No HDDs found\n");
return 1;
}
devinfo = hdds;
if (options["journal_size"] == "")
{
options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
}
} }
if (options["journal_size"] == "")
options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
} }
else else
{ {
@ -672,28 +627,31 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
auto journal_size = options["journal_size"]; auto journal_size = options["journal_size"];
for (auto & dev: devinfo) for (auto & dev: devinfo)
{ {
// Select new partitions and create an OSD on each of them if (!hybrid || dev.is_hdd)
for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
{ {
options["force"] = true; // Select new partitions and create an OSD on each of them
options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid); for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
if (hybrid)
{ {
// Select/create journal and metadata partitions options["force"] = true;
int r = get_meta_partition(ssds, options); options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid);
if (r != 0) if (hybrid)
{ {
return 1; // Select/create journal and metadata partitions
int r = get_meta_partition(ssds, options);
if (r != 0)
{
return 1;
}
options.erase("journal_size");
}
// Treat all disks as SSDs if not in the hybrid mode
prepare_one(options, dev.is_hdd ? 1 : 0);
if (hybrid)
{
options["journal_size"] = journal_size;
options.erase("journal_device");
options.erase("meta_device");
} }
options.erase("journal_size");
}
// Treat all disks as SSDs if not in the hybrid mode
prepare_one(options, dev.is_hdd ? 1 : 0);
if (hybrid)
{
options["journal_size"] = journal_size;
options.erase("journal_device");
options.erase("meta_device");
} }
} }
} }

View File

@ -91,7 +91,7 @@ int disk_tool_t::resize_parse_params()
try try
{ {
dsk.parse_config(options); dsk.parse_config(options);
dsk.data_io = dsk.meta_io = dsk.journal_io = "cached"; dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.open_data(); dsk.open_data();
dsk.open_meta(); dsk.open_meta();
dsk.open_journal(); dsk.open_journal();
@ -114,10 +114,7 @@ int disk_tool_t::resize_parse_params()
new_data_offset = options.find("new_data_offset") != options.end() new_data_offset = options.find("new_data_offset") != options.end()
? parse_size(options["new_data_offset"]) : dsk.data_offset; ? parse_size(options["new_data_offset"]) : dsk.data_offset;
new_data_len = options.find("new_data_len") != options.end() new_data_len = options.find("new_data_len") != options.end()
? parse_size(options["new_data_len"]) ? parse_size(options["new_data_len"]) : dsk.data_len;
: (options.find("new_data_offset") != options.end()
? dsk.data_device_size-new_data_offset
: dsk.data_len);
new_meta_offset = options.find("new_meta_offset") != options.end() new_meta_offset = options.find("new_meta_offset") != options.end()
? parse_size(options["new_meta_offset"]) : dsk.meta_offset; ? parse_size(options["new_meta_offset"]) : dsk.meta_offset;
new_meta_len = options.find("new_meta_len") != options.end() new_meta_len = options.find("new_meta_len") != options.end()
@ -126,14 +123,6 @@ int disk_tool_t::resize_parse_params()
? parse_size(options["new_journal_offset"]) : dsk.journal_offset; ? parse_size(options["new_journal_offset"]) : dsk.journal_offset;
new_journal_len = options.find("new_journal_len") != options.end() new_journal_len = options.find("new_journal_len") != options.end()
? parse_size(options["new_journal_len"]) : dsk.journal_len; ? parse_size(options["new_journal_len"]) : dsk.journal_len;
if (new_data_len+new_data_offset > dsk.data_device_size)
new_data_len = dsk.data_device_size-new_data_offset;
if (new_meta_device == dsk.data_device && new_data_offset < new_meta_offset &&
new_data_len+new_data_offset > new_meta_offset)
new_data_len = new_meta_offset-new_data_offset;
if (new_journal_device == dsk.data_device && new_data_offset < new_journal_offset &&
new_data_len+new_data_offset > new_journal_offset)
new_data_len = new_journal_offset-new_data_offset;
if (new_meta_device == dsk.meta_device && if (new_meta_device == dsk.meta_device &&
new_journal_device == dsk.journal_device && new_journal_device == dsk.journal_device &&
new_data_offset == dsk.data_offset && new_data_offset == dsk.data_offset &&
@ -170,10 +159,10 @@ void disk_tool_t::resize_init(blockstore_meta_header_v2_t *hdr)
dsk.data_csum_type = hdr->data_csum_type; dsk.data_csum_type = hdr->data_csum_type;
dsk.csum_block_size = hdr->csum_block_size; dsk.csum_block_size = hdr->csum_block_size;
} }
if (((new_data_offset-dsk.data_offset) % dsk.data_block_size)) if (((new_data_len-dsk.data_len) % dsk.data_block_size) ||
((new_data_offset-dsk.data_offset) % dsk.data_block_size))
{ {
fprintf(stderr, "Data alignment mismatch: old data offset is 0x%jx, new is 0x%jx, but alignment on %x should be equal\n", fprintf(stderr, "Data alignment mismatch\n");
dsk.data_offset, new_data_offset, dsk.data_block_size);
exit(1); exit(1);
} }
data_idx_diff = ((int64_t)(dsk.data_offset-new_data_offset)) / dsk.data_block_size; data_idx_diff = ((int64_t)(dsk.data_offset-new_data_offset)) / dsk.data_block_size;
@ -231,10 +220,10 @@ int disk_tool_t::resize_remap_blocks()
} }
for (uint64_t i = 0; i < free_last; i++) for (uint64_t i = 0; i < free_last; i++)
{ {
if (data_alloc->get(total_blocks-i-1)) if (data_alloc->get(total_blocks-i))
data_remap[total_blocks-i-1] = 0; data_remap[total_blocks-i] = 0;
else else
data_alloc->set(total_blocks-i-1, true); data_alloc->set(total_blocks-i, true);
} }
for (auto & p: data_remap) for (auto & p: data_remap)
{ {
@ -257,7 +246,7 @@ int disk_tool_t::resize_copy_data()
iodepth = 32; iodepth = 32;
} }
ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth); ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
dsk.data_fd = open(dsk.data_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
if (dsk.data_fd < 0) if (dsk.data_fd < 0)
{ {
fprintf(stderr, "Failed to open data device %s: %s\n", dsk.data_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open data device %s: %s\n", dsk.data_device.c_str(), strerror(errno));
@ -452,7 +441,7 @@ int disk_tool_t::resize_rewrite_journal()
int disk_tool_t::resize_write_new_journal() int disk_tool_t::resize_write_new_journal()
{ {
new_journal_fd = open(new_journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); new_journal_fd = open(new_journal_device.c_str(), O_DIRECT|O_RDWR);
if (new_journal_fd < 0) if (new_journal_fd < 0)
{ {
fprintf(stderr, "Failed to open new journal device %s: %s\n", new_journal_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open new journal device %s: %s\n", new_journal_device.c_str(), strerror(errno));
@ -478,13 +467,12 @@ int disk_tool_t::resize_rewrite_meta()
blockstore_meta_header_v2_t *new_hdr = (blockstore_meta_header_v2_t *)new_meta_buf; blockstore_meta_header_v2_t *new_hdr = (blockstore_meta_header_v2_t *)new_meta_buf;
new_hdr->zero = 0; new_hdr->zero = 0;
new_hdr->magic = BLOCKSTORE_META_MAGIC_V1; new_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
new_hdr->version = BLOCKSTORE_META_FORMAT_V2; new_hdr->version = BLOCKSTORE_META_FORMAT_V1;
new_hdr->meta_block_size = dsk.meta_block_size; new_hdr->meta_block_size = dsk.meta_block_size;
new_hdr->data_block_size = dsk.data_block_size; new_hdr->data_block_size = dsk.data_block_size;
new_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096; new_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096;
new_hdr->data_csum_type = dsk.data_csum_type; new_hdr->data_csum_type = dsk.data_csum_type;
new_hdr->csum_block_size = dsk.csum_block_size; new_hdr->csum_block_size = dsk.csum_block_size;
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(*new_hdr));
}, },
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) [this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{ {
@ -493,7 +481,7 @@ int disk_tool_t::resize_rewrite_meta()
block_num = remap_it->second; block_num = remap_it->second;
if (block_num < free_first || block_num >= total_blocks-free_last) if (block_num < free_first || block_num >= total_blocks-free_last)
{ {
fprintf(stderr, "BUG: remapped block %ju not in range %ju..%ju\n", block_num, free_first, total_blocks-free_last); fprintf(stderr, "BUG: remapped block not in range\n");
exit(1); exit(1);
} }
block_num += data_idx_diff; block_num += data_idx_diff;
@ -506,8 +494,6 @@ int disk_tool_t::resize_rewrite_meta()
memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size + new_data_csum_size); memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size + new_data_csum_size);
else else
memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size); memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size);
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
} }
); );
if (r != 0) if (r != 0)
@ -521,7 +507,7 @@ int disk_tool_t::resize_rewrite_meta()
int disk_tool_t::resize_write_new_meta() int disk_tool_t::resize_write_new_meta()
{ {
new_meta_fd = open(new_meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); new_meta_fd = open(new_meta_device.c_str(), O_DIRECT|O_RDWR);
if (new_meta_fd < 0) if (new_meta_fd < 0)
{ {
fprintf(stderr, "Failed to open new metadata device %s: %s\n", new_meta_device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open new metadata device %s: %s\n", new_meta_device.c_str(), strerror(errno));

View File

@ -37,8 +37,6 @@ int disk_tool_t::resize_data(std::string device)
fprintf(stderr, "%s\n", e.what()); fprintf(stderr, "%s\n", e.what());
return 1; return 1;
} }
// Save FD numbers because calc_lengths() relies on them
int old_journal_fd = dsk.journal_fd, old_meta_fd = dsk.meta_fd, old_data_fd = dsk.data_fd;
dsk.close_all(); dsk.close_all();
bool dry_run = options.find("dry_run") != options.end(); bool dry_run = options.find("dry_run") != options.end();
auto old_journal_device = dsk.journal_device; auto old_journal_device = dsk.journal_device;
@ -50,22 +48,6 @@ int disk_tool_t::resize_data(std::string device)
if (options.find("move_journal") == options.end()) if (options.find("move_journal") == options.end())
options["move_journal"] = dsk.journal_device == dsk.data_device ? "" : dsk.journal_device; options["move_journal"] = dsk.journal_device == dsk.data_device ? "" : dsk.journal_device;
} }
uint64_t new_data_dev_size = 0;
if (options.find("data_size") != options.end())
{
new_data_dev_size = parse_size(options["data_size"]);
new_data_dev_size = options["data_size"] == "max" || new_data_dev_size > dsk.data_device_size
? dsk.data_device_size : new_data_dev_size;
dsk.data_device_size = new_data_dev_size;
dsk.cfg_data_size = 0;
dsk.journal_fd = old_journal_fd;
dsk.meta_fd = old_meta_fd;
dsk.data_fd = old_data_fd;
dsk.calc_lengths(true);
dsk.journal_fd = -1;
dsk.meta_fd = -1;
dsk.data_fd = -1;
}
std::map<std::string, std::string> move_options; std::map<std::string, std::string> move_options;
if (options.find("move_journal") != options.end()) if (options.find("move_journal") != options.end())
{ {
@ -87,8 +69,14 @@ int disk_tool_t::resize_data(std::string device)
new_data_offset += ((dsk.data_offset-new_data_offset) % dsk.data_block_size); new_data_offset += ((dsk.data_offset-new_data_offset) % dsk.data_block_size);
if (new_data_offset != dsk.data_offset) if (new_data_offset != dsk.data_offset)
move_options["new_data_offset"] = std::to_string(new_data_offset); move_options["new_data_offset"] = std::to_string(new_data_offset);
if (new_data_dev_size != 0) if (options.find("data_size") != options.end())
move_options["new_data_len"] = std::to_string(new_data_dev_size-new_data_offset); {
auto new_data_dev_size = parse_size(options["data_size"]);
new_data_dev_size = options["data_size"] == "max" || new_data_dev_size > dsk.data_device_size
? dsk.data_device_size : new_data_dev_size;
if (new_data_dev_size-dsk.data_offset != dsk.data_len)
move_options["new_data_len"] = std::to_string(new_data_dev_size-new_data_offset);
}
new_meta_offset = 4096 + (new_meta_device == new_journal_device ? new_journal_len : 0); new_meta_offset = 4096 + (new_meta_device == new_journal_device ? new_journal_len : 0);
if (new_meta_offset != dsk.meta_offset) if (new_meta_offset != dsk.meta_offset)
move_options["new_meta_offset"] = std::to_string(new_meta_offset); move_options["new_meta_offset"] = std::to_string(new_meta_offset);
@ -200,12 +188,17 @@ int disk_tool_t::resize_parse_move_journal(std::map<std::string, std::string> &
else else
options["move_journal"] = "<new journal partition on "+parent_dev+">"; options["move_journal"] = "<new journal partition on "+parent_dev+">";
} }
else if (options["move_journal"].substr(0, 22) != "/dev/disk/by-partuuid/")
{
// Partitions should be identified by GPT partition UUID
fprintf(stderr, "%s does not start with /dev/disk/by-partuuid/. Partitions should be identified by GPT partition UUIDs\n", options["move_journal"].c_str());
return 1;
}
else else
{ {
// already a partition - check that it's a GPT partition with correct type // already a partition - check that it's a GPT partition with correct type
if ((options.find("force") == options.end() if (options.find("force") == options.end() &&
? check_existing_partition(options["move_journal"]) check_existing_partition(real_dev) != 0)
: fix_partition_type(options["move_journal"])) != 0)
{ {
return 1; return 1;
} }
@ -276,12 +269,17 @@ int disk_tool_t::resize_parse_move_meta(std::map<std::string, std::string> & mov
else else
options["move_meta"] = "<new metadata partition on "+parent_dev+">"; options["move_meta"] = "<new metadata partition on "+parent_dev+">";
} }
else if (options["move_meta"].substr(0, 22) != "/dev/disk/by-partuuid/")
{
// Partitions should be identified by GPT partition UUID
fprintf(stderr, "%s does not start with /dev/disk/by-partuuid/. Partitions should be identified by GPT partition UUIDs\n", options["move_meta"].c_str());
return 1;
}
else else
{ {
// already a partition - check that it's a GPT partition with correct type // already a partition - check that it's a GPT partition with correct type
if ((options.find("force") == options.end() if (options.find("force") == options.end() &&
? check_existing_partition(options["move_meta"]) check_existing_partition(real_dev) != 0)
: fix_partition_type(options["move_meta"])) != 0)
{ {
return 1; return 1;
} }

View File

@ -122,7 +122,7 @@ uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json para
sb->size = sb_size; sb->size = sb_size;
memcpy(sb->json_data, json_data.c_str(), json_data.size()); memcpy(sb->json_data, json_data.c_str(), json_data.size());
sb->crc32c = crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf)); sb->crc32c = crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf));
int fd = open(device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); int fd = open(device.c_str(), O_DIRECT|O_RDWR);
if (fd < 0) if (fd < 0)
{ {
fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno));
@ -150,7 +150,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
json11::Json osd_params; json11::Json osd_params;
std::string json_err; std::string json_err;
std::string real_device, device_type, real_data, real_meta, real_journal; std::string real_device, device_type, real_data, real_meta, real_journal;
int r, fd = open(device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); int r, fd = open(device.c_str(), O_DIRECT|O_RDWR);
if (fd < 0) if (fd < 0)
{ {
fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno)); fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno));
@ -385,7 +385,7 @@ int disk_tool_t::pre_exec_osd(std::string device)
int disk_tool_t::clear_osd_superblock(const std::string & dev) int disk_tool_t::clear_osd_superblock(const std::string & dev)
{ {
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096); uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
int fd = -1, r = open(dev.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR); int fd = -1, r = open(dev.c_str(), O_DIRECT|O_RDWR);
if (r >= 0) if (r >= 0)
{ {
fd = r; fd = r;

View File

@ -343,42 +343,23 @@ uint64_t free_from_parttable(json11::Json pt)
return free; return free;
} }
int fix_partition_type_uuid(std::string & dev_by_uuid, const std::string & type_uuid) int fix_partition_type(std::string dev_by_uuid)
{ {
bool is_partuuid = dev_by_uuid.substr(0, 22) == "/dev/disk/by-partuuid/"; auto uuid = strtolower(dev_by_uuid.substr(dev_by_uuid.rfind('/')+1));
auto uuid = is_partuuid ? strtolower(dev_by_uuid.substr(22)) : ""; std::string parent_dev = get_parent_device(realpath_str(dev_by_uuid, false));
auto node = realpath_str(dev_by_uuid, false);
std::string parent_dev = get_parent_device(node);
if (parent_dev == "") if (parent_dev == "")
return 1; return 1;
auto pt = read_parttable(parent_dev); auto pt = read_parttable(parent_dev);
if (pt.is_null() || pt.is_bool()) if (pt.is_null() || pt.is_bool())
return 1; return 1;
bool found = false;
std::string script = "label: gpt\n\n"; std::string script = "label: gpt\n\n";
for (const auto & part: pt["partitions"].array_items()) for (const auto & part: pt["partitions"].array_items())
{ {
bool this_part = (part["node"].string_value() == node) && bool this_part = (strtolower(part["uuid"].string_value()) == uuid);
(!is_partuuid || strtolower(part["uuid"].string_value()) == uuid); if (this_part && strtolower(part["type"].string_value()) == "e7009fac-a5a1-4d72-af72-53de13059903")
if (this_part)
{ {
found = true; // Already correct type
if (!is_partuuid) return 0;
{
if (part["uuid"] == "")
{
fprintf(stderr, "Could not determine partition UUID for %s. Please use GPT partitions\n", dev_by_uuid.c_str());
return 1;
}
auto new_dev = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
fprintf(stderr, "Using %s instead of %s\n", new_dev.c_str(), dev_by_uuid.c_str());
dev_by_uuid = new_dev;
}
if (strtolower(part["type"].string_value()) == type_uuid)
{
// Already correct type
return 0;
}
} }
script += part["node"].string_value()+": "; script += part["node"].string_value()+": ";
bool first = true; bool first = true;
@ -388,18 +369,13 @@ int fix_partition_type_uuid(std::string & dev_by_uuid, const std::string & type_
{ {
script += (first ? "" : ", ")+kv.first+"="+ script += (first ? "" : ", ")+kv.first+"="+
(kv.first == "type" && this_part (kv.first == "type" && this_part
? type_uuid ? "e7009fac-a5a1-4d72-af72-53de13059903"
: (kv.second.is_string() ? kv.second.string_value() : kv.second.dump())); : (kv.second.is_string() ? kv.second.string_value() : kv.second.dump()));
first = false; first = false;
} }
} }
script += "\n"; script += "\n";
} }
if (!found)
{
fprintf(stderr, "Could not find partition table entry for %s\n", dev_by_uuid.c_str());
return 1;
}
std::string out; std::string out;
return shell_exec({ "sfdisk", "--no-reread", "--no-tell-kernel", "--force", parent_dev }, script, &out, NULL); return shell_exec({ "sfdisk", "--no-reread", "--no-tell-kernel", "--force", parent_dev }, script, &out, NULL);
} }

View File

@ -137,7 +137,7 @@ void nfs_proxy_t::handle_rdmacm_events()
} }
if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST) if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
{ {
rdmacm_accept(ev); rdmacm_accept();
} }
else if (ev->event == RDMA_CM_EVENT_CONNECT_ERROR || else if (ev->event == RDMA_CM_EVENT_CONNECT_ERROR ||
ev->event == RDMA_CM_EVENT_REJECTED || ev->event == RDMA_CM_EVENT_REJECTED ||

View File

@ -535,12 +535,10 @@ void osd_t::print_stats()
void osd_t::print_slow() void osd_t::print_slow()
{ {
cur_slow_op_primary = 0; bool has_slow = false;
cur_slow_op_secondary = 0;
char alloc[1024]; char alloc[1024];
timespec now; timespec now;
clock_gettime(CLOCK_REALTIME, &now); clock_gettime(CLOCK_REALTIME, &now);
// FIXME: Also track slow local blockstore ops and recovery/flush/scrub ops
for (auto & kv: msgr.clients) for (auto & kv: msgr.clients)
{ {
for (auto op: kv.second->received_ops) for (auto op: kv.second->received_ops)
@ -610,7 +608,6 @@ void osd_t::print_slow()
op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK || op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ||
op->req.hdr.opcode == OSD_OP_SEC_READ_BMP) op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
{ {
cur_slow_op_secondary++;
bufprintf(" state=%d", op->bs_op ? PRIV(op->bs_op)->op_state : -1); bufprintf(" state=%d", op->bs_op ? PRIV(op->bs_op)->op_state : -1);
int wait_for = op->bs_op ? PRIV(op->bs_op)->wait_for : 0; int wait_for = op->bs_op ? PRIV(op->bs_op)->wait_for : 0;
if (wait_for) if (wait_for)
@ -621,19 +618,15 @@ void osd_t::print_slow()
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE || else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
op->req.hdr.opcode == OSD_OP_SYNC || op->req.hdr.opcode == OSD_OP_DELETE) op->req.hdr.opcode == OSD_OP_SYNC || op->req.hdr.opcode == OSD_OP_DELETE)
{ {
cur_slow_op_primary++;
bufprintf(" state=%d", !op->op_data ? -1 : op->op_data->st); bufprintf(" state=%d", !op->op_data ? -1 : op->op_data->st);
} }
else
{
cur_slow_op_primary++;
}
#undef bufprintf #undef bufprintf
printf("%s\n", alloc); printf("%s\n", alloc);
has_slow = true;
} }
} }
} }
if ((cur_slow_op_primary+cur_slow_op_secondary) > 0 && bs) if (has_slow && bs)
{ {
bs->dump_diagnostics(); bs->dump_diagnostics();
} }

View File

@ -150,9 +150,7 @@ class osd_t
bool pg_config_applied = false; bool pg_config_applied = false;
bool etcd_reporting_pg_state = false; bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false; bool etcd_reporting_stats = false;
int print_stats_timer_id = -1, slow_log_timer_id = -1; int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1;
uint64_t cur_slow_op_primary = 0;
uint64_t cur_slow_op_secondary = 0;
// peers and PGs // peers and PGs
@ -170,8 +168,6 @@ class osd_t
object_id recovery_last_oid; object_id recovery_last_oid;
int recovery_pg_done = 0, recovery_done = 0; int recovery_pg_done = 0, recovery_done = 0;
osd_op_t *autosync_op = NULL; osd_op_t *autosync_op = NULL;
int autosync_copies_to_delete = 0;
int autosync_timer_id = -1;
// Scrubbing // Scrubbing
uint64_t scrub_nearest_ts = 0; uint64_t scrub_nearest_ts = 0;

View File

@ -201,14 +201,6 @@ json11::Json osd_t::get_statistics()
st["immediate_commit"] = immediate_commit == IMMEDIATE_ALL ? "all" : (immediate_commit == IMMEDIATE_SMALL ? "small" : "none"); st["immediate_commit"] = immediate_commit == IMMEDIATE_ALL ? "all" : (immediate_commit == IMMEDIATE_SMALL ? "small" : "none");
st["host"] = self_state["host"]; st["host"] = self_state["host"];
st["version"] = VITASTOR_VERSION; st["version"] = VITASTOR_VERSION;
if (cur_slow_op_primary > 0)
{
st["slow_ops_primary"] = cur_slow_op_primary;
}
if (cur_slow_op_secondary > 0)
{
st["slow_ops_secondary"] = cur_slow_op_secondary;
}
json11::Json::object op_stats, subop_stats; json11::Json::object op_stats, subop_stats;
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++) for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
{ {

View File

@ -13,11 +13,10 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
bool first = true; bool first = true;
while (it != pg.flush_actions.end()) while (it != pg.flush_actions.end())
{ {
if (!first && if (!first && (it->first.oid.inode != prev_it->first.oid.inode ||
(it->first.oid.inode != prev_it->first.oid.inode || (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) && fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
(fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH || fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH)
fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH))
{ {
// Stop only at the object boundary // Stop only at the object boundary
break; break;
@ -76,7 +75,6 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
// Throw the result away // Throw the result away
return; return;
} }
fb->flush_done++;
if (retval != 0) if (retval != 0)
{ {
if (peer_osd == this->osd_num) if (peer_osd == this->osd_num)
@ -94,11 +92,12 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
auto fd_it = msgr.osd_peer_fds.find(peer_osd); auto fd_it = msgr.osd_peer_fds.find(peer_osd);
if (fd_it != msgr.osd_peer_fds.end()) if (fd_it != msgr.osd_peer_fds.end())
{ {
// Will repeer/stop this PG
msgr.stop_client(fd_it->second); msgr.stop_client(fd_it->second);
} }
return;
} }
} }
fb->flush_done++;
if (fb->flush_done == fb->flush_ops) if (fb->flush_done == fb->flush_ops)
{ {
// This flush batch is done // This flush batch is done

View File

@ -645,18 +645,6 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
{ {
throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state)); throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
} }
if (changed && immediate_commit != IMMEDIATE_ALL)
{
// Trigger double automatic sync after changing PG state when we're running with fsyncs.
// First autosync commits all written objects and applies copies_to_delete_after_sync;
// Second autosync commits all deletions run by the first sync.
// Without it, rebalancing in a cluster without load may result in some small amount of
// garbage left on "extra" OSDs of the PG, because last deletions are not synced at all.
// FIXME: 1000% correct way is to switch PG state only after copies_to_delete_after_sync.
// But it's much more complicated.
unstable_write_count += autosync_writes;
autosync_copies_to_delete = 2;
}
if (changed && report) if (changed && report)
{ {
report_pg_state(pg); report_pg_state(pg);

View File

@ -9,10 +9,6 @@ void osd_t::autosync()
{ {
if (immediate_commit != IMMEDIATE_ALL && !autosync_op) if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
{ {
if (autosync_copies_to_delete > 0)
{
autosync_copies_to_delete--;
}
autosync_op = new osd_op_t(); autosync_op = new osd_op_t();
autosync_op->op_type = OSD_OP_IN; autosync_op->op_type = OSD_OP_IN;
autosync_op->peer_fd = SELF_FD; autosync_op->peer_fd = SELF_FD;
@ -33,11 +29,6 @@ void osd_t::autosync()
} }
delete autosync_op; delete autosync_op;
autosync_op = NULL; autosync_op = NULL;
if (autosync_copies_to_delete > 0)
{
// Trigger the second "copies_to_delete" autosync
autosync();
}
}; };
exec_op(autosync_op); exec_op(autosync_op);
} }

View File

@ -213,15 +213,6 @@ resume_8:
{ {
goto resume_6; goto resume_6;
} }
if (immediate_commit == IMMEDIATE_NONE)
{
// Mark OSDs as dirty because deletions have to be synced too!
for (int i = 0; i < op_data->copies_to_delete_count; i++)
{
auto & chunk = op_data->copies_to_delete[i];
this->dirty_osds.insert(chunk.osd_num);
}
}
} }
for (int i = 0; i < op_data->dirty_pg_count; i++) for (int i = 0; i < op_data->dirty_pg_count; i++)
{ {
@ -236,7 +227,7 @@ resume_8:
start_pg_peering(pg); start_pg_peering(pg);
} }
} }
// FIXME: Free those in the destructor (not here)? // FIXME: Free those in the destructor?
free(op_data->dirty_pgs); free(op_data->dirty_pgs);
op_data->dirty_pgs = NULL; op_data->dirty_pgs = NULL;
op_data->dirty_osds = NULL; op_data->dirty_osds = NULL;

View File

@ -7,12 +7,6 @@
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg) bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
{ {
osd_primary_op_data_t *op_data = cur_op->op_data; osd_primary_op_data_t *op_data = cur_op->op_data;
// First check if PG is not active anymore
if (!(pg.state & PG_ACTIVE))
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
return false;
}
// Check if actions are pending for this object // Check if actions are pending for this object
auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){ auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
.oid = op_data->oid, .oid = op_data->oid,

View File

@ -65,7 +65,7 @@ std::string addr_to_string(const sockaddr_storage &addr)
return std::string(peer_str)+":"+std::to_string(port); return std::string(peer_str)+":"+std::to_string(port);
} }
bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits) static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
{ {
if (bits == 0) if (bits == 0)
{ {
@ -75,7 +75,7 @@ bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
return !((addr.s_addr ^ net.s_addr) & htonl(0xFFFFFFFFu << (32 - bits))); return !((addr.s_addr ^ net.s_addr) & htonl(0xFFFFFFFFu << (32 - bits)));
} }
bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits) static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
{ {
const uint32_t *a = address.s6_addr32; const uint32_t *a = address.s6_addr32;
const uint32_t *n = network.s6_addr32; const uint32_t *n = network.s6_addr32;
@ -93,49 +93,47 @@ bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
return true; return true;
} }
addr_mask_t cidr_parse(std::string mask) struct addr_mask_t
{ {
unsigned bits = 255; sa_family_t family;
int p = mask.find('/');
if (p != std::string::npos)
{
char null_byte = 0;
if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
throw std::runtime_error("Invalid IP address mask: " + mask);
mask = mask.substr(0, p);
}
in_addr ipv4; in_addr ipv4;
in6_addr ipv6; in6_addr ipv6;
if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1) uint8_t bits;
{ };
if (bits == 255)
bits = 32;
if (bits > 32)
throw std::runtime_error("Invalid IP address mask: " + mask);
return (addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)(bits ? bits : 32) };
}
else if (inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
{
if (bits == 255)
bits = 128;
return (addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits };
}
else
{
throw std::runtime_error("Invalid IP address mask: " + mask);
}
}
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool include_v6) std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool include_v6)
{ {
std::vector<addr_mask_t> masks; std::vector<addr_mask_t> masks;
for (auto mask: mask_cfg) for (auto mask: mask_cfg)
{ {
masks.push_back(cidr_parse(mask)); unsigned bits = 0;
if (masks[masks.size()-1].family == AF_INET6) int p = mask.find('/');
if (p != std::string::npos)
{ {
// Auto-enable IPv6 addresses char null_byte = 0;
include_v6 = true; if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
mask = mask.substr(0, p);
}
in_addr ipv4;
in6_addr ipv6;
if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
{
if (bits > 32)
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
masks.push_back((addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)bits });
}
else if (include_v6 && inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
{
masks.push_back((addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits });
}
else
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
} }
} }
std::set<std::string> addresses; std::set<std::string> addresses;

View File

@ -1,22 +1,10 @@
#pragma once #pragma once
#include <netinet/in.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <string> #include <string>
#include <vector> #include <vector>
// An IP network mask: an address plus a prefix length, tagged with its address family.
// Only one of ipv4/ipv6 is meaningful, selected by `family`.
struct addr_mask_t
{
// AF_INET or AF_INET6 - tells which of the two address fields below is filled
sa_family_t family;
// Network address when family == AF_INET
in_addr ipv4;
// Network address when family == AF_INET6
in6_addr ipv6;
// Prefix length in bits
// NOTE(review): presumably 0..32 for IPv4 and 0..128 for IPv6 - confirm against the parser
uint8_t bits;
};
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr); bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr);
std::string addr_to_string(const sockaddr_storage &addr); std::string addr_to_string(const sockaddr_storage &addr);
addr_mask_t cidr_parse(std::string mask);
bool cidr_match(const in_addr &address, const in_addr &network, uint8_t bits);
bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits);
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false); std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port); int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port);

View File

@ -62,7 +62,7 @@ int timerfd_manager_t::set_timer_us(uint64_t micros, bool repeat, std::function<
.callback = callback, .callback = callback,
}); });
inc_timer(timers[timers.size()-1]); inc_timer(timers[timers.size()-1]);
set_nearest(false); set_nearest();
return timer_id; return timer_id;
} }
@ -82,13 +82,13 @@ void timerfd_manager_t::clear_timer(int timer_id)
{ {
nearest--; nearest--;
} }
set_nearest(false); set_nearest();
break; break;
} }
} }
} }
void timerfd_manager_t::set_nearest(bool trigger_inline) void timerfd_manager_t::set_nearest()
{ {
if (onstack > 0) if (onstack > 0)
{ {
@ -134,13 +134,10 @@ again:
} }
if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0) if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0)
{ {
// It already happened - set minimal timeout // It already happened
if (trigger_inline) // FIXME: Postpone to setImmediate/BH to avoid reenterability problems
{ trigger_nearest();
trigger_nearest(); goto again;
goto again;
}
exp.it_value = { .tv_sec = 0, .tv_nsec = 1 };
} }
if (timerfd_settime(timerfd, 0, &exp, NULL)) if (timerfd_settime(timerfd, 0, &exp, NULL))
{ {
@ -160,7 +157,7 @@ void timerfd_manager_t::handle_readable()
trigger_nearest(); trigger_nearest();
} }
wait_state = 0; wait_state = 0;
set_nearest(true); set_nearest();
} }
void timerfd_manager_t::trigger_nearest() void timerfd_manager_t::trigger_nearest()

View File

@ -26,7 +26,7 @@ class timerfd_manager_t
std::vector<timerfd_timer_t> timers; std::vector<timerfd_timer_t> timers;
void inc_timer(timerfd_timer_t & t); void inc_timer(timerfd_timer_t & t);
void set_nearest(bool trigger_inline); void set_nearest();
void trigger_nearest(); void trigger_nearest();
void handle_readable(); void handle_readable();
public: public:

View File

@ -68,9 +68,6 @@ TEST_NAME=csum_4k_dmj OSD_ARGS="--data_csum_type crc32c --inmemory_metadata fal
TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
./test_resize.sh
./test_resize_auto.sh
./test_snapshot_pool2.sh ./test_snapshot_pool2.sh
./test_osd_tags.sh ./test_osd_tags.sh

View File

@ -3,7 +3,6 @@
PG_COUNT=${PG_COUNT:-32} PG_COUNT=${PG_COUNT:-32}
. `dirname $0`/run_3osds.sh . `dirname $0`/run_3osds.sh
check_qemu
LD_PRELOAD="build/src/client/libfio_vitastor.so" \ LD_PRELOAD="build/src/client/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bs=4M -direct=1 -iodepth=4 \ fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bs=4M -direct=1 -iodepth=4 \
@ -27,22 +26,22 @@ for i in $(seq 1 $OSD_COUNT); do
offsets=$(build/src/disk_tool/vitastor-disk simple-offsets --format json ./testdata/bin/test_osd$i.bin) offsets=$(build/src/disk_tool/vitastor-disk simple-offsets --format json ./testdata/bin/test_osd$i.bin)
meta_offset=$(echo $offsets | jq -r .meta_offset) meta_offset=$(echo $offsets | jq -r .meta_offset)
data_offset=$(echo $offsets | jq -r .data_offset) data_offset=$(echo $offsets | jq -r .data_offset)
build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json build/src/disk_tool/vitastor-disk dump-journal --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json
build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json build/src/disk_tool/vitastor-disk dump-meta ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json
build/src/disk_tool/vitastor-disk raw-resize --io cached \ build/src/disk_tool/vitastor-disk resize \
$(build/src/disk_tool/vitastor-disk simple-offsets --format options ./testdata/bin/test_osd$i.bin 2>/dev/null) \ $(build/src/disk_tool/vitastor-disk simple-offsets --format options ./testdata/bin/test_osd$i.bin 2>/dev/null) \
--new_meta_offset 0 \ --new_meta_offset 0 \
--new_meta_len $((1024*1024)) \ --new_meta_len $((1024*1024)) \
--new_journal_offset $((1024*1024)) \ --new_journal_offset $((1024*1024)) \
--new_data_offset $((128*1024*1024+32768)) --new_data_offset $((128*1024*1024))
build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json build/src/disk_tool/vitastor-disk dump-journal --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json
build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 0 $((1024*1024)) >./testdata/meta_after_resize.json build/src/disk_tool/vitastor-disk dump-meta ./testdata/bin/test_osd$i.bin 4096 0 $((1024*1024)) >./testdata/meta_after_resize.json
if ! (cat ./testdata/meta_before_resize.json ./testdata/meta_after_resize.json | \ if ! (cat ./testdata/meta_before_resize.json ./testdata/meta_after_resize.json | \
jq -e -s 'map([ .entries[] | del(.block) ] | sort_by(.pool, .inode, .stripe)) | .[0] == .[1] and (.[0] | length) > 1000'); then jq -e -s 'map([ .entries[] | del(.block) ] | sort_by(.pool, .inode, .stripe)) | .[0] == .[1] and (.[0] | length) > 1000'); then
format_error "OSD $i metadata corrupted after resizing" format_error "OSD $i metadata corrupted after resizing"
fi fi
if ! (cat ./testdata/journal_before_resize.json ./testdata/journal_after_resize.json | \ if ! (cat ./testdata/journal_before_resize.json ./testdata/journal_after_resize.json | \
jq -e -s 'map([ .[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then jq -e -s 'map([ .[].entries[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then
format_error "OSD $i journal corrupted after resizing" format_error "OSD $i journal corrupted after resizing"
fi fi
done done
@ -54,7 +53,7 @@ for i in $(seq 1 $OSD_COUNT); do
--data_device ./testdata/bin/test_osd$i.bin \ --data_device ./testdata/bin/test_osd$i.bin \
--meta_offset 0 \ --meta_offset 0 \
--journal_offset $((1024*1024)) \ --journal_offset $((1024*1024)) \
--data_offset $((128*1024*1024+32768)) >>./testdata/osd$i.log 2>&1 & --data_offset $((128*1024*1024)) >>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$! eval OSD${i}_PID=$!
done done

View File

@ -1,94 +0,0 @@
#!/bin/bash -ex
# Test of `vitastor-disk resize` (run through the vitastor-disk-test symlink) on loop devices.
# Steps: prepare a hybrid data+fast device layout, write a synthetic journal and synthetic
# metadata, then after every resize dump them back and compare against the expected JSON.
# The script runs with -e, so any failing `diff` aborts the test.
# NOTE(review): ANTIETCD is consumed by common.sh; presumably it selects antietcd instead of etcd - confirm
ANTIETCD=1
. `dirname $0`/common.sh
# Create the vitastor-disk-test symlink to vitastor-disk if it does not exist yet
# NOTE(review): presumably the tool behaves differently when invoked under the "-test" name - confirm
[[ -e build/src/disk_tool/vitastor-disk-test ]] || ln -s vitastor-disk build/src/disk_tool/vitastor-disk-test
# Writing 1 byte at offset 100 GiB - 1 makes the backing file exactly 100 GiB long
dd if=/dev/zero of=./testdata/bin/test_osd1.bin bs=1 count=1 seek=$((100*1024*1024*1024-1))
# Attach the file to the first free loop device; --show prints the device name
LOOP1=$(sudo losetup --show -f ./testdata/bin/test_osd1.bin)
# The double-quoted part is expanded right now, when the trap is set:
# $(jobs -p) and $LOOP1 are substituted at this point, not at exit time
trap "kill -9 $(jobs -p) || true; sudo losetup -d $LOOP1"' || true' EXIT
# Same for a 1 GiB file which is used as the fast device
dd if=/dev/zero of=./testdata/bin/test_meta.bin bs=1 count=1 seek=$((1024*1024*1024-1))
LOOP2=$(sudo losetup --show -f ./testdata/bin/test_meta.bin)
# Replace the EXIT trap so that both loop devices are detached
trap "kill -9 $(jobs -p) || true; sudo losetup -d $LOOP1 $LOOP2"' || true' EXIT
# also test prepare --hybrid :)
# non-vitastor random type UUID to prevent udev activation
# Mount devtmpfs on /dev unless `mount` already lists it
# NOTE(review): presumably required for the loop partition nodes (p1, p2) to appear - confirm
mount | grep '/dev type devtmpfs' || sudo mount udev /dev/ -t devtmpfs
sudo build/src/disk_tool/vitastor-disk-test prepare --no_init 1 --meta_reserve 1x,1M \
--block_size 131072 --osd_num 987654 --part_type_uuid 0df42ae0-3695-4395-a957-7d5ff3645c56 \
--hybrid --fast-devices $LOOP2 $LOOP1
# write almost empty journal
# (a "start" entry and a single 128 KiB big_write_instant entry, generated as JSON by node)
node <<EOF > ./testdata/journal.json
console.log(JSON.stringify([
{"type":"start","start":"0x1000"},
{"type":"big_write_instant","inode":"0x1000000000001","stripe":"0xc60000","ver":"10","offset":0,"len":131072,"loc":"0x18ffdc0000","bitmap":"ffffffff"}
]));
EOF
# Round trip: write the journal, dump it back without the crc32 fields and
# compare it with the input where every entry additionally has "valid": true
sudo build/src/disk_tool/vitastor-disk write-journal ${LOOP1}p1 < ./testdata/journal.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
diff ./testdata/j1.json ./testdata/j2.json
# write fake metadata items in the end
# BLOCK_COUNT is the number of whole 128 KiB blocks in the data partition size minus 4096 bytes
# NOTE(review): the 4096 is presumably the size of a superblock/header - confirm
DATA_DEV_SIZE=$(sudo blockdev --getsize64 ${LOOP1}p1)
BLOCK_COUNT=$(((DATA_DEV_SIZE-4096)/128/1024))
# The heredoc delimiter is unquoted, so the shell substitutes $BLOCK_COUNT before node runs.
# The generated metadata has 100 entries which occupy the last 100 blocks (BLOCK_COUNT-100 .. BLOCK_COUNT-1)
node <<EOF > ./testdata/meta.json
console.log(JSON.stringify({
version: "0.9",
meta_block_size: 4096,
data_block_size: 131072,
bitmap_granularity: 4096,
data_csum_type: "none",
csum_block_size: 0,
entries: [ ...new Array(100).keys() ].map(i => ({
block: ($BLOCK_COUNT-100)+i,
pool: 1,
inode: "0x1",
stripe: "0x"+Number(i*0x20000).toString(16),
version: 10,
bitmap: "ffffffff",
ext_bitmap: "ffffffff",
})),
}));
EOF
# also test write & dump
# (both JSON files are normalized with jq -S so that key order does not matter)
sudo build/src/disk_tool/vitastor-disk write-meta ${LOOP1}p1 < ./testdata/meta.json
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 > ./testdata/compare.json
jq -S < ./testdata/meta.json > ./testdata/1.json
jq -S < ./testdata/compare.json > ./testdata/2.json
diff ./testdata/1.json ./testdata/2.json
# move journal & meta back, data will become smaller; end indexes should be shifted by -1251
# Expected metadata: every block number reduced by 1251.
# Expected journal: big_write_instant "loc" becomes 0x18f6160000, which is the original
# 0x18ffdc0000 minus 0x9c60000 = 1251 * 131072 bytes, i.e. the same shift of 1251 blocks
sudo build/src/disk_tool/vitastor-disk-test resize --move-journal '' --move-meta '' ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
jq -S '. + {"entries": [ .entries[] | (. + { "block": (.block-1251) }) ]}' < ./testdata/meta.json > ./testdata/1.json
diff ./testdata/1.json ./testdata/2.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
jq -S '[ (.[] + {"valid":true}) | (if .type == "big_write_instant" then . + {"loc":"0x18f6160000"} else . end) ]' < ./testdata/journal.json > ./testdata/j1.json
diff ./testdata/j1.json ./testdata/j2.json
# move journal & meta out, data will become larger; end indexes should be shifted back by +1251
# Expected metadata: identical to the originally written meta.json
sudo build/src/disk_tool/vitastor-disk-test resize --move-journal ${LOOP2}p1 --move-meta ${LOOP2}p2 ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
jq -S < ./testdata/meta.json > ./testdata/1.json
diff ./testdata/1.json ./testdata/2.json
# NOTE(review): j1.json and j2.json are regenerated here but never compared -
# there is no `diff` after these two commands; confirm whether that is intended
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
# reduce data device size by exactly 128k * 99 (occupied blocks); exactly 1 should be left in place :)
# Expected metadata, as built by the jq expression below: the 99 entries with
# block > BLOCK_COUNT-100 are renumbered to blocks 0..98, the entry at block
# BLOCK_COUNT-100 keeps its block number and is moved to the end of the list
sudo build/src/disk_tool/vitastor-disk-test resize --data-size $((DATA_DEV_SIZE-128*1024*99)) ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
jq -S '. + {"entries": ([ .entries[] | (. + { "block": (.block | if . > '$BLOCK_COUNT'-100 then .-('$BLOCK_COUNT'-100+1) else '$BLOCK_COUNT'-100 end) }) ] | .[1:] + [ .[0] ])}' < ./testdata/meta.json > ./testdata/1.json
diff ./testdata/1.json ./testdata/2.json
# NOTE(review): same as above - the journal dumps are produced but not compared with `diff`
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
# extend data device size to maximum
# The metadata is compared with 1.json from the previous step, i.e. it is expected to stay unchanged
sudo build/src/disk_tool/vitastor-disk-test resize --data-size max ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
diff ./testdata/1.json ./testdata/2.json
# format_green is not defined in this script (presumably it comes from common.sh)
format_green OK