Compare commits
34 Commits
29498c9a9e...d0a20b3f7a
Author | SHA1 | Date |
---|---|---|
Vitaliy Filippov | d0a20b3f7a | |
Vitaliy Filippov | 7de38250ad | |
Vitaliy Filippov | 9c59d30e83 | |
Vitaliy Filippov | 5db02cdf6e | |
Vitaliy Filippov | 8202ee9d74 | |
Vitaliy Filippov | 5864bd067c | |
Vitaliy Filippov | c312557ace | |
Vitaliy Filippov | 5ce20116d8 | |
Vitaliy Filippov | be66791e59 | |
Vitaliy Filippov | 141cec2383 | |
Vitaliy Filippov | 1ce4b1b417 | |
Vitaliy Filippov | ebf24bac9a | |
Vitaliy Filippov | edd9051f81 | |
Vitaliy Filippov | 662ca86dc0 | |
Vitaliy Filippov | a1ca573168 | |
Vitaliy Filippov | f69f801ffb | |
Vitaliy Filippov | af92cbdfcc | |
Vitaliy Filippov | a775db10cc | |
Vitaliy Filippov | eafce26049 | |
Vitaliy Filippov | 625c74294f | |
Vitaliy Filippov | ef8c21ad6f | |
Vitaliy Filippov | 2bb8e8999e | |
Vitaliy Filippov | c2e7c28672 | |
Vitaliy Filippov | bd22beefb5 | |
Vitaliy Filippov | e7038ab99c | |
Vitaliy Filippov | b6f75ebcfd | |
Vitaliy Filippov | 9def199981 | |
Vitaliy Filippov | c72e8e649e | |
Vitaliy Filippov | 8bdb3e8786 | |
Vitaliy Filippov | a87e236c70 | |
Vitaliy Filippov | 16f67cf6f1 | |
Vitaliy Filippov | 56de4a520d | |
Vitaliy Filippov | adca162278 | |
Vitaliy Filippov | 490b314d72 |
@@ -22,7 +22,7 @@ RUN apt-get update
 RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
     liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
 RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
-RUN apt-get -y install jq lp-solve sudo nfs-common
+RUN apt-get update && apt-get -y install jq lp-solve sudo nfs-common fdisk parted
 RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`

 RUN set -ex; \
@@ -828,6 +828,42 @@ jobs:
         echo ""
       done

+  test_resize:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 3
+      run: /root/vitastor/tests/test_resize.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
+  test_resize_auto:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 3
+      run: /root/vitastor/tests/test_resize_auto.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
   test_snapshot_pool2:
     runs-on: ubuntu-latest
     needs: build
@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

 project(vitastor)

-set(VITASTOR_VERSION "1.9.2")
+set(VITASTOR_VERSION "1.9.3")

 add_subdirectory(src)
@@ -1,4 +1,4 @@
-VITASTOR_VERSION ?= v1.9.2
+VITASTOR_VERSION ?= v1.9.3

 all: build push

@@ -49,7 +49,7 @@ spec:
 capabilities:
   add: ["SYS_ADMIN"]
 allowPrivilegeEscalation: true
-image: vitalif/vitastor-csi:v1.9.2
+image: vitalif/vitastor-csi:v1.9.3
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"
@@ -121,7 +121,7 @@ spec:
 privileged: true
 capabilities:
   add: ["SYS_ADMIN"]
-image: vitalif/vitastor-csi:v1.9.2
+image: vitalif/vitastor-csi:v1.9.3
 args:
 - "--node=$(NODE_ID)"
 - "--endpoint=$(CSI_ENDPOINT)"
@@ -5,7 +5,7 @@ package vitastor

 const (
     vitastorCSIDriverName = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.9.2"
+    vitastorCSIDriverVersion = "1.9.3"
 )

 // Config struct fills the parameters of request or user input
@@ -1,4 +1,4 @@
-vitastor (1.9.2-1) unstable; urgency=medium
+vitastor (1.9.3-1) unstable; urgency=medium

   * Bugfixes

@@ -6,19 +6,150 @@

 # Architecture

+- [Server-side components](#server-side-components)
 - [Basic concepts](#basic-concepts)
+- [Client-side components](#client-side-components)
+- [Additional utilities](#additional-utilities)
+- [Overall read/write process](#overall-read-write-process)
+- [Nuances of request handling](#nuances-of-request-handling)
 - [Similarities to Ceph](#similarities-to-ceph)
 - [Differences from Ceph](#differences-from-ceph)
 - [Implementation Principles](#implementation-principles)

+## Server-side components
+
+- **OSD** (Object Storage Daemon) is a process that directly works with the disk, stores data
+  and serves read/write requests. One OSD serves one disk (or one partition). OSDs talk to etcd
+  and to each other — they receive cluster state from etcd, and send read/write requests for
+  secondary copies of data to other OSDs.
+- **etcd** — clustered key/value database, used as reliable storage for configuration
+  and high-level cluster state. Etcd is the component that prevents split-brain in the cluster.
+  Data blocks are not stored in etcd, and etcd doesn't participate in the data write or read path.
+- **Monitor** — a separate node.js based daemon which monitors the cluster, calculates
+  required configuration changes and saves them to etcd, thus commanding OSDs to apply these
+  changes. The monitor also aggregates cluster statistics. OSDs don't talk to the monitor; the
+  monitor only sends data to and receives data from etcd.
+
 ## Basic concepts

-- OSD (Object Storage Daemon) is a process that stores data and serves read/write requests.
-- PG (Placement Group) is a "shard" of the cluster, group of data stored on one set of replicas.
-- Pool is a container for data that has equal redundancy scheme and placement rules.
-- Monitor is a separate daemon that watches cluster state and handles failures.
-- Failure Domain is a group of OSDs that you allow to fail. It's "host" by default.
-- Placement Tree groups OSDs in a hierarchy to later split them into Failure Domains.
+- **Pool** is a container for data that has equal redundancy scheme and disk placement rules.
+- **PG (Placement Group)** is a "shard" of the cluster, a subdivision unit that has its own
+  set of OSDs for data storage.
+- **Failure Domain** is a group of OSDs, from the simultaneous failure of which you are
+  protected by Vitastor. The default failure domain is "host" (server), but you can choose a
+  larger (for example, a rack of servers) or smaller (a single drive) failure domain
+  for every pool.
+- **Placement Tree** (similar to the Ceph CRUSH Tree) groups OSDs in a hierarchy to later
+  split them into Failure Domains.
+
+## Client-side components
+
+- **Client library** encapsulates client I/O logic. The client library connects to etcd and to all OSDs,
+  receives cluster state from etcd, and sends read and write requests directly to all OSDs. Due
+  to the symmetric distributed architecture, all data blocks (each 128 KB by default) are placed
+  on different OSDs, but clients always know where each data block is stored and connect directly
+  to the right OSD.
+
+All other client-side components are based on the client library:
+
+- **[vitastor-cli](../usage/cli.en.md)** — command-line utility for cluster management.
+  Allows you to view cluster state and manage pools and images, i.e. create, modify and remove
+  virtual disks, their snapshots and clones.
+- **[QEMU driver](../usage/qemu.en.md)** — pluggable QEMU module allowing QEMU/KVM virtual
+  machines to work with virtual Vitastor disks directly from userspace through the client library,
+  without the need to attach disks as kernel block devices. However, if you want to attach
+  disks, you can also do that with the same driver and [VDUSE](../usage/qemu.en.md#vduse).
+- **[vitastor-nbd](../usage/nbd.en.md)** — utility that allows attaching Vitastor disks as
+  kernel block devices using NBD (Network Block Device), which works more like "BUSE"
+  (Block Device In Userspace). Vitastor doesn't have Linux kernel modules for the same task
+  (at least for now). NBD is an older, non-recommended way to attach disks — you should use
+  VDUSE whenever you can.
+- **[CSI driver](../installation/kubernetes.en.md)** — driver for attaching Vitastor images
+  as Kubernetes persistent volumes. Works through VDUSE (when available) or NBD — images are
+  attached as kernel block devices and mounted into containers.
+- **Drivers for Proxmox, OpenStack and so on** — pluggable modules for the corresponding systems,
+  allowing Vitastor to be used as storage in them.
+- **[vitastor-nfs](../usage/nfs.en.md)** — NFS 3.0 server allowing export of two file system variants:
+  the first is a simplified pseudo-FS for file-based access to Vitastor block images (for non-QEMU
+  hypervisors with NFS support), the second is **VitastorFS**, a full-featured clustered POSIX FS.
+  Both variants support parallel access from multiple vitastor-nfs servers. In fact, you are
+  not required to set up separate NFS servers at all and can just use the vitastor-nfs mount command
+  on every client node — it starts the NFS server and mounts the FS locally.
+- **[fio driver](../usage/fio.en.md)** — pluggable module for the fio disk benchmarking tool for
+  running performance tests on your Vitastor cluster.
+- **vitastor-kv** — client for a key-value DB working over shared block volumes (usual
+  vitastor images). VitastorFS metadata is stored in vitastor-kv.
+
+## Additional utilities
+
+- **vitastor-disk** — a Vitastor OSD disk management tool. You can create, remove,
+  resize and move OSD partitions with it.
+
+## Overall read/write process
+
+- Vitastor stores virtual disks, also named "images" or "inodes".
+- Each image is stored in some pool. The pool specifies storage parameters such as redundancy
+  scheme (replication or EC — erasure codes, i.e. error correction codes), failure domain
+  and restrictions on OSD selection for image data placement. See [Pool configuration](../config/pool.en.md) for details.
+- Each image is split into objects/blocks of fixed size, equal to [block_size](../config/layout-cluster.en.md#block_size)
+  (128 KB by default), multiplied by the data part count for EC or by 1 for replicas. That is,
+  if a pool uses the EC 4+2 coding scheme (4 data parts + 2 parity parts), then, with the
+  default block_size, images are split into 512 KB objects.
+- Client read/write requests are split into parts at object boundaries.
+- Each object is mapped to the PG number it belongs to, by simply taking the remainder of
+  division of its offset by the PG count of the image's pool (see the illustrative sketch after this list).
+- The client reads the primary OSD for all PGs from etcd. The primary OSD for each PG is assigned
+  by the monitor during cluster operation, along with the full PG OSD set.
+- If not already connected, the client connects to the primary OSDs of all PGs involved in a
+  read/write request and sends parts of the request to them.
+- If a primary OSD is unavailable, the client retries connection attempts indefinitely,
+  either until it becomes available or until the monitor assigns another OSD as primary
+  for that PG.
+- The client also retries requests if the primary OSD replies with error code EPIPE, meaning
+  that the PG is inactive at this OSD at the moment - for example, when the primary OSD
+  is switched, or if the primary OSD itself loses connection to replicas during request
+  handling.
+- The primary OSD determines where the parts of the object are stored. By default, all objects
+  are assumed to be stored at the target OSD set of a PG, but some of them may be present
+  at a different OSD set if they are degraded or moved, or if the data rebalancing process
+  is active. The OSD doesn't make any network requests for this: it calculates the locations
+  of all objects during PG activation and stores them in memory.
+- The primary OSD handles the request locally when it can - for example, when it's a read
+  from a replicated pool or a read from an EC pool involving only one data part
+  stored on the OSD's local disk.
+- When a request requires reads or writes to additional OSDs, the primary OSD uses already
+  established connections to secondary OSDs of the PG to execute these requests. This happens
+  in parallel to local disk operations. All such connections are guaranteed to be already
+  established when the PG is active, and if any of them is dropped, the PG is restarted and
+  all current read/write operations to it fail with an EPIPE error and are retried by clients.
+- After completing all secondary read/write requests, the primary OSD sends the response to
+  the client.
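
A minimal illustrative sketch of the offset-to-PG mapping described in the list above. This is not Vitastor source code: the function names and pool parameters are assumptions made for the example, and details such as PG numbering in the real implementation may differ.

```python
# Illustrative sketch of the object/PG mapping described above (not Vitastor code).
def object_size(block_size: int, data_parts: int) -> int:
    # block_size (128 KB by default) multiplied by the number of data parts for EC,
    # or by 1 for replicated pools
    return block_size * data_parts

def map_offset(offset: int, block_size: int, data_parts: int, pg_count: int):
    obj_size = object_size(block_size, data_parts)
    object_index = offset // obj_size       # which object of the image the byte falls into
    pg_number = object_index % pg_count     # "remainder of division ... by PG count"
    return object_index * obj_size, pg_number

# Example: EC 4+2 pool with the default 128 KB block_size => 512 KB objects
print(map_offset(offset=3 * 1024 * 1024, block_size=128 * 1024, data_parts=4, pg_count=256))
```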
+
+### Nuances of request handling
+
+- If a pool uses erasure codes and some of the OSDs are unavailable, primary OSDs recover
+  data from the remaining parts during reads.
+- Each object has a version number. During a write, the primary OSD first determines the current
+  version of the object. As the primary OSD usually stores the object or its part itself, most
+  of the time the version is read from the memory of the OSD itself. However, if the primary OSD
+  doesn't contain parts of the object, it requests the version number from a secondary OSD
+  which has that part. Such a request still doesn't involve reading from the disk though,
+  because object metadata, including the version number, is always stored in OSD memory.
+- If a pool uses erasure codes, partial writes of an object require reading other parts of
+  it from secondary OSDs or from the local disk of the primary OSD itself. This is called
+  the "read-modify-write" process.
+- If a pool uses erasure codes, a two-phase write process is used to get rid of the Write Hole
+  problem: first a new version of the object parts is written to all secondary OSDs without
+  removing the previous version, and then, after receiving successful write confirmations
+  from all OSDs, the new version is committed and the old one is allowed to be removed
+  (see the sketch below).
+- If a pool doesn't use immediate_commit mode, write requests sent by clients aren't
+  treated as committed to physical media instantly. Clients have to send a separate type of
+  request (SYNC) to commit changes, and until it is sent, new versions of data are
+  allowed to be lost if some OSDs die. Thus, when immediate_commit is disabled, clients
+  store copies of all write requests in memory and repeat them from there when the
+  connection to the primary OSD is lost. This in-memory copy is removed after a successful
+  SYNC, and to prevent excessive memory usage, clients also do an automatic SYNC
+  every [client_dirty_limit](../config/network.en.md#client_dirty_limit) written bytes.
+
 ## Similarities to Ceph

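
The two-phase write mentioned in the nuances above can be sketched as follows. This is an illustration only, not Vitastor source code: the method names (write_unstable, commit) and the synchronous structure are assumptions made for readability; the real OSD code is asynchronous C++ and handles failures by restarting the PG.

```python
# Illustrative sketch of the order of operations for the two-phase EC write (not Vitastor code).
def two_phase_write(secondary_osds, object_id, new_version, parts):
    # Phase 1: write the new version of every part without deleting the previous version.
    for osd, part in zip(secondary_osds, parts):
        if not osd.write_unstable(object_id, new_version, part):
            # In the real system a failed write drops the connection, the PG restarts,
            # and the client retries the whole request after receiving EPIPE.
            raise IOError("secondary write failed")
    # Phase 2: only after all writes are confirmed, commit the new version;
    # the old version may now be removed.
    for osd in secondary_osds:
        osd.commit(object_id, new_version)
```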
@@ -11,6 +11,7 @@
 - [Серверные компоненты](#серверные-компоненты)
 - [Базовые понятия](#базовые-понятия)
 - [Клиентские компоненты](#клиентские-компоненты)
+- [Дополнительные утилиты](#дополнительные-утилиты)
 - [Общий процесс записи и чтения](#общий-процесс-записи-и-чтения)
 - [Особенности обработки запросов](#особенности-обработки-запросов)
 - [Схожесть с Ceph](#схожесть-с-ceph)
@@ -34,8 +35,9 @@
 - **Пул (Pool)** — контейнер для данных, имеющих одну и ту же схему избыточности и правила распределения по OSD.
 - **PG (Placement Group)** — "шард", единица деления пулов в кластере, которой назначается свой набор
   OSD для хранения данных (копий или частей объектов).
-- **Домен отказа (Failure Domain)** — группа OSD, одновременное падение которых рассматривается
-  как вероятное. По умолчанию это "host" (сервер).
+- **Домен отказа (Failure Domain)** — группа OSD, от одновременного падения которых должен защищать
+  Vitastor. По умолчанию домен отказа — "host" (сервер), но вы можете установить для пула как больший
+  домен отказа (например, стойку серверов), так и меньший (например, отдельный диск).
 - **Дерево распределения** (Placement Tree, в Ceph CRUSH Tree) — иерархическая группировка OSD
   в узлы, которые далее можно использовать как домены отказа.

@@ -49,25 +51,39 @@

 На базе клиентской библиотеки реализованы все остальные клиенты:

-- **vitastor-cli** — утилита командной строки для управления кластером. В данный момент позволяет
-  просматривать общее состояние кластера и управлять образами — т.е. создавать, менять и удалять
-  виртуальные диски, их снимки и клоны.
-- **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
-  с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
-  библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
-  позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
-- **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
-  с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
-  (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
-  (по крайней мере, пока).
-- **CSI драйвер** — драйвер для подключения Vitastor-образов в виде персистентных томов (PV) Kubernetes.
-  Работает через vitastor-nbd — образы отражаются в виде блочных устройств и монтируются
-  в контейнеры.
+- **[vitastor-cli](../usage/cli.ru.md)** — утилита командной строки для управления кластером.
+  Позволяет просматривать общее состояние кластера, управлять пулами и образами — то есть
+  создавать, менять и удалять виртуальные диски, их снимки и клоны.
+- **[Драйвер QEMU](../usage/qemu.ru.md)** — подключаемый модуль QEMU, позволяющий QEMU/KVM
+  виртуальным машинам работать с виртуальными дисками Vitastor напрямую из пространства пользователя
+  с помощью клиентской библиотеки, без необходимости подключения дисков в виде блочных устройств
+  Linux. Если, однако, вы хотите подключать диски в виде блочных устройств, то вы тоже можете
+  сделать это с помощью того же самого драйвера и [VDUSE](../usage/qemu.ru.md#vduse).
+- **[vitastor-nbd](../usage/nbd.ru.md)** — утилита, позволяющая монтировать образы Vitastor
+  в виде блочных устройств с помощью NBD (Network Block Device), на самом деле скорее работающего
+  как "BUSE" (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в
+  Vitastor нет (по крайней мере, пока). NBD — более старый и нерекомендуемый способ подключения
+  дисков — вам следует использовать VDUSE всегда, когда это возможно.
+- **[CSI драйвер](../installation/kubernetes.ru.md)** — драйвер для подключения Vitastor-образов
+  в виде персистентных томов (PV) Kubernetes. Работает через VDUSE (если доступно) или через
+  NBD — образы отражаются в виде блочных устройств и монтируются в контейнеры.
 - **Драйвера Proxmox, OpenStack и т.п.** — подключаемые модули для соответствующих систем,
   позволяющие использовать Vitastor как хранилище в оных.
-- **vitastor-nfs** — утилита, предоставляющая файловый доступ к образам в кластере Vitastor
-  по протоколу NFS 3.0. Предназначена для гипервизоров, не основанных на QEMU и Linux, но при
-  этом поддерживающих NFS.
+- **[vitastor-nfs](../usage/nfs.ru.md)** — NFS 3.0 сервер, предоставляющий два варианта файловой системы:
+  первая — упрощённая для файлового доступа к блочным образам (для не-QEMU гипервизоров, поддерживающих NFS),
+  вторая — VitastorFS, полноценная кластерная POSIX ФС. Оба варианта поддерживают параллельный
+  доступ с нескольких vitastor-nfs серверов. На самом деле можно вообще не выделять
+  отдельные NFS-серверы, а вместо этого использовать команду vitastor-nfs mount, запускающую
+  NFS-сервер прямо на клиентской машине и монтирующую ФС локально.
+- **[Драйвер fio](../usage/fio.ru.md)** — подключаемый модуль для утилиты тестирования
+  производительности дисков fio, позволяющий тестировать Vitastor-кластеры.
+- **vitastor-kv** — клиент для key-value базы данных, работающей поверх разделяемого блочного
+  образа (обычного блочного образа vitastor). Метаданные VitastorFS хранятся именно в vitastor-kv.
+
+## Дополнительные утилиты
+
+- **vitastor-disk** — утилита для разметки дисков под Vitastor OSD. С её помощью можно
+  создавать, удалять, менять размеры или перемещать разделы OSD.

 ## Общий процесс записи и чтения

@@ -98,16 +114,22 @@
   находиться на других OSD, если эти объекты деградированы или перемещены, или идёт процесс
   ребаланса. Запросы для проверки по сети не отправляются, информация о местоположении всех
   объектов рассчитывается первичным OSD при активации PG и хранится в памяти.
-- Первичный OSD соединяется (если ещё не соединён) с вторичными OSD, на которых располагаются
-  части объекта, и отправляет им запросы чтения/записи, а также читает/пишет из/в своё локальное
-  хранилище, если сам входит в набор.
+- Когда это возможно, первичный OSD обрабатывает запрос локально. Например, так происходит
+  при чтениях объектов из пулов с репликацией или при чтении из EC пула, затрагивающего
+  только часть, хранимую на диске самого первичного OSD.
+- Когда запрос требует записи или чтения с вторичных OSD, первичный OSD использует заранее
+  установленные соединения с ними для выполнения этих запросов. Это происходит параллельно
+  локальным операциям чтения/записи с диска самого OSD. Так как соединения к вторичным OSD PG
+  устанавливаются при её запуске, то они уже гарантированно установлены, когда PG активна,
+  и если любое из этих соединений отключается, PG перезапускается, а все текущие запросы чтения
+  и записи в неё завершаются с ошибкой EPIPE, после чего повторяются клиентами.
 - После завершения всех вторичных операций чтения/записи первичный OSD отправляет ответ клиенту.

 ### Особенности обработки запросов

 - Если в пуле используются коды коррекции ошибок и при этом часть OSD недоступна, первичный
   OSD при чтении восстанавливает данные из оставшихся частей.
-- Каждый объект имеет номер версии. При записи объекта первичный OSD сначала читает из номер
+- Каждый объект имеет номер версии. При записи объекта первичный OSD сначала получает номер
   версии объекта. Так как первичный OSD обычно сам хранит копию или часть объекта, номер
   версии обычно читается из памяти самого OSD. Однако, если ни одна часть обновляемого объекта
   не находится на первичном OSD, для получения номера версии он обращается к одному из вторичных
@@ -115,20 +137,20 @@
   так как метаданные объектов, включая номер версии, все OSD хранят в памяти.
 - Если в пуле используются коды коррекции ошибок, перед частичной записью объекта для вычисления
   чётности зачастую требуется чтение частей объекта с вторичных OSD или с локального диска
-  самого первичного OSD.
-- Также, если в пуле используются коды коррекции ошибок, для закрытия Write Hole применяется
+  самого первичного OSD. Это называется процессом "чтение-модификация-запись" (read-modify-write).
+- Если в пуле используются коды коррекции ошибок, для закрытия Write Hole применяется
   двухфазный алгоритм записи: сначала на все вторичные OSD записывается новая версия частей
   объекта, но при этом старая версия не удаляется, а потом, после получения подтверждения
   успешной записи от всех вторичных OSD, новая версия фиксируется и разрешается удаление старой.
-- Если в кластере не включён режим immediate_commit, то запросы записи, отправляемые клиентами,
+- Если в пуле не включён режим immediate_commit, то запросы записи, отправляемые клиентами,
   не считаются зафиксированными на физических накопителях сразу. Для фиксации данных клиенты
   должны отдельно отправлять запросы SYNC (отдельный от чтения и записи вид запроса),
   а пока такой запрос не отправлен, считается, что записанные данные могут исчезнуть,
   если соответствующий OSD упадёт. Поэтому, когда режим immediate_commit отключён, все
   запросы записи клиенты копируют в памяти и при потере соединения и повторном соединении
-  с OSD повторяют из памяти. Скопированные в память данные удаляются при успешном fsync,
+  с OSD повторяют из памяти. Скопированные в память данные удаляются при успешном SYNC,
   а чтобы хранение этих данных не приводило к чрезмерному потреблению памяти, клиенты
-  автоматически выполняют fsync каждые [client_dirty_limit](../config/network.ru.md#client_dirty_limit)
+  автоматически выполняют SYNC каждые [client_dirty_limit](../config/network.ru.md#client_dirty_limit)
   записанных байт.

 ## Схожесть с Ceph
@@ -171,7 +171,14 @@ to make them use the new version of the client library.

 ### 1.7.x to 1.8.0

-After upgrading version <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients
+It's recommended to upgrade from version <= 1.7.x to version >= 1.8.0 with full downtime,
+i.e. you should first stop clients and then the cluster (OSDs and monitor), because 1.8.0
+includes a fix for an etcd event stream inconsistency which could lead to "incomplete" objects
+appearing in EC pools, and in rare cases, probably, even to data corruption during mass OSD
+restarts. It doesn't mean that you WILL hit this problem if you upgrade without full downtime,
+but it's better to secure yourself against it.
+
+Also, if you upgrade from version <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients
 (VMs and so on), otherwise they will hang when monitor clears old PG configuration key,
 which happens 24 hours after upgrade.

@@ -168,7 +168,14 @@ done

 ### 1.7.x -> 1.8.0

-После обновления с версий <= 1.7.x до версий >= 1.8.0, НО <= 1.9.0: перезапустите всех
+Обновляться с версий <= 1.7.x до версий >= 1.8.0 рекомендуется с полной остановкой
+сначала клиентов, а затем кластера, так как в 1.8.0 исправлена проблема (неконсистентность
+потоков событий от etcd), способная приводить к появлению incomplete объектов в EC-пулах
+и, хоть и редко, но даже к повреждению данных при массовых перезапусках OSD. Если вы
+обновляетесь без полной остановки - это не значит, что вы обязательно столкнётесь с этой
+проблемой, но лучше подстраховаться.
+
+Также, если вы обновляетесь с версии <= 1.7.x до версии >= 1.8.0, НО <= 1.9.0: перезапустите всех
 клиентов (процессы виртуальных машин можно перезапустить путём миграции на другой сервер),
 иначе они зависнут, когда монитор удалит старый ключ конфигурации PG, что происходит через
 24 часа после обновления.
@@ -51,12 +51,16 @@ Options (automatic mode):
 --osd_per_disk <N>
         Create <N> OSDs on each disk (default 1)
 --hybrid
-        Prepare hybrid (HDD+SSD) OSDs using provided devices. SSDs will be used for
-        journals and metadata, HDDs will be used for data. Partitions for journals and
-        metadata will be created automatically. Whether disks are SSD or HDD is decided
-        by the `/sys/block/.../queue/rotational` flag. In hybrid mode, default object
-        size is 1 MB instead of 128 KB, default journal size is 1 GB instead of 32 MB,
-        and throttle_small_writes is enabled by default.
+        Prepare hybrid (HDD+SSD, NVMe+SATA, etc.) OSDs using provided devices. By default,
+        any passed SSDs will be used for journals and metadata, HDDs will be used for data,
+        but you can override this behaviour with the --fast-devices option. Journal and metadata
+        partitions will be created automatically. In the default mode, SSD and HDD disks
+        are distinguished by the `/sys/block/.../queue/rotational` flag. When HDDs are used
+        for data in hybrid mode, default block_size is 1 MB instead of 128 KB, default journal
+        size is 1 GB instead of 32 MB, and throttle_small_writes is enabled by default.
+--fast-devices /dev/nvmeX,/dev/nvmeY
+        In --hybrid mode, use these devices for journal and metadata instead of auto-detecting
+        and extracting them from the main [devices...] list.
 --disable_data_fsync auto
         Disable data device cache and fsync (1/yes/true = on, default auto)
 --disable_meta_fsync auto
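
As an illustration (hypothetical device names; assuming these are options of `vitastor-disk prepare`, as in the surrounding documentation): `vitastor-disk prepare --hybrid --fast-devices /dev/nvme0n1,/dev/nvme1n1 /dev/sda /dev/sdb /dev/sdc` would create OSDs on the three HDDs while placing their journal and metadata partitions on the two NVMe drives, instead of auto-detecting fast devices by the rotational flag.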
@@ -51,12 +51,17 @@ vitastor-disk - инструмент командной строки для уп
 --osd_per_disk <N>
         Создавать по несколько (<N>) OSD на каждом диске (по умолчанию 1)
 --hybrid
-        Инициализировать гибридные (HDD+SSD) OSD на указанных дисках. SSD будут
-        использованы для журналов и метаданных, а HDD - для данных. Разделы для журналов
-        и метаданных будут созданы автоматически. Является ли диск SSD или HDD, определяется
-        по флагу `/sys/block/.../queue/rotational`. В гибридном режиме по умолчанию
-        используется размер объекта 1 МБ вместо 128 КБ, размер журнала 1 ГБ вместо 32 МБ
-        и включённый throttle_small_writes.
+        Инициализировать гибридные (HDD+SSD, NVMe+SATA и т.п.) OSD на указанных дисках.
+        По умолчанию, SSD будут использованы для журналов и метаданных, а HDD - для данных,
+        но вы можете поменять это поведение опцией --fast-devices. Разделы для журналов
+        и метаданных будут созданы автоматически. В режиме по умолчанию SSD и HDD-диски
+        различаются по флагу `/sys/block/.../queue/rotational`. Когда в гибридном режиме
+        для данных используются HDD, по умолчанию размер блока устанавливается 1 МБ вместо
+        128 КБ, размер журнала 1 ГБ вместо 32 МБ, и throttle_small_writes включается по
+        умолчанию.
+--fast-devices /dev/nvmeX,/dev/nvmeY
+        Использовать данные диски для журналов и метаданных в гибридном режиме вместо их
+        автоопределения и извлечения из основного списка [devices...].
 --disable_data_fsync auto
         Отключать кэш и fsync-и для устройств данных. (1/yes/true = да, по умолчанию автоопределение)
 --disable_meta_fsync auto
@@ -1,6 +1,6 @@
 {
   "name": "vitastor-mon",
-  "version": "1.9.2",
+  "version": "1.9.3",
   "description": "Vitastor SDS monitor service",
   "main": "mon-main.js",
   "scripts": {
@@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VITASTOR_VERSION = '1.9.2'
+VITASTOR_VERSION = '1.9.3'

 LOG = logging.getLogger(__name__)

@@ -306,12 +306,12 @@ index e5ff653a60..884ecc79ea 100644
 +        etcd = virBufferContentAndReset(&buf);
 +    }
 +
-+    if (virJSONValueObjectCreate(&ret,
++    if (virJSONValueObjectAdd(&ret,
 +                                 "S:etcd-host", etcd,
 +                                 "S:etcd-prefix", src->query,
 +                                 "S:config-path", src->configFile,
 +                                 "s:image", src->path,
 +                                 NULL) < 0)
 +        return NULL;
 +
 +    return ret;
@@ -0,0 +1,193 @@ (new file)
Index: pve-qemu-kvm-9.0.0/block/meson.build
===================================================================
--- pve-qemu-kvm-9.0.0.orig/block/meson.build
+++ pve-qemu-kvm-9.0.0/block/meson.build
@@ -126,6 +126,7 @@ foreach m : [
 [libnfs, 'nfs', files('nfs.c')],
 [libssh, 'ssh', files('ssh.c')],
 [rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
 ]
 if m[0].found()
 module_ss = ss.source_set()
Index: pve-qemu-kvm-9.0.0/meson.build
===================================================================
--- pve-qemu-kvm-9.0.0.orig/meson.build
+++ pve-qemu-kvm-9.0.0/meson.build
@@ -1452,6 +1452,26 @@ if not get_option('rbd').auto() or have_
 endif
 endif

+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
 glusterfs = not_found
 glusterfs_ftruncate_has_stat = false
 glusterfs_iocb_has_stat = false
@@ -2254,6 +2274,7 @@ endif
 config_host_data.set('CONFIG_OPENGL', opengl.found())
 config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
 config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
 config_host_data.set('CONFIG_RDMA', rdma.found())
 config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
 config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4454,6 +4475,7 @@ summary_info += {'fdt support': fd
 summary_info += {'libcap-ng support': libcap_ng}
 summary_info += {'bpf support': libbpf}
 summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
 summary_info += {'smartcard support': cacard}
 summary_info += {'U2F support': u2f}
 summary_info += {'libusb': libusb}
Index: pve-qemu-kvm-9.0.0/meson_options.txt
===================================================================
--- pve-qemu-kvm-9.0.0.orig/meson_options.txt
+++ pve-qemu-kvm-9.0.0/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value :
 description: 'lzo compression support')
 option('rbd', type : 'feature', value : 'auto',
 description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
 option('opengl', type : 'feature', value : 'auto',
 description: 'OpenGL support')
 option('rdma', type : 'feature', value : 'auto',
Index: pve-qemu-kvm-9.0.0/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-9.0.0.orig/qapi/block-core.json
+++ pve-qemu-kvm-9.0.0/qapi/block-core.json
@@ -3481,7 +3481,7 @@
 'raw', 'rbd',
 { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
 'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
 { 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
 { 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
 { 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4591,6 +4591,28 @@
 '*server': ['InetSocketAddressBase'] } }

 ##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
 # @ReplicationMode:
 #
 # An enumeration of replication modes.
@@ -5053,6 +5075,7 @@
 'throttle': 'BlockdevOptionsThrottle',
 'vdi': 'BlockdevOptionsGenericFormat',
 'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
 'virtio-blk-vfio-pci':
 { 'type': 'BlockdevOptionsVirtioBlkVfioPci',
 'if': 'CONFIG_BLKIO' },
@@ -5498,6 +5521,20 @@
 '*encrypt' : 'RbdEncryptionCreateOptions' } }

 ##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
 # @BlockdevVmdkSubformat:
 #
 # Subformat options for VMDK images
@@ -5719,6 +5753,7 @@
 'ssh': 'BlockdevCreateOptionsSsh',
 'vdi': 'BlockdevCreateOptionsVdi',
 'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
 'vmdk': 'BlockdevCreateOptionsVmdk',
 'vpc': 'BlockdevCreateOptionsVpc'
 } }
Index: pve-qemu-kvm-9.0.0/scripts/ci/org.centos/stream/8/x86_64/configure
===================================================================
--- pve-qemu-kvm-9.0.0.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ pve-qemu-kvm-9.0.0/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -30,7 +30,7 @@
 --with-suffix="qemu-kvm" \
 --firmwarepath=/usr/share/qemu-firmware \
 --target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
 --audio-drv-list="" \
 --block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
 --with-coroutine=ucontext \
@@ -176,6 +176,7 @@
 --enable-opengl \
 --enable-pie \
 --enable-rbd \
+--enable-vitastor \
 --enable-rdma \
 --enable-seccomp \
 --enable-snappy \
Index: pve-qemu-kvm-9.0.0/scripts/meson-buildoptions.sh
===================================================================
--- pve-qemu-kvm-9.0.0.orig/scripts/meson-buildoptions.sh
+++ pve-qemu-kvm-9.0.0/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
 printf "%s\n" ' qed qed image format support'
 printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
 printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
 printf "%s\n" ' rdma Enable RDMA-based migration'
 printf "%s\n" ' replication replication support'
 printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -445,6 +446,8 @@ _meson_option_parse() {
 --disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
 --enable-rbd) printf "%s" -Drbd=enabled ;;
 --disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
 --enable-rdma) printf "%s" -Drdma=enabled ;;
 --disable-rdma) printf "%s" -Drdma=disabled ;;
 --enable-relocatable) printf "%s" -Drelocatable=true ;;
@@ -0,0 +1,172 @@ (new file)
diff --git a/block/meson.build b/block/meson.build
index f1262ec2ba..3cf3e23f16 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -114,6 +114,7 @@ foreach m : [
 [libnfs, 'nfs', files('nfs.c')],
 [libssh, 'ssh', files('ssh.c')],
 [rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
 ]
 if m[0].found()
 module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index fbda17c987..3edac22aff 100644
--- a/meson.build
+++ b/meson.build
@@ -1510,6 +1510,26 @@ if not get_option('rbd').auto() or have_block
 endif
 endif

+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
 glusterfs = not_found
 glusterfs_ftruncate_has_stat = false
 glusterfs_iocb_has_stat = false
@@ -2351,6 +2371,7 @@ endif
 config_host_data.set('CONFIG_OPENGL', opengl.found())
 config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
 config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
 config_host_data.set('CONFIG_RDMA', rdma.found())
 config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
 config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4510,6 +4531,7 @@ summary_info += {'fdt support': fdt_opt == 'internal' ? 'internal' : fdt}
 summary_info += {'libcap-ng support': libcap_ng}
 summary_info += {'bpf support': libbpf}
 summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
 summary_info += {'smartcard support': cacard}
 summary_info += {'U2F support': u2f}
 summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 0269fa0f16..4740ffdc27 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value : 'auto',
 description: 'lzo compression support')
 option('rbd', type : 'feature', value : 'auto',
 description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
 option('opengl', type : 'feature', value : 'auto',
 description: 'OpenGL support')
 option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index aa40d44f1d..bbee6a0e9c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3203,7 +3203,7 @@
 'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
 'raw', 'rbd',
 { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
 { 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
 { 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
 { 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4286,6 +4286,28 @@
 '*key-secret': 'str',
 '*server': ['InetSocketAddressBase'] } }

+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
 ##
 # @ReplicationMode:
 #
@@ -4742,6 +4764,7 @@
 'throttle': 'BlockdevOptionsThrottle',
 'vdi': 'BlockdevOptionsGenericFormat',
 'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
 'virtio-blk-vfio-pci':
 { 'type': 'BlockdevOptionsVirtioBlkVfioPci',
 'if': 'CONFIG_BLKIO' },
@@ -5183,6 +5206,20 @@
 '*cluster-size' : 'size',
 '*encrypt' : 'RbdEncryptionCreateOptions' } }

+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
 ##
 # @BlockdevVmdkSubformat:
 #
@@ -5405,6 +5442,7 @@
 'ssh': 'BlockdevCreateOptionsSsh',
 'vdi': 'BlockdevCreateOptionsVdi',
 'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
 'vmdk': 'BlockdevCreateOptionsVmdk',
 'vpc': 'BlockdevCreateOptionsVpc'
 } }
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index c97079a38c..4623f552ec 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
 printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
 printf "%s\n" ' qpl Query Processing Library support'
 printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
 printf "%s\n" ' rdma Enable RDMA-based migration'
 printf "%s\n" ' replication replication support'
 printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -444,6 +445,8 @@ _meson_option_parse() {
 --disable-qpl) printf "%s" -Dqpl=disabled ;;
 --enable-rbd) printf "%s" -Drbd=enabled ;;
 --disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
 --enable-rdma) printf "%s" -Drdma=enabled ;;
 --disable-rdma) printf "%s" -Drdma=disabled ;;
 --enable-relocatable) printf "%s" -Drelocatable=true ;;
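
For context, a hypothetical QMP `blockdev-add` call using the options declared by the `BlockdevOptionsVitastor` schema in the patches above could be built from a dictionary like the one below. The image name and etcd address are made up for the example; `node-name` is a generic QMP blockdev field, not something added by this patch.

```python
# Hypothetical 'blockdev-add' arguments; field names follow the QAPI schema added above.
blockdev_add_args = {
    "driver": "vitastor",           # driver name added to the blockdev driver list by this patch
    "node-name": "vitastor0",       # generic QMP field, chosen arbitrarily here
    "image": "testimg",             # hypothetical Vitastor image name
    "etcd-host": "127.0.0.1:2379",  # hypothetical etcd address
    "etcd-prefix": "/vitastor",     # example etcd key prefix
}
```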
@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 1.9.2
+Version: 1.9.3
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage

 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-1.9.2.el7.tar.gz
+Source0: vitastor-1.9.3.el7.tar.gz

 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel
@@ -1,11 +1,11 @@
 Name: vitastor
-Version: 1.9.2
+Version: 1.9.3
 Release: 1%{?dist}
 Summary: Vitastor, a fast software-defined clustered block storage

 License: Vitastor Network Public License 1.1
 URL: https://vitastor.io/
-Source0: vitastor-1.9.2.el8.tar.gz
+Source0: vitastor-1.9.3.el8.tar.gz

 BuildRequires: liburing-devel >= 0.6
 BuildRequires: gperftools-devel
@ -1,11 +1,11 @@
|
||||||
Name: vitastor
|
Name: vitastor
|
||||||
Version: 1.9.2
|
Version: 1.9.3
|
||||||
Release: 1%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: Vitastor, a fast software-defined clustered block storage
|
Summary: Vitastor, a fast software-defined clustered block storage
|
||||||
|
|
||||||
License: Vitastor Network Public License 1.1
|
License: Vitastor Network Public License 1.1
|
||||||
URL: https://vitastor.io/
|
URL: https://vitastor.io/
|
||||||
Source0: vitastor-1.9.2.el9.tar.gz
|
Source0: vitastor-1.9.3.el9.tar.gz
|
||||||
|
|
||||||
BuildRequires: liburing-devel >= 0.6
|
BuildRequires: liburing-devel >= 0.6
|
||||||
BuildRequires: gperftools-devel
|
BuildRequires: gperftools-devel
|
||||||
|
|
|
@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
|
||||||
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_definitions(-DVITASTOR_VERSION="1.9.2")
|
add_definitions(-DVITASTOR_VERSION="1.9.3")
|
||||||
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
|
||||||
add_link_options(-fno-omit-frame-pointer)
|
add_link_options(-fno-omit-frame-pointer)
|
||||||
if (${WITH_ASAN})
|
if (${WITH_ASAN})
|
||||||
|
|
|
@@ -176,7 +176,7 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
exit(1);
}
if (!local_ips.size())
-local_ips = getifaddr_list();
+local_ips = getifaddr_list(std::vector<std::string>(), true);
std::string check_addr;
int pos = addr.find('/');
int pos2 = addr.find(':');
@@ -62,6 +62,7 @@ struct http_co_t
inline void end() { ended = true; if (!onstack) { delete this; } }
void run_cb_and_clear();
void start_connection();
+void start_ws_connection();
void close_connection();
void next_request();
void handle_events();

@@ -112,7 +113,7 @@ http_co_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, cons
handler->keepalive = false;
handler->request = request;
handler->response_callback = response_callback;
-handler->start_connection();
+handler->start_ws_connection();
return handler;
}

@@ -282,6 +283,27 @@ void http_co_t::close_connection()
epoll_events = 0;
}

+void http_co_t::start_ws_connection()
+{
+stackin();
+start_connection();
+if (request_timeout > 0)
+{
+timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
+{
+stackin();
+if (state != HTTP_CO_WEBSOCKET)
+{
+close_connection();
+parsed = { .error = "Websocket connection timed out" };
+run_cb_and_clear();
+}
+stackout();
+});
+}
+stackout();
+}
+
void http_co_t::start_connection()
{
stackin();
@@ -121,7 +121,7 @@ void osd_messenger_t::init()
if (use_rdma)
{
rdma_context = msgr_rdma_context_t::create(
-rdma_device != "" ? rdma_device.c_str() : NULL,
+osd_networks, rdma_device != "" ? rdma_device.c_str() : NULL,
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
);
if (!rdma_context)

@@ -266,7 +266,8 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
if (!this->rdma_port_num)
this->rdma_port_num = 1;
-this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
+if (!config["rdma_gid_index"].is_null())
+this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
this->rdma_max_sge = config["rdma_max_sge"].uint64_value();
if (!this->rdma_max_sge)

@@ -281,6 +282,15 @@ void osd_messenger_t::parse_config(const json11::Json & config)
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024;
this->rdma_odp = config["rdma_odp"].bool_value();
+std::vector<std::string> mask;
+if (config["bind_address"].is_string())
+mask.push_back(config["bind_address"].string_value());
+else if (config["osd_network"].is_string())
+mask.push_back(config["osd_network"].string_value());
+else
+for (auto v: config["osd_network"].array_items())
+mask.push_back(v.string_value());
+this->osd_networks = mask;
#endif
if (!osd_num)
this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value();
@@ -165,8 +165,9 @@ protected:

#ifdef WITH_RDMA
bool use_rdma = true;
+std::vector<std::string> osd_networks;
std::string rdma_device;
-uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
+uint64_t rdma_port_num = 1, rdma_gid_index = -1, rdma_mtu = 0;
msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0;

@@ -177,7 +178,7 @@ protected:
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
-std::vector<std::function<void()>> set_immediate;
+std::vector<osd_op_t*> set_immediate_ops;

public:
timerfd_manager_t *tfd;

@@ -237,6 +238,8 @@ protected:
void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl);
void handle_reply_ready(osd_op_t *op);
+void handle_immediate_ops();
+void clear_immediate_ops(int peer_fd);

#ifdef WITH_RDMA
void try_send_rdma(osd_client_t *cl);
@@ -3,6 +3,7 @@

#include <stdio.h>
#include <stdlib.h>
+#include "addr_util.h"
#include "msgr_rdma.h"
#include "messenger.h"

@@ -69,7 +70,126 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
send_out_size = 0;
}

-msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
+static bool is_ipv4_gid(ibv_gid_entry *gidx)
+{
+return (((uint64_t*)gidx->gid.raw)[0] == 0 &&
+((uint32_t*)gidx->gid.raw)[2] == 0xffff0000);
+}
+
+static bool match_gid(ibv_gid_entry *gidx, addr_mask_t *networks, int nnet)
+{
+if (gidx->gid_type != IBV_GID_TYPE_ROCE_V1 &&
+gidx->gid_type != IBV_GID_TYPE_ROCE_V2 ||
+((uint64_t*)gidx->gid.raw)[0] == 0 &&
+((uint64_t*)gidx->gid.raw)[1] == 0)
+{
+return false;
+}
+if (is_ipv4_gid(gidx))
+{
+for (int i = 0; i < nnet; i++)
+{
+if (networks[i].family == AF_INET && cidr_match(*(in_addr*)(gidx->gid.raw+12), networks[i].ipv4, networks[i].bits))
+return true;
+}
+}
+else
+{
+for (int i = 0; i < nnet; i++)
+{
+if (networks[i].family == AF_INET6 && cidr6_match(*(in6_addr*)gidx->gid.raw, networks[i].ipv6, networks[i].bits))
+return true;
+}
+}
+return false;
+}
+
+struct matched_dev
+{
+int dev = -1;
+int port = -1;
+int gid = -1;
+bool rocev2 = false;
+};
+
+static void log_rdma_dev_port_gid(ibv_device *dev, int ib_port, int gid_index, ibv_gid_entry & gidx)
+{
+bool is4 = ((uint64_t*)gidx.gid.raw)[0] == 0 && ((uint32_t*)gidx.gid.raw)[2] == 0xffff0000;
+char buf[256];
+inet_ntop(is4 ? AF_INET : AF_INET6, is4 ? gidx.gid.raw+12 : gidx.gid.raw, buf, sizeof(buf));
+fprintf(
+stderr, "Auto-selected RDMA device %s port %d GID %d - ROCEv%d IPv%d %s\n",
+ibv_get_device_name(dev), ib_port, gid_index,
+gidx.gid_type == IBV_GID_TYPE_ROCE_V2 ? 2 : 1, is4 ? 4 : 6, buf
+);
+}
+
+static matched_dev match_device(ibv_device **dev_list, addr_mask_t *networks, int nnet, int log_level)
+{
+matched_dev best;
+ibv_device_attr attr;
+ibv_port_attr portinfo;
+ibv_gid_entry best_gidx;
+int res;
+for (int i = 0; dev_list[i]; ++i)
+{
+auto dev = dev_list[i];
+ibv_context *context = ibv_open_device(dev_list[i]);
+if ((res = ibv_query_device(context, &attr)) != 0)
+{
+fprintf(stderr, "Couldn't query RDMA device %s for its features: %s\n", ibv_get_device_name(dev_list[i]), strerror(res));
+goto cleanup;
+}
+for (int j = 1; j <= attr.phys_port_cnt; j++)
+{
+// Try to find a port with matching address
+if ((res = ibv_query_port(context, j, &portinfo)) != 0)
+{
+fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(dev), j, strerror(res));
+goto cleanup;
+}
+for (int k = 0; k < portinfo.gid_tbl_len; k++)
+{
+ibv_gid_entry gidx;
+if ((res = ibv_query_gid_ex(context, j, k, &gidx, 0)) != 0)
+{
+if (res != ENODATA)
+{
+fprintf(stderr, "Couldn't read RDMA device %s GID index %d: %s\n", ibv_get_device_name(dev), k, strerror(res));
+goto cleanup;
+}
+else
+break;
+}
+if (match_gid(&gidx, networks, nnet))
+{
+// Prefer RoCEv2
+if (!best.rocev2)
+{
+best.dev = i;
+best.port = j;
+best.gid = k;
+best.rocev2 = (gidx.gid_type == IBV_GID_TYPE_ROCE_V2);
+best_gidx = gidx;
+}
+}
+}
+}
+cleanup:
+ibv_close_device(context);
+if (best.rocev2)
+{
+break;
+}
+}
+if (best.dev >= 0 && log_level > 0)
+{
+log_rdma_dev_port_gid(dev_list[best.dev], best.port, best.gid, best_gidx);
+}
+return best;
+}
+
+msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
{
int res;
ibv_device **dev_list = NULL;

@@ -80,28 +200,23 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
clock_gettime(CLOCK_REALTIME, &tv);
srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
dev_list = ibv_get_device_list(NULL);
-if (!dev_list)
+if (!dev_list || !*dev_list)
{
if (errno == -ENOSYS || errno == ENOSYS)
{
if (log_level > 0)
fprintf(stderr, "No RDMA devices found (RDMA device list returned ENOSYS)\n");
}
+else if (!*dev_list)
+{
+if (log_level > 0)
+fprintf(stderr, "No RDMA devices found\n");
+}
else
fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
goto cleanup;
}
-if (!ib_devname)
-{
-ctx->dev = *dev_list;
-if (!ctx->dev)
-{
-if (log_level > 0)
-fprintf(stderr, "No RDMA devices found\n");
-goto cleanup;
-}
-}
-else
+if (ib_devname)
{
int i;
for (i = 0; dev_list[i]; ++i)

@@ -114,6 +229,31 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
goto cleanup;
}
}
+else if (osd_networks.size())
+{
+std::vector<addr_mask_t> nets;
+for (auto & netstr: osd_networks)
+{
+nets.push_back(cidr_parse(netstr));
+}
+auto best = match_device(dev_list, nets.data(), nets.size(), log_level);
+if (best.dev < 0)
+{
+if (log_level > 0)
+fprintf(stderr, "RDMA device matching osd_network is not found, using first available device\n");
+best.dev = 0;
+}
+else
+{
+ib_port = best.port;
+gid_index = best.gid;
+}
+ctx->dev = dev_list[best.dev];
+}
+else
+{
+ctx->dev = *dev_list;
+}

ctx->context = ibv_open_device(ctx->dev);
if (!ctx->context)

@@ -123,7 +263,6 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
}

ctx->ib_port = ib_port;
-ctx->gid_index = gid_index;
if ((res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo)) != 0)
{
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res));

@@ -135,10 +274,47 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
goto cleanup;
}
-if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
+if (gid_index != -1)
{
-fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
-goto cleanup;
+ctx->gid_index = gid_index;
+if (ibv_query_gid_ex(ctx->context, ib_port, gid_index, &ctx->my_gid, 0))
+{
+fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
+goto cleanup;
+}
+}
+else
+{
+// Auto-guess GID
+for (int k = 0; k < ctx->portinfo.gid_tbl_len; k++)
+{
+ibv_gid_entry gidx;
+if (ibv_query_gid_ex(ctx->context, ib_port, k, &gidx, 0) != 0)
+{
+fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), k);
+goto cleanup;
+}
+// Skip empty GID
+if (((uint64_t*)gidx.gid.raw)[0] == 0 &&
+((uint64_t*)gidx.gid.raw)[1] == 0)
+{
+continue;
+}
+// Prefer IPv4 RoCEv2 GID by default
+if (gid_index == -1 ||
+gidx.gid_type == IBV_GID_TYPE_ROCE_V2 &&
+(ctx->my_gid.gid_type != IBV_GID_TYPE_ROCE_V2 || is_ipv4_gid(&gidx)))
+{
+gid_index = k;
+ctx->my_gid = gidx;
+}
+}
+ctx->gid_index = gid_index = (gid_index == -1 ? 0 : gid_index);
+if (log_level > 0)
+{
+log_rdma_dev_port_gid(ctx->dev, ctx->ib_port, ctx->gid_index, ctx->my_gid);
+}
}

ctx->pd = ibv_alloc_pd(ctx->context);

@@ -255,7 +431,7 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
}

conn->addr.lid = ctx->my_lid;
-conn->addr.gid = ctx->my_gid;
+conn->addr.gid = ctx->my_gid.gid;
conn->addr.qpn = conn->qp->qp_num;
conn->addr.psn = lrand48() & 0xffffff;

@@ -598,6 +774,7 @@ void osd_messenger_t::handle_rdma_events()
}
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
stop_client(client_id);
+clear_immediate_ops(client_id);
continue;
}
if (!is_send)

@@ -606,6 +783,7 @@ void osd_messenger_t::handle_rdma_events()
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
{
// handle_read_buffer may stop the client
+clear_immediate_ops(client_id);
continue;
}
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);

@@ -666,9 +844,5 @@ void osd_messenger_t::handle_rdma_events()
}
}
} while (event_count > 0);
-for (auto cb: set_immediate)
-{
-cb();
-}
-set_immediate.clear();
+handle_immediate_ops();
}
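Taken together, the messenger and msgr_rdma changes above let the OSD derive the RDMA device, port and GID from the cluster network instead of requiring them to be configured explicitly. A rough configuration sketch (osd_network, use_rdma and etcd_address are existing Vitastor options; treating /etc/vitastor/vitastor.conf as the config location is an assumption of this example):

# With a config like this, an OSD built from this branch should pick the RDMA
# adapter/GID whose address falls inside 10.115.0.0/24, without setting
# rdma_device or rdma_gid_index by hand.
cat /etc/vitastor/vitastor.conf
{ "etcd_address": "127.0.0.1:2379", "osd_network": "10.115.0.0/24", "use_rdma": true }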
@@ -31,12 +31,12 @@ struct msgr_rdma_context_t
uint8_t ib_port;
uint8_t gid_index;
uint16_t my_lid;
-ibv_gid my_gid;
+ibv_gid_entry my_gid;
uint32_t mtu;
int max_cqe = 0;
int used_max_cqe = 0;

-static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
+static msgr_rdma_context_t *create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
~msgr_rdma_context_t();
};
@@ -65,6 +65,7 @@ void osd_messenger_t::read_requests()
bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{
bool ret = false;
+int peer_fd = cl->peer_fd;
cl->read_msg.msg_iovlen = 0;
cl->refs--;
if (cl->peer_state == PEER_STOPPED)

@@ -101,7 +102,8 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{
if (!handle_read_buffer(cl, cl->in_buf, result))
{
-goto fin;
+clear_immediate_ops(peer_fd);
+return false;
}
}
else

@@ -113,7 +115,8 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{
if (!handle_finished_read(cl))
{
-goto fin;
+clear_immediate_ops(peer_fd);
+return false;
}
}
}

@@ -122,15 +125,47 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
ret = true;
}
}
-fin:
-for (auto cb: set_immediate)
-{
-cb();
-}
-set_immediate.clear();
+handle_immediate_ops();
return ret;
}

+void osd_messenger_t::clear_immediate_ops(int peer_fd)
+{
+size_t i = 0, j = 0;
+while (i < set_immediate_ops.size())
+{
+if (set_immediate_ops[i]->peer_fd == peer_fd)
+{
+delete set_immediate_ops[i];
+}
+else
+{
+if (i != j)
+set_immediate_ops[j] = set_immediate_ops[i];
+j++;
+}
+i++;
+}
+set_immediate_ops.resize(j);
+}
+
+void osd_messenger_t::handle_immediate_ops()
+{
+for (auto op: set_immediate_ops)
+{
+if (op->op_type == OSD_OP_IN)
+{
+exec_op(op);
+}
+else
+{
+// Copy lambda to be unaffected by `delete op`
+std::function<void(osd_op_t*)>(op->callback)(op);
+}
+}
+set_immediate_ops.clear();
+}
+
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
{
// Compose operation(s) from the buffer

@@ -199,7 +234,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
{
// Operation is ready
cl->received_ops.push_back(cl->read_op);
-set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
+set_immediate_ops.push_back(cl->read_op);
cl->read_op = NULL;
cl->read_state = 0;
}

@@ -295,7 +330,7 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{
// Operation is ready
cl->received_ops.push_back(cur_op);
-set_immediate.push_back([this, cur_op]() { exec_op(cur_op); });
+set_immediate_ops.push_back(cur_op);
cl->read_op = NULL;
cl->read_state = 0;
}

@@ -416,9 +451,5 @@ void osd_messenger_t::handle_reply_ready(osd_op_t *op)
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
);
-set_immediate.push_back([op]()
-{
-// Copy lambda to be unaffected by `delete op`
-std::function<void(osd_op_t*)>(op->callback)(op);
-});
+set_immediate_ops.push_back(op);
}
@@ -16,7 +16,6 @@
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qerror.h"
-#include "qemu/uri.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/option.h"

@@ -1021,7 +1020,11 @@ static BlockDriver bdrv_vitastor = {
// FIXME: Implement it along with per-inode statistics
//.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size,

+#if QEMU_VERSION_MAJOR > 9 || QEMU_VERSION_MAJOR == 9 && QEMU_VERSION_MINOR > 0
+.bdrv_open = vitastor_file_open,
+#else
.bdrv_file_open = vitastor_file_open,
+#endif
.bdrv_close = vitastor_close,

// Option list for the create operation
@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

Name: Vitastor
Description: Vitastor client library
-Version: 1.9.2
+Version: 1.9.3
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}
@@ -431,7 +431,7 @@ struct cli_dd_t
if (read_op->retval < 0)
{
fprintf(
-stderr, "Failed to read bitmap for %lu bytes from image %s at offset %lu: %s (code %d)\n",
+stderr, "Failed to read bitmap for %ju bytes from image %s at offset %ju: %s (code %d)\n",
read_op->len, iinfo.iimg.c_str(), read_op->offset,
strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval
);

@@ -476,7 +476,7 @@ struct cli_dd_t
if (read_op->retval != read_op->len)
{
fprintf(
-stderr, "Failed to read %lu bytes from image %s at offset %lu: %s (code %d)\n",
+stderr, "Failed to read %ju bytes from image %s at offset %ju: %s (code %d)\n",
read_op->len, iinfo.iimg.c_str(), read_op->offset,
strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval
);

@@ -547,7 +547,7 @@ struct cli_dd_t
if (data->res < 0)
{
fprintf(
-stderr, "Failed to read %lu bytes from %s at offset %lu: %s (code %d)\n",
+stderr, "Failed to read %ju bytes from %s at offset %ju: %s (code %d)\n",
data->iov.iov_len, iinfo.ifile == "" ? "stdin" : iinfo.ifile.c_str(), cur_read->offset,
strerror(-data->res), data->res
);

@@ -644,7 +644,7 @@ struct cli_dd_t
if (write_op->retval != write_op->len)
{
fprintf(
-stderr, "Failed to write %lu bytes to image %s at offset %lu: %s (code %d)\n",
+stderr, "Failed to write %ju bytes to image %s at offset %ju: %s (code %d)\n",
write_op->len, oinfo.oimg.c_str(), write_op->offset,
strerror(write_op->retval < 0 ? -write_op->retval : EIO), write_op->retval
);

@@ -680,7 +680,7 @@ struct cli_dd_t
if (data->res < 0)
{
fprintf(
-stderr, "Failed to write %lu bytes to %s at offset %lu: %s (code %d)\n",
+stderr, "Failed to write %ju bytes to %s at offset %ju: %s (code %d)\n",
data->iov.iov_len, oinfo.ofile == "" ? "stdout" : oinfo.ofile.c_str(),
oinfo.out_seekable ? cur_read->offset+cur_read->len+oseek : 0,
strerror(-data->res), data->res

@@ -727,7 +727,7 @@ struct cli_dd_t
{
char buf[256];
snprintf(
-buf, sizeof(buf), "%lu bytes (%s) copied, %.1f s, %sB/s",
+buf, sizeof(buf), "%ju bytes (%s) copied, %.1f s, %sB/s",
written_size, format_size(written_size).c_str(), sec_total,
format_size((uint64_t)(written_size/sec_total), true).c_str()
);

@@ -749,7 +749,7 @@ struct cli_dd_t
else
{
fprintf(
-stderr, "\r%lu bytes (%s) copied, %.1f s, %sB/s, avg %sB/s\033[K",
+stderr, "\r%ju bytes (%s) copied, %.1f s, %sB/s, avg %sB/s\033[K",
written_size, format_size(written_size).c_str(), sec_total,
format_size((uint64_t)(delta/sec_delta), true).c_str(),
format_size((uint64_t)(written_size/sec_total), true).c_str()
@@ -216,7 +216,7 @@ resume_1:
for (uint64_t osd_num: node.child_osds)
{
auto & osd = placement_tree->osds.at(osd_num);
-fmt_items.push_back(json11::Json::object{
+auto json_osd = json11::Json::object{
{ "type", "osd" },
{ "name", osd.num },
{ "parent", node.name },

@@ -230,7 +230,16 @@ resume_1:
{ "bitmap", (uint64_t)osd.bitmap_granularity },
{ "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") },
{ "op_stats", osd_stats[osd_num]["op_stats"] },
-});
+};
+if (osd_stats[osd_num]["slow_ops_primary"].uint64_value() > 0)
+{
+json_osd["slow_ops_primary"] = osd_stats[osd_num]["slow_ops_primary"];
+}
+if (osd_stats[osd_num]["slow_ops_secondary"].uint64_value() > 0)
+{
+json_osd["slow_ops_secondary"] = osd_stats[osd_num]["slow_ops_secondary"];
+}
+fmt_items.push_back(json_osd);
}
}
result.data = fmt_items;
@@ -134,6 +134,7 @@ resume_2:
}
int osd_count = 0, osd_up = 0;
uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
+std::vector<uint64_t> slow_op_primary_osds, slow_op_secondary_osds;
parent->iterate_kvs_1(osd_stats, "/osd/stats/", [&](uint64_t stat_osd_num, json11::Json value)
{
osd_count++;

@@ -153,6 +154,14 @@ resume_2:
if (peer_it != parent->cli->st_cli.peer_states.end())
{
osd_up++;
+if (value["slow_ops_primary"].uint64_value() > 0)
+{
+slow_op_primary_osds.push_back(stat_osd_num);
+}
+if (value["slow_ops_secondary"].uint64_value() > 0)
+{
+slow_op_secondary_osds.push_back(stat_osd_num);
+}
}
else
{

@@ -216,6 +225,10 @@ resume_2:
{ "mon_master", mon_master },
{ "osd_up", osd_up },
{ "osd_count", osd_count },
+{ "osds_full", osds_full },
+{ "osds_nearfull", osds_nearfull },
+{ "osds_primary_slow_ops", slow_op_primary_osds },
+{ "osds_secondary_slow_ops", slow_op_secondary_osds },
{ "total_raw", total_raw },
{ "free_raw", free_raw },
{ "down_raw", down_raw },

@@ -300,6 +313,26 @@ resume_2:
warning_str += " "+std::to_string(osds_nearfull)+
(osds_nearfull > 1 ? " osds are almost full\n" : " osd is almost full\n");
}
+if (slow_op_primary_osds.size() > 0)
+{
+warning_str += " "+std::to_string(slow_op_primary_osds.size());
+warning_str += (slow_op_primary_osds.size() > 1 ? " osds have" : " osd has");
+warning_str += " slow client ops: ";
+for (int i = 0; i < slow_op_primary_osds.size(); i++)
+{
+warning_str += (i > 0 ? ", " : "")+std::to_string(slow_op_primary_osds[i])+"\n";
+}
+}
+if (slow_op_secondary_osds.size() > 0)
+{
+warning_str += " "+std::to_string(slow_op_secondary_osds.size());
+warning_str += (slow_op_secondary_osds.size() > 1 ? " osds have" : " osd has");
+warning_str += " slow replication ops: ";
+for (int i = 0; i < slow_op_secondary_osds.size(); i++)
+{
+warning_str += (i > 0 ? ", " : "")+std::to_string(slow_op_secondary_osds[i])+"\n";
+}
+}
if (warning_str != "")
{
warning_str = "\n warning:\n"+warning_str;
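Read together, these hunks surface OSDs with slow operations both in the machine-readable output (the new osds_primary_slow_ops / osds_secondary_slow_ops fields) and in the human-readable warning block. Based only on the string-building code above, the warning section would render roughly like this (illustrative output; the OSD number 3 is made up for the example):

vitastor-cli status
...
  warning:
  1 osd has slow client ops: 3
  1 osd has slow replication ops: 3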
@@ -27,12 +27,16 @@ static const char *help_text =
" --osd_per_disk <N>\n"
" Create <N> OSDs on each disk (default 1)\n"
" --hybrid\n"
-" Prepare hybrid (HDD+SSD) OSDs using provided devices. SSDs will be used for\n"
-" journals and metadata, HDDs will be used for data. Partitions for journals and\n"
-" metadata will be created automatically. Whether disks are SSD or HDD is decided\n"
-" by the `/sys/block/.../queue/rotational` flag. In hybrid mode, default object\n"
-" size is 1 MB instead of 128 KB, default journal size is 1 GB instead of 32 MB,\n"
-" and throttle_small_writes is enabled by default.\n"
+" Prepare hybrid (HDD+SSD, NVMe+SATA or etc) OSDs using provided devices. By default,\n"
+" any passed SSDs will be used for journals and metadata, HDDs will be used for data,\n"
+" but you can override this behaviour with --fast-devices option. Journal and metadata\n"
+" partitions will be created automatically. In the default mode, SSD and HDD disks\n"
+" are distinguished by the `/sys/block/.../queue/rotational` flag. When HDDs are used\n"
+" for data in hybrid mode, default block_size is 1 MB instead of 128 KB, default journal\n"
+" size is 1 GB instead of 32 MB, and throttle_small_writes is enabled by default.\n"
+" --fast-devices /dev/nvmeX,/dev/nvmeY\n"
+" In --hybrid mode, use these devices for journal and metadata instead of auto-detecting\n"
+" and extracting them from the main [devices...] list.\n"
" --disable_data_fsync auto\n"
" Disable data device cache and fsync (1/yes/true = on, default auto)\n"
" --disable_meta_fsync auto\n"

@@ -196,6 +200,7 @@ static const char *help_text =
" --device_size 0 Set device size\n"
" --format text Result format: json, options, env, or text\n"
"\n"
+"Default I/O mode for commands involving disk I/O is O_DIRECT. If you don't want it, add --io cached.\n"
"Use vitastor-disk --help <command> for command details or vitastor-disk --help --all for all details.\n"
;
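The reworded --hybrid help and the new --fast-devices and --io options are described above; as a usage sketch only (device paths and arguments are placeholders, not tested commands):

# hybrid OSDs with journal/metadata pinned to an NVMe instead of relying on rotational auto-detection
vitastor-disk prepare --hybrid --fast-devices /dev/nvme0n1 /dev/sdb /dev/sdc /dev/sdd
# inspect an on-disk journal through the page cache instead of O_DIRECT
vitastor-disk dump-journal --io cached <journal_device> <journal_block_size> <offset> <len>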
@@ -220,6 +225,10 @@ int main(int argc, char *argv[])
cmd.push_back((char*)"dump-journal");
aliased = true;
}
+else if (!strcmp(exe_name, "vitastor-disk-test"))
+{
+self.test_mode = true;
+}
for (int i = 1; i < argc; i++)
{
if (!strcmp(argv[i], "--all"))

@@ -314,6 +323,7 @@ int main(int argc, char *argv[])
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.new_journal_device))
return 1;
+self.new_journal_device = self.dsk.journal_device;
self.new_journal_offset = self.dsk.journal_offset;
self.new_journal_len = self.dsk.journal_len;
}

@@ -379,6 +389,7 @@ int main(int argc, char *argv[])
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.new_meta_device))
return 1;
+self.new_meta_device = self.dsk.meta_device;
self.new_meta_offset = self.dsk.meta_offset;
self.new_meta_len = self.dsk.meta_len;
}
@@ -22,6 +22,7 @@
#define VITASTOR_DISK_MAX_SB_SIZE 128*1024
#define VITASTOR_PART_TYPE "e7009fac-a5a1-4d72-af72-53de13059903"
#define DEFAULT_HYBRID_JOURNAL "1G"
+#define DEFAULT_HYBRID_SSD_JOURNAL "128M"

struct resizer_data_moving_t;

@@ -40,6 +41,7 @@ struct disk_tool_t
/**** Parameters ****/

std::map<std::string, std::string> options;
+bool test_mode = false;
bool all, json, now;
bool dump_with_blocks, dump_with_data;
blockstore_disk_t dsk;

@@ -126,7 +128,8 @@ struct disk_tool_t
uint32_t write_osd_superblock(std::string device, json11::Json params);

int prepare_one(std::map<std::string, std::string> options, int is_hdd = -1);
-int check_existing_partition(const std::string & dev);
+int check_existing_partition(std::string & dev_by_uuid);
+int fix_partition_type(std::string & dev_by_uuid);
int prepare(std::vector<std::string> devices);
std::vector<vitastor_dev_info_t> collect_devices(const std::vector<std::string> & devices);
json11::Json add_partitions(vitastor_dev_info_t & devinfo, std::vector<std::string> sizes);

@@ -148,6 +151,6 @@ int write_zero(int fd, uint64_t offset, uint64_t size);
json11::Json read_parttable(std::string dev);
uint64_t dev_size_from_parttable(json11::Json pt);
uint64_t free_from_parttable(json11::Json pt);
-int fix_partition_type(std::string dev_by_uuid);
+int fix_partition_type_uuid(std::string & dev_by_uuid, const std::string & type_uuid);
std::string csum_type_str(uint32_t data_csum_type);
uint32_t csum_type_from_str(std::string data_csum_type);
@@ -18,7 +18,7 @@ int disk_tool_t::dump_journal()
printf("[\n");
if (all)
{
-dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY);
+dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.journal_fd < 0)
{
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));

@@ -121,7 +121,7 @@ int disk_tool_t::dump_journal()

int disk_tool_t::process_journal(std::function<int(void*)> block_fn)
{
-dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY);
+dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.journal_fd < 0)
{
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
@@ -14,7 +14,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT);
return 1;
}
-dsk.meta_fd = open(dsk.meta_device.c_str(), O_DIRECT|O_RDONLY);
+dsk.meta_fd = open(dsk.meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.meta_fd < 0)
{
fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno));

@@ -159,7 +159,7 @@ int disk_tool_t::dump_load_check_superblock(const std::string & device)
{
auto cfg = json_to_string_map(sb["params"].object_items());
dsk.parse_config(cfg);
-dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
+dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
dsk.open_data();
dsk.open_meta();
dsk.open_journal();

@@ -315,8 +315,7 @@ int disk_tool_t::write_json_meta(json11::Json meta)
fromhexstr(e["data_csum"].string_value(), new_data_csum_size,
((uint8_t*)new_entry) + sizeof(clean_disk_entry) + 2*new_clean_entry_bitmap_size);
}
-uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + sizeof(clean_disk_entry) +
-2*new_clean_entry_bitmap_size + new_data_csum_size);
+uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
}
}
@ -29,18 +29,12 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||||
};
|
};
|
||||||
if (options.find("force") == options.end())
|
if (options.find("force") == options.end())
|
||||||
{
|
{
|
||||||
std::vector<std::string> all_devs = { options["data_device"], options["meta_device"], options["journal_device"] };
|
std::string* all_devs[] = { &options["data_device"], &options["meta_device"], &options["journal_device"] };
|
||||||
for (int i = 0; i < all_devs.size(); i++)
|
for (int i = 0; i < 3; i++)
|
||||||
{
|
{
|
||||||
const auto & dev = all_devs[i];
|
auto & dev = *all_devs[i];
|
||||||
if (dev == "")
|
if (dev == "")
|
||||||
continue;
|
continue;
|
||||||
if (dev.substr(0, 22) != "/dev/disk/by-partuuid/")
|
|
||||||
{
|
|
||||||
// Partitions should be identified by GPT partition UUID
|
|
||||||
fprintf(stderr, "%s does not start with /dev/disk/by-partuuid/. Partitions should be identified by GPT partition UUIDs\n", dev.c_str());
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
std::string real_dev = realpath_str(dev, false);
|
std::string real_dev = realpath_str(dev, false);
|
||||||
if (real_dev == "")
|
if (real_dev == "")
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -114,7 +108,11 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
dsk.parse_config(options);
|
dsk.parse_config(options);
|
||||||
dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
|
// Set all offsets to 4096 to calculate metadata size with excess
|
||||||
|
dsk.journal_offset = 4096;
|
||||||
|
dsk.meta_offset = 4096;
|
||||||
|
dsk.data_offset = 4096;
|
||||||
|
dsk.data_io = dsk.meta_io = dsk.journal_io = (options["io"] == "cached" ? "cached" : "direct");
|
||||||
dsk.open_data();
|
dsk.open_data();
|
||||||
dsk.open_meta();
|
dsk.open_meta();
|
||||||
dsk.open_journal();
|
dsk.open_journal();
|
||||||
|
@ -159,7 +157,11 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
std::string osd_num_str;
|
std::string osd_num_str;
|
||||||
if (shell_exec({ "vitastor-cli", "alloc-osd" }, "", &osd_num_str, NULL) != 0)
|
if (test_mode && options.find("osd_num") != options.end())
|
||||||
|
{
|
||||||
|
osd_num_str = options["osd_num"];
|
||||||
|
}
|
||||||
|
else if (shell_exec({ "vitastor-cli", "alloc-osd" }, "", &osd_num_str, NULL) != 0)
|
||||||
{
|
{
|
||||||
dsk.close_all();
|
dsk.close_all();
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -173,8 +175,8 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||||
}
|
}
|
||||||
sb["osd_num"] = osd_num;
|
sb["osd_num"] = osd_num;
|
||||||
// Zero out metadata and journal
|
// Zero out metadata and journal
|
||||||
if (write_zero(dsk.meta_fd, dsk.meta_offset, dsk.meta_len) != 0 ||
|
if (write_zero(dsk.meta_fd, sb["meta_offset"].uint64_value(), dsk.meta_len) != 0 ||
|
||||||
write_zero(dsk.journal_fd, dsk.journal_offset, dsk.journal_len) != 0)
|
write_zero(dsk.journal_fd, sb["journal_offset"].uint64_value(), dsk.journal_len) != 0)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno));
|
fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno));
|
||||||
dsk.close_all();
|
dsk.close_all();
|
||||||
|
@ -199,15 +201,18 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
|
||||||
if (sep_j)
|
if (sep_j)
|
||||||
desc += (sep_m ? " and journal on " : " with journal on ") + realpath_str(options["journal_device"]);
|
desc += (sep_m ? " and journal on " : " with journal on ") + realpath_str(options["journal_device"]);
|
||||||
fprintf(stderr, "Initialized OSD %ju on %s\n", osd_num, desc.c_str());
|
fprintf(stderr, "Initialized OSD %ju on %s\n", osd_num, desc.c_str());
|
||||||
if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0)
|
if (!test_mode || options.find("no_init") == options.end())
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%ju\n", osd_num);
|
if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0)
|
||||||
return 1;
|
{
|
||||||
|
fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%ju\n", osd_num);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int disk_tool_t::check_existing_partition(const std::string & dev)
|
int disk_tool_t::check_existing_partition(std::string & dev)
|
||||||
{
|
{
|
||||||
std::string out;
|
std::string out;
|
||||||
if (shell_exec({ "wipefs", dev }, "", &out, NULL) != 0 || out != "")
|
if (shell_exec({ "wipefs", dev }, "", &out, NULL) != 0 || out != "")
|
||||||
|
@ -229,11 +234,27 @@ int disk_tool_t::check_existing_partition(const std::string & dev)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int disk_tool_t::fix_partition_type(std::string & dev)
|
||||||
|
{
|
||||||
|
std::string type_uuid = VITASTOR_PART_TYPE;
|
||||||
|
if (test_mode && options.find("part_type_uuid") != options.end())
|
||||||
|
{
|
||||||
|
type_uuid = options["part_type_uuid"];
|
||||||
|
}
|
||||||
|
    return fix_partition_type_uuid(dev, type_uuid);
}

std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<std::string> & devices)
{
    std::vector<vitastor_dev_info_t> devinfo;
+   std::set<std::string> seen;
    for (auto & dev: devices)
    {
+       if (seen.find(dev) != seen.end())
+       {
+           fprintf(stderr, "%s is specified multiple times, ignoring\n", dev.c_str());
+           continue;
+       }
        // Check if the device is a whole disk
        if (dev.substr(0, 5) != "/dev/")
        {
@@ -294,10 +315,6 @@ std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<
            .free = !pt.is_null() ? free_from_parttable(pt) : dev_size,
        });
    }
-   if (!devinfo.size())
-   {
-       fprintf(stderr, "No suitable devices found\n");
-   }
    return devinfo;
}

@@ -348,47 +365,12 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
        fprintf(stderr, "Failed to add %zu partition(s) with sfdisk: new partitions not found in table\n", sizes.size());
        return {};
    }
-   // Check if new nodes exist and run partprobe if not
+   // Check if new devices exist, run partprobe if not, then wait until they appear
    // FIXME: We could use parted instead of sfdisk because partprobe is already a part of parted
-   int iter = 0, r;
-   while (true)
-   {
-       for (const auto & part: new_parts)
-       {
-           std::string link_path = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
-           struct stat st;
-           if (lstat(link_path.c_str(), &st) < 0)
-           {
-               if (errno == ENOENT)
-               {
-                   iter++;
-                   // Run partprobe
-                   std::string out;
-                   if (iter > 1 || (r = shell_exec({ "partprobe", devinfo.path }, "", &out, NULL)) != 0)
-                   {
-                       fprintf(
-                           stderr, iter == 1 && r == 255
-                               ? "partprobe utility is required to reread partition table while disk %s is in use\n"
-                               : "partprobe failed to re-read partition table while disk %s is in use\n",
-                           devinfo.path.c_str()
-                       );
-                       return {};
-                   }
-                   break;
-               }
-               else
-               {
-                   fprintf(stderr, "Failed to lstat %s: %s\n", link_path.c_str(), strerror(errno));
-                   return {};
-               }
-           }
-       }
-       break;
-   }
-   // Wait until device symlinks in /dev/disk/by-partuuid/ appear
    bool exists = false;
    const int max_iter = 300; // max 30 sec
-   iter = 0;
+   int iter = 0;
+   int r = 0;
    while (!exists && iter < max_iter)
    {
        exists = true;
@@ -396,28 +378,48 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
        {
            std::string link_path = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
            struct stat st;
-           if (lstat(link_path.c_str(), &st) < 0)
+           if (stat(part["node"].string_value().c_str(), &st) < 0 ||
+               lstat(link_path.c_str(), &st) < 0)
            {
                if (errno == ENOENT)
                {
                    exists = false;
                    if (iter == 4)
                    {
+                       // Print message after 400ms
                        fprintf(stderr, "Waiting for %s to appear for up to %d sec...\n", link_path.c_str(), max_iter/10);
                    }
                }
                else
                {
-                   fprintf(stderr, "Failed to lstat %s: %s\n", link_path.c_str(), strerror(errno));
+                   fprintf(stderr, "Failed to stat %s or lstat %s: %s\n", part["node"].string_value().c_str(),
+                       link_path.c_str(), strerror(errno));
                    return {};
                }
            }
        }
-       if (!exists)
+       if (exists)
        {
-           struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; // 100ms
-           iter += (nanosleep(&ts, NULL) == 0);
+           break;
        }
+       if (!exists && iter == 0)
+       {
+           // Run partprobe
+           std::string out;
+           r = shell_exec({ "partprobe", devinfo.path }, "", &out, NULL);
+           if (r != 0)
+           {
+               fprintf(
+                   stderr, r == 255
+                       ? "partprobe utility is required to reread partition table while disk %s is in use\n"
+                       : "partprobe failed to re-read partition table while disk %s is in use\n",
+                   devinfo.path.c_str()
+               );
+               return {};
+           }
+       }
+       struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; // 100ms
+       iter += (nanosleep(&ts, NULL) == 0 || !iter);
    }
    devinfo.pt = newpt;
    devinfo.osd_part_count += sizes.size();
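The hunk above reduces to a simple device-settling pattern: check both the partition node reported by sfdisk and the udev-created /dev/disk/by-partuuid symlink, nudge the kernel with partprobe once if they are missing after the first check, and keep polling in 100 ms steps for up to max_iter/10 seconds. A minimal standalone sketch of that pattern (the helper name wait_for_partition and the use of system() are illustrative only, not part of vitastor-disk):

```cpp
#include <stdlib.h>
#include <sys/stat.h>
#include <time.h>
#include <string>

// Sketch: wait for a freshly created partition and its by-partuuid symlink to show up.
static bool wait_for_partition(const std::string & node, const std::string & link_path,
    const std::string & parent_dev, int max_iter = 300 /* ~30 seconds */)
{
    for (int iter = 0; iter < max_iter; iter++)
    {
        struct stat st;
        if (stat(node.c_str(), &st) == 0 && lstat(link_path.c_str(), &st) == 0)
            return true; // both the device node and the udev symlink exist
        if (iter == 0)
            system(("partprobe "+parent_dev).c_str()); // ask the kernel to re-read the table once
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; // 100 ms
        nanosleep(&ts, NULL);
    }
    return false;
}
```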
@@ -500,7 +502,10 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
    {
        blockstore_disk_t dsk;
        dsk.parse_config(options);
-       dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
+       dsk.journal_offset = 4096;
+       dsk.meta_offset = 4096;
+       dsk.data_offset = 4096;
+       dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
        dsk.open_data();
        dsk.open_meta();
        dsk.open_journal();
@@ -510,6 +515,7 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
    }
    catch (std::exception & e)
    {
+       dsk.close_all();
        fprintf(stderr, "%s\n", e.what());
        return 1;
    }
@@ -564,9 +570,12 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
{
    if (options.find("data_device") != options.end() && options["data_device"] != "")
    {
-       if (options.find("hybrid") != options.end() || options.find("osd_per_disk") != options.end() || devices.size())
+       if (options.find("hybrid") != options.end() ||
+           options.find("fast_devices") != options.end() ||
+           options.find("osd_per_disk") != options.end() ||
+           devices.size())
        {
-           fprintf(stderr, "Device list (positional arguments) and --hybrid are incompatible with --data_device\n");
+           fprintf(stderr, "Device list (positional arguments), --osd_per_disk, --hybrid and --fast-devices are incompatible with --data_device\n");
            return 1;
        }
        return prepare_one(options, options.find("hdd") != options.end() ? 1 : 0);
@@ -583,8 +592,10 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
    auto devinfo = collect_devices(devices);
    if (!devinfo.size())
    {
+       fprintf(stderr, "No suitable devices found\n");
        return 1;
    }
+   bool explicit_fast = options.find("fast_devices") != options.end();
    uint64_t osd_per_disk = stoull_full(options["osd_per_disk"]);
    if (!osd_per_disk)
        osd_per_disk = 1;
@@ -603,21 +614,55 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
        if (options.find("disable_meta_fsync") == options.end())
            options["disable_meta_fsync"] = "auto";
        options["disable_journal_fsync"] = options["disable_meta_fsync"];
-       for (auto & dev: devinfo)
-           if (!dev.is_hdd)
-               ssds.push_back(dev);
-       if (!ssds.size())
+       if (explicit_fast)
        {
-           fprintf(stderr, "No SSDs found\n");
-           return 1;
+           auto fast = explode(",", options["fast_devices"], true);
+           ssds = collect_devices(fast);
+           if (!ssds.size())
+           {
+               fprintf(stderr, "No fast devices found\n");
+               return 1;
+           }
+           if (options["journal_size"] == "")
+           {
+               auto auto_journal_size = DEFAULT_HYBRID_SSD_JOURNAL;
+               for (auto & dev: devinfo)
+               {
+                   if (dev.is_hdd)
+                   {
+                       auto_journal_size = DEFAULT_HYBRID_JOURNAL;
+                       break;
+                   }
+               }
+               options["journal_size"] = auto_journal_size;
+           }
        }
-       else if (ssds.size() == devinfo.size())
+       else
        {
-           fprintf(stderr, "No HDDs found\n");
-           return 1;
+           std::vector<vitastor_dev_info_t> hdds;
+           for (auto & dev: devinfo)
+           {
+               if (!dev.is_hdd)
+                   ssds.push_back(dev);
+               else
+                   hdds.push_back(dev);
+           }
+           if (!ssds.size())
+           {
+               fprintf(stderr, "No SSDs found\n");
+               return 1;
+           }
+           if (!hdds.size())
+           {
+               fprintf(stderr, "No HDDs found\n");
+               return 1;
+           }
+           devinfo = hdds;
+           if (options["journal_size"] == "")
+           {
+               options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
+           }
        }
-       if (options["journal_size"] == "")
-           options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
    }
    else
    {
@@ -627,31 +672,28 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
    auto journal_size = options["journal_size"];
    for (auto & dev: devinfo)
    {
-       if (!hybrid || dev.is_hdd)
-       {
-           // Select new partitions and create an OSD on each of them
-           for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
-           {
-               options["force"] = true;
-               options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid);
-               if (hybrid)
-               {
-                   // Select/create journal and metadata partitions
-                   int r = get_meta_partition(ssds, options);
-                   if (r != 0)
-                   {
-                       return 1;
-                   }
-                   options.erase("journal_size");
-               }
-               // Treat all disks as SSDs if not in the hybrid mode
-               prepare_one(options, dev.is_hdd ? 1 : 0);
-               if (hybrid)
-               {
-                   options["journal_size"] = journal_size;
-                   options.erase("journal_device");
-                   options.erase("meta_device");
-               }
-           }
-       }
+       // Select new partitions and create an OSD on each of them
+       for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
+       {
+           options["force"] = true;
+           options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid);
+           if (hybrid)
+           {
+               // Select/create journal and metadata partitions
+               int r = get_meta_partition(ssds, options);
+               if (r != 0)
+               {
+                   return 1;
+               }
+               options.erase("journal_size");
+           }
+           // Treat all disks as SSDs if not in the hybrid mode
+           prepare_one(options, dev.is_hdd ? 1 : 0);
+           if (hybrid)
+           {
+               options["journal_size"] = journal_size;
+               options.erase("journal_device");
+               options.erase("meta_device");
+           }
+       }
    }
}
@@ -91,7 +91,7 @@ int disk_tool_t::resize_parse_params()
    try
    {
        dsk.parse_config(options);
-       dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
+       dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
        dsk.open_data();
        dsk.open_meta();
        dsk.open_journal();
@@ -114,7 +114,10 @@ int disk_tool_t::resize_parse_params()
    new_data_offset = options.find("new_data_offset") != options.end()
        ? parse_size(options["new_data_offset"]) : dsk.data_offset;
    new_data_len = options.find("new_data_len") != options.end()
-       ? parse_size(options["new_data_len"]) : dsk.data_len;
+       ? parse_size(options["new_data_len"])
+       : (options.find("new_data_offset") != options.end()
+           ? dsk.data_device_size-new_data_offset
+           : dsk.data_len);
    new_meta_offset = options.find("new_meta_offset") != options.end()
        ? parse_size(options["new_meta_offset"]) : dsk.meta_offset;
    new_meta_len = options.find("new_meta_len") != options.end()
@@ -123,6 +126,14 @@ int disk_tool_t::resize_parse_params()
        ? parse_size(options["new_journal_offset"]) : dsk.journal_offset;
    new_journal_len = options.find("new_journal_len") != options.end()
        ? parse_size(options["new_journal_len"]) : dsk.journal_len;
+   if (new_data_len+new_data_offset > dsk.data_device_size)
+       new_data_len = dsk.data_device_size-new_data_offset;
+   if (new_meta_device == dsk.data_device && new_data_offset < new_meta_offset &&
+       new_data_len+new_data_offset > new_meta_offset)
+       new_data_len = new_meta_offset-new_data_offset;
+   if (new_journal_device == dsk.data_device && new_data_offset < new_journal_offset &&
+       new_data_len+new_data_offset > new_journal_offset)
+       new_data_len = new_journal_offset-new_data_offset;
    if (new_meta_device == dsk.meta_device &&
        new_journal_device == dsk.journal_device &&
        new_data_offset == dsk.data_offset &&
@@ -159,10 +170,10 @@ void disk_tool_t::resize_init(blockstore_meta_header_v2_t *hdr)
        dsk.data_csum_type = hdr->data_csum_type;
        dsk.csum_block_size = hdr->csum_block_size;
    }
-   if (((new_data_len-dsk.data_len) % dsk.data_block_size) ||
-       ((new_data_offset-dsk.data_offset) % dsk.data_block_size))
+   if (((new_data_offset-dsk.data_offset) % dsk.data_block_size))
    {
-       fprintf(stderr, "Data alignment mismatch\n");
+       fprintf(stderr, "Data alignment mismatch: old data offset is 0x%jx, new is 0x%jx, but alignment on %x should be equal\n",
+           dsk.data_offset, new_data_offset, dsk.data_block_size);
        exit(1);
    }
    data_idx_diff = ((int64_t)(dsk.data_offset-new_data_offset)) / dsk.data_block_size;
@@ -220,10 +231,10 @@ int disk_tool_t::resize_remap_blocks()
    }
    for (uint64_t i = 0; i < free_last; i++)
    {
-       if (data_alloc->get(total_blocks-i))
-           data_remap[total_blocks-i] = 0;
+       if (data_alloc->get(total_blocks-i-1))
+           data_remap[total_blocks-i-1] = 0;
        else
-           data_alloc->set(total_blocks-i, true);
+           data_alloc->set(total_blocks-i-1, true);
    }
    for (auto & p: data_remap)
    {
@@ -246,7 +257,7 @@ int disk_tool_t::resize_copy_data()
        iodepth = 32;
    }
    ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
-   dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
+   dsk.data_fd = open(dsk.data_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
    if (dsk.data_fd < 0)
    {
        fprintf(stderr, "Failed to open data device %s: %s\n", dsk.data_device.c_str(), strerror(errno));
@@ -441,7 +452,7 @@ int disk_tool_t::resize_rewrite_journal()

int disk_tool_t::resize_write_new_journal()
{
-   new_journal_fd = open(new_journal_device.c_str(), O_DIRECT|O_RDWR);
+   new_journal_fd = open(new_journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
    if (new_journal_fd < 0)
    {
        fprintf(stderr, "Failed to open new journal device %s: %s\n", new_journal_device.c_str(), strerror(errno));
@@ -467,12 +478,13 @@ int disk_tool_t::resize_rewrite_meta()
            blockstore_meta_header_v2_t *new_hdr = (blockstore_meta_header_v2_t *)new_meta_buf;
            new_hdr->zero = 0;
            new_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
-           new_hdr->version = BLOCKSTORE_META_FORMAT_V1;
+           new_hdr->version = BLOCKSTORE_META_FORMAT_V2;
            new_hdr->meta_block_size = dsk.meta_block_size;
            new_hdr->data_block_size = dsk.data_block_size;
            new_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096;
            new_hdr->data_csum_type = dsk.data_csum_type;
            new_hdr->csum_block_size = dsk.csum_block_size;
+           new_hdr->header_csum = crc32c(0, new_hdr, sizeof(*new_hdr));
        },
        [this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
        {
@@ -481,7 +493,7 @@ int disk_tool_t::resize_rewrite_meta()
                block_num = remap_it->second;
            if (block_num < free_first || block_num >= total_blocks-free_last)
            {
-               fprintf(stderr, "BUG: remapped block not in range\n");
+               fprintf(stderr, "BUG: remapped block %ju not in range %ju..%ju\n", block_num, free_first, total_blocks-free_last);
                exit(1);
            }
            block_num += data_idx_diff;
@@ -494,6 +506,8 @@ int disk_tool_t::resize_rewrite_meta()
                memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size + new_data_csum_size);
            else
                memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size);
+           uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
+           *new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
        }
    );
    if (r != 0)
@@ -507,7 +521,7 @@ int disk_tool_t::resize_rewrite_meta()

int disk_tool_t::resize_write_new_meta()
{
-   new_meta_fd = open(new_meta_device.c_str(), O_DIRECT|O_RDWR);
+   new_meta_fd = open(new_meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
    if (new_meta_fd < 0)
    {
        fprintf(stderr, "Failed to open new metadata device %s: %s\n", new_meta_device.c_str(), strerror(errno));
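The new defaulting and clamping rules for new_data_len are easiest to follow with concrete numbers. A worked example with invented values (not taken from a real OSD):

```cpp
// Illustrative values only.
uint64_t data_device_size = 100ull << 30;  // 100 GiB data partition
uint64_t new_data_offset  = 128ull << 20;  // --new_data_offset 128M was given
uint64_t new_data_len;                     // --new_data_len was not given

// With --new_data_offset but without --new_data_len, the length now defaults to
// "everything after the new offset" instead of the old data_len:
new_data_len = data_device_size - new_data_offset;

// It is then clamped so data never runs past the device end, or into a metadata
// or journal region that lives above it on the same device:
uint64_t new_meta_offset = 64ull << 30;    // pretend meta starts at 64 GiB on the same device
if (new_data_offset < new_meta_offset && new_data_len + new_data_offset > new_meta_offset)
    new_data_len = new_meta_offset - new_data_offset;  // data is cut off at the meta start
```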
@@ -37,6 +37,8 @@ int disk_tool_t::resize_data(std::string device)
        fprintf(stderr, "%s\n", e.what());
        return 1;
    }
+   // Save FD numbers because calc_lengths() relies on them
+   int old_journal_fd = dsk.journal_fd, old_meta_fd = dsk.meta_fd, old_data_fd = dsk.data_fd;
    dsk.close_all();
    bool dry_run = options.find("dry_run") != options.end();
    auto old_journal_device = dsk.journal_device;
@@ -48,6 +50,22 @@ int disk_tool_t::resize_data(std::string device)
        if (options.find("move_journal") == options.end())
            options["move_journal"] = dsk.journal_device == dsk.data_device ? "" : dsk.journal_device;
    }
+   uint64_t new_data_dev_size = 0;
+   if (options.find("data_size") != options.end())
+   {
+       new_data_dev_size = parse_size(options["data_size"]);
+       new_data_dev_size = options["data_size"] == "max" || new_data_dev_size > dsk.data_device_size
+           ? dsk.data_device_size : new_data_dev_size;
+       dsk.data_device_size = new_data_dev_size;
+       dsk.cfg_data_size = 0;
+       dsk.journal_fd = old_journal_fd;
+       dsk.meta_fd = old_meta_fd;
+       dsk.data_fd = old_data_fd;
+       dsk.calc_lengths(true);
+       dsk.journal_fd = -1;
+       dsk.meta_fd = -1;
+       dsk.data_fd = -1;
+   }
    std::map<std::string, std::string> move_options;
    if (options.find("move_journal") != options.end())
    {
@@ -69,14 +87,8 @@ int disk_tool_t::resize_data(std::string device)
    new_data_offset += ((dsk.data_offset-new_data_offset) % dsk.data_block_size);
    if (new_data_offset != dsk.data_offset)
        move_options["new_data_offset"] = std::to_string(new_data_offset);
-   if (options.find("data_size") != options.end())
-   {
-       auto new_data_dev_size = parse_size(options["data_size"]);
-       new_data_dev_size = options["data_size"] == "max" || new_data_dev_size > dsk.data_device_size
-           ? dsk.data_device_size : new_data_dev_size;
-       if (new_data_dev_size-dsk.data_offset != dsk.data_len)
-           move_options["new_data_len"] = std::to_string(new_data_dev_size-new_data_offset);
-   }
+   if (new_data_dev_size != 0)
+       move_options["new_data_len"] = std::to_string(new_data_dev_size-new_data_offset);
    new_meta_offset = 4096 + (new_meta_device == new_journal_device ? new_journal_len : 0);
    if (new_meta_offset != dsk.meta_offset)
        move_options["new_meta_offset"] = std::to_string(new_meta_offset);
@@ -188,17 +200,12 @@ int disk_tool_t::resize_parse_move_journal(std::map<std::string, std::string> &
        else
            options["move_journal"] = "<new journal partition on "+parent_dev+">";
    }
-   else if (options["move_journal"].substr(0, 22) != "/dev/disk/by-partuuid/")
-   {
-       // Partitions should be identified by GPT partition UUID
-       fprintf(stderr, "%s does not start with /dev/disk/by-partuuid/. Partitions should be identified by GPT partition UUIDs\n", options["move_journal"].c_str());
-       return 1;
-   }
    else
    {
        // already a partition - check that it's a GPT partition with correct type
-       if (options.find("force") == options.end() &&
-           check_existing_partition(real_dev) != 0)
+       if ((options.find("force") == options.end()
+           ? check_existing_partition(options["move_journal"])
+           : fix_partition_type(options["move_journal"])) != 0)
        {
            return 1;
        }
@@ -269,17 +276,12 @@ int disk_tool_t::resize_parse_move_meta(std::map<std::string, std::string> & mov
        else
            options["move_meta"] = "<new metadata partition on "+parent_dev+">";
    }
-   else if (options["move_meta"].substr(0, 22) != "/dev/disk/by-partuuid/")
-   {
-       // Partitions should be identified by GPT partition UUID
-       fprintf(stderr, "%s does not start with /dev/disk/by-partuuid/. Partitions should be identified by GPT partition UUIDs\n", options["move_meta"].c_str());
-       return 1;
-   }
    else
    {
        // already a partition - check that it's a GPT partition with correct type
-       if (options.find("force") == options.end() &&
-           check_existing_partition(real_dev) != 0)
+       if ((options.find("force") == options.end()
+           ? check_existing_partition(options["move_meta"])
+           : fix_partition_type(options["move_meta"])) != 0)
        {
            return 1;
        }
@@ -122,7 +122,7 @@ uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json para
    sb->size = sb_size;
    memcpy(sb->json_data, json_data.c_str(), json_data.size());
    sb->crc32c = crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf));
-   int fd = open(device.c_str(), O_DIRECT|O_RDWR);
+   int fd = open(device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
    if (fd < 0)
    {
        fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno));
@@ -150,7 +150,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
    json11::Json osd_params;
    std::string json_err;
    std::string real_device, device_type, real_data, real_meta, real_journal;
-   int r, fd = open(device.c_str(), O_DIRECT|O_RDWR);
+   int r, fd = open(device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
    if (fd < 0)
    {
        fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno));
@@ -385,7 +385,7 @@ int disk_tool_t::pre_exec_osd(std::string device)
int disk_tool_t::clear_osd_superblock(const std::string & dev)
{
    uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
-   int fd = -1, r = open(dev.c_str(), O_DIRECT|O_RDWR);
+   int fd = -1, r = open(dev.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
    if (r >= 0)
    {
        fd = r;
@@ -343,23 +343,42 @@ uint64_t free_from_parttable(json11::Json pt)
    return free;
}

-int fix_partition_type(std::string dev_by_uuid)
+int fix_partition_type_uuid(std::string & dev_by_uuid, const std::string & type_uuid)
{
-   auto uuid = strtolower(dev_by_uuid.substr(dev_by_uuid.rfind('/')+1));
-   std::string parent_dev = get_parent_device(realpath_str(dev_by_uuid, false));
+   bool is_partuuid = dev_by_uuid.substr(0, 22) == "/dev/disk/by-partuuid/";
+   auto uuid = is_partuuid ? strtolower(dev_by_uuid.substr(22)) : "";
+   auto node = realpath_str(dev_by_uuid, false);
+   std::string parent_dev = get_parent_device(node);
    if (parent_dev == "")
        return 1;
    auto pt = read_parttable(parent_dev);
    if (pt.is_null() || pt.is_bool())
        return 1;
+   bool found = false;
    std::string script = "label: gpt\n\n";
    for (const auto & part: pt["partitions"].array_items())
    {
-       bool this_part = (strtolower(part["uuid"].string_value()) == uuid);
-       if (this_part && strtolower(part["type"].string_value()) == "e7009fac-a5a1-4d72-af72-53de13059903")
+       bool this_part = (part["node"].string_value() == node) &&
+           (!is_partuuid || strtolower(part["uuid"].string_value()) == uuid);
+       if (this_part)
        {
-           // Already correct type
-           return 0;
+           found = true;
+           if (!is_partuuid)
+           {
+               if (part["uuid"] == "")
+               {
+                   fprintf(stderr, "Could not determine partition UUID for %s. Please use GPT partitions\n", dev_by_uuid.c_str());
+                   return 1;
+               }
+               auto new_dev = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
+               fprintf(stderr, "Using %s instead of %s\n", new_dev.c_str(), dev_by_uuid.c_str());
+               dev_by_uuid = new_dev;
+           }
+           if (strtolower(part["type"].string_value()) == type_uuid)
+           {
+               // Already correct type
+               return 0;
+           }
        }
        script += part["node"].string_value()+": ";
        bool first = true;
@@ -369,13 +388,18 @@ int fix_partition_type(std::string dev_by_uuid)
            {
                script += (first ? "" : ", ")+kv.first+"="+
                    (kv.first == "type" && this_part
-                       ? "e7009fac-a5a1-4d72-af72-53de13059903"
+                       ? type_uuid
                        : (kv.second.is_string() ? kv.second.string_value() : kv.second.dump()));
                first = false;
            }
        }
        script += "\n";
    }
+   if (!found)
+   {
+       fprintf(stderr, "Could not find partition table entry for %s\n", dev_by_uuid.c_str());
+       return 1;
+   }
    std::string out;
    return shell_exec({ "sfdisk", "--no-reread", "--no-tell-kernel", "--force", parent_dev }, script, &out, NULL);
}
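For reference, the script assembled above is a whole-label sfdisk dump in which only the matched partition's type GUID is substituted before the text is piped to `sfdisk --no-reread --no-tell-kernel --force <parent_dev>`. A hypothetical two-partition disk could produce roughly the following input (device names, UUIDs, sizes and the exact key set are invented for illustration; the real fields are whatever read_parttable() returned):

```cpp
// Hypothetical sfdisk input generated for /dev/sda when /dev/sda2 is the matched partition
// and type_uuid is the Vitastor OSD type e7009fac-a5a1-4d72-af72-53de13059903:
const char *example_script =
    "label: gpt\n"
    "\n"
    "/dev/sda1: start=2048, size=2097152, type=0fc63daf-8483-4772-8e79-3d69d8477de4, uuid=11111111-2222-3333-4444-555555555555\n"
    "/dev/sda2: start=2099200, size=41943040, type=e7009fac-a5a1-4d72-af72-53de13059903, uuid=66666666-7777-8888-9999-aaaaaaaaaaaa\n";
```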
@@ -137,7 +137,7 @@ void nfs_proxy_t::handle_rdmacm_events()
    }
    if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
    {
-       rdmacm_accept();
+       rdmacm_accept(ev);
    }
    else if (ev->event == RDMA_CM_EVENT_CONNECT_ERROR ||
        ev->event == RDMA_CM_EVENT_REJECTED ||
@@ -535,10 +535,12 @@ void osd_t::print_stats()

void osd_t::print_slow()
{
-   bool has_slow = false;
+   cur_slow_op_primary = 0;
+   cur_slow_op_secondary = 0;
    char alloc[1024];
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
+   // FIXME: Also track slow local blockstore ops and recovery/flush/scrub ops
    for (auto & kv: msgr.clients)
    {
        for (auto op: kv.second->received_ops)
@@ -608,6 +610,7 @@ void osd_t::print_slow()
                op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ||
                op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
            {
+               cur_slow_op_secondary++;
                bufprintf(" state=%d", op->bs_op ? PRIV(op->bs_op)->op_state : -1);
                int wait_for = op->bs_op ? PRIV(op->bs_op)->wait_for : 0;
                if (wait_for)
@@ -618,15 +621,19 @@ void osd_t::print_slow()
            else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
                op->req.hdr.opcode == OSD_OP_SYNC || op->req.hdr.opcode == OSD_OP_DELETE)
            {
+               cur_slow_op_primary++;
                bufprintf(" state=%d", !op->op_data ? -1 : op->op_data->st);
            }
+           else
+           {
+               cur_slow_op_primary++;
+           }
#undef bufprintf
            printf("%s\n", alloc);
-           has_slow = true;
        }
    }
}
-   if (has_slow && bs)
+   if ((cur_slow_op_primary+cur_slow_op_secondary) > 0 && bs)
    {
        bs->dump_diagnostics();
    }

@@ -150,7 +150,9 @@ class osd_t
    bool pg_config_applied = false;
    bool etcd_reporting_pg_state = false;
    bool etcd_reporting_stats = false;
-   int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1;
+   int print_stats_timer_id = -1, slow_log_timer_id = -1;
+   uint64_t cur_slow_op_primary = 0;
+   uint64_t cur_slow_op_secondary = 0;

    // peers and PGs
@@ -168,6 +170,8 @@ class osd_t
    object_id recovery_last_oid;
    int recovery_pg_done = 0, recovery_done = 0;
    osd_op_t *autosync_op = NULL;
+   int autosync_copies_to_delete = 0;
+   int autosync_timer_id = -1;

    // Scrubbing
    uint64_t scrub_nearest_ts = 0;
@@ -201,6 +201,14 @@ json11::Json osd_t::get_statistics()
    st["immediate_commit"] = immediate_commit == IMMEDIATE_ALL ? "all" : (immediate_commit == IMMEDIATE_SMALL ? "small" : "none");
    st["host"] = self_state["host"];
    st["version"] = VITASTOR_VERSION;
+   if (cur_slow_op_primary > 0)
+   {
+       st["slow_ops_primary"] = cur_slow_op_primary;
+   }
+   if (cur_slow_op_secondary > 0)
+   {
+       st["slow_ops_secondary"] = cur_slow_op_secondary;
+   }
    json11::Json::object op_stats, subop_stats;
    for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
    {
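With these two counters, an OSD that currently has stuck requests reports them in its statistics object, and the keys are only present while the counters are non-zero. An illustrative fragment with invented values, assuming the json11 types used throughout the OSD code:

```cpp
// Illustrative only: what the relevant part of get_statistics() output might look like.
json11::Json::object st_example {
    { "immediate_commit", "all" },
    { "host", "node1" },            // hypothetical host name
    { "version", "x.y.z" },         // placeholder for VITASTOR_VERSION
    { "slow_ops_primary", 3 },      // present only while primary/other ops are stuck
    { "slow_ops_secondary", 1 },    // present only while secondary (SEC_*) ops are stuck
};
```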
@@ -13,10 +13,11 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
    bool first = true;
    while (it != pg.flush_actions.end())
    {
-       if (!first && (it->first.oid.inode != prev_it->first.oid.inode ||
-           (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
-           fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
-           fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH)
+       if (!first &&
+           (it->first.oid.inode != prev_it->first.oid.inode ||
+           (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
+           (fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
+           fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH))
        {
            // Stop only at the object boundary
            break;
@@ -75,6 +76,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
        // Throw the result away
        return;
    }
+   fb->flush_done++;
    if (retval != 0)
    {
        if (peer_osd == this->osd_num)
@@ -92,12 +94,11 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
            auto fd_it = msgr.osd_peer_fds.find(peer_osd);
            if (fd_it != msgr.osd_peer_fds.end())
            {
+               // Will repeer/stop this PG
                msgr.stop_client(fd_it->second);
            }
-           return;
        }
    }
-   fb->flush_done++;
    if (fb->flush_done == fb->flush_ops)
    {
        // This flush batch is done
@@ -645,6 +645,18 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
    {
        throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
    }
+   if (changed && immediate_commit != IMMEDIATE_ALL)
+   {
+       // Trigger double automatic sync after changing PG state when we're running with fsyncs.
+       // First autosync commits all written objects and applies copies_to_delete_after_sync;
+       // Second autosync commits all deletions run by the first sync.
+       // Without it, rebalancing in a cluster without load may result in some small amount of
+       // garbage left on "extra" OSDs of the PG, because last deletions are not synced at all.
+       // FIXME: 1000% correct way is to switch PG state only after copies_to_delete_after_sync.
+       // But it's much more complicated.
+       unstable_write_count += autosync_writes;
+       autosync_copies_to_delete = 2;
+   }
    if (changed && report)
    {
        report_pg_state(pg);
@@ -9,6 +9,10 @@ void osd_t::autosync()
{
    if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
    {
+       if (autosync_copies_to_delete > 0)
+       {
+           autosync_copies_to_delete--;
+       }
        autosync_op = new osd_op_t();
        autosync_op->op_type = OSD_OP_IN;
        autosync_op->peer_fd = SELF_FD;
@@ -29,6 +33,11 @@ void osd_t::autosync()
            }
            delete autosync_op;
            autosync_op = NULL;
+           if (autosync_copies_to_delete > 0)
+           {
+               // Trigger the second "copies_to_delete" autosync
+               autosync();
+           }
        };
        exec_op(autosync_op);
    }
@@ -213,6 +213,15 @@ resume_8:
        {
            goto resume_6;
        }
+       if (immediate_commit == IMMEDIATE_NONE)
+       {
+           // Mark OSDs as dirty because deletions have to be synced too!
+           for (int i = 0; i < op_data->copies_to_delete_count; i++)
+           {
+               auto & chunk = op_data->copies_to_delete[i];
+               this->dirty_osds.insert(chunk.osd_num);
+           }
+       }
    }
    for (int i = 0; i < op_data->dirty_pg_count; i++)
    {
@@ -227,7 +236,7 @@ resume_8:
            start_pg_peering(pg);
        }
    }
-   // FIXME: Free those in the destructor?
+   // FIXME: Free those in the destructor (not here)?
    free(op_data->dirty_pgs);
    op_data->dirty_pgs = NULL;
    op_data->dirty_osds = NULL;
@@ -7,6 +7,12 @@
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
{
    osd_primary_op_data_t *op_data = cur_op->op_data;
+   // First check if PG is not active anymore
+   if (!(pg.state & PG_ACTIVE))
+   {
+       pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
+       return false;
+   }
    // Check if actions are pending for this object
    auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
        .oid = op_data->oid,
@@ -65,7 +65,7 @@ std::string addr_to_string(const sockaddr_storage &addr)
    return std::string(peer_str)+":"+std::to_string(port);
}

-static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
+bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
{
    if (bits == 0)
    {
@@ -75,7 +75,7 @@ static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
    return !((addr.s_addr ^ net.s_addr) & htonl(0xFFFFFFFFu << (32 - bits)));
}

-static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
+bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
{
    const uint32_t *a = address.s6_addr32;
    const uint32_t *n = network.s6_addr32;
@@ -93,47 +93,49 @@ static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_
    return true;
}

-struct addr_mask_t
-{
-    sa_family_t family;
-    in_addr ipv4;
-    in6_addr ipv6;
-    uint8_t bits;
-};
+addr_mask_t cidr_parse(std::string mask)
+{
+    unsigned bits = 255;
+    int p = mask.find('/');
+    if (p != std::string::npos)
+    {
+        char null_byte = 0;
+        if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
+            throw std::runtime_error("Invalid IP address mask: " + mask);
+        mask = mask.substr(0, p);
+    }
+    in_addr ipv4;
+    in6_addr ipv6;
+    if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
+    {
+        if (bits == 255)
+            bits = 32;
+        if (bits > 32)
+            throw std::runtime_error("Invalid IP address mask: " + mask);
+        return (addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)(bits ? bits : 32) };
+    }
+    else if (inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
+    {
+        if (bits == 255)
+            bits = 128;
+        return (addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits };
+    }
+    else
+    {
+        throw std::runtime_error("Invalid IP address mask: " + mask);
+    }
+}

std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool include_v6)
{
    std::vector<addr_mask_t> masks;
    for (auto mask: mask_cfg)
    {
-       unsigned bits = 0;
-       int p = mask.find('/');
-       if (p != std::string::npos)
-       {
-           char null_byte = 0;
-           if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
-           {
-               throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
-           }
-           mask = mask.substr(0, p);
-       }
-       in_addr ipv4;
-       in6_addr ipv6;
-       if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
-       {
-           if (bits > 32)
-           {
-               throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
-           }
-           masks.push_back((addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)bits });
-       }
-       else if (include_v6 && inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
-       {
-           masks.push_back((addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits });
-       }
-       else
-       {
-           throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
-       }
+       masks.push_back(cidr_parse(mask));
+       if (masks[masks.size()-1].family == AF_INET6)
+       {
+           // Auto-enable IPv6 addresses
+           include_v6 = true;
+       }
    }
    std::set<std::string> addresses;
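Since cidr_parse(), cidr_match() and cidr6_match() are now exported (see the header change below), other code can reuse them directly. A quick usage sketch, assuming addr_util.h is included (the helper name addr_in_subnet is made up for the example):

```cpp
#include <arpa/inet.h>
#include <string>

// Returns true if addr_str falls inside the CIDR subnet given by mask_str.
bool addr_in_subnet(const std::string & addr_str, const std::string & mask_str)
{
    addr_mask_t mask = cidr_parse(mask_str);   // throws std::runtime_error on a bad mask
    if (mask.family == AF_INET)
    {
        in_addr a;
        return inet_pton(AF_INET, addr_str.c_str(), &a) == 1 && cidr_match(a, mask.ipv4, mask.bits);
    }
    in6_addr a6;
    return inet_pton(AF_INET6, addr_str.c_str(), &a6) == 1 && cidr6_match(a6, mask.ipv6, mask.bits);
}

// addr_in_subnet("10.0.1.5", "10.0.0.0/16") -> true
// addr_in_subnet("fd00::5", "fd00::/8")     -> true
```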
@@ -1,10 +1,22 @@
#pragma once

+#include <netinet/in.h>
#include <sys/socket.h>
#include <string>
#include <vector>

+struct addr_mask_t
+{
+    sa_family_t family;
+    in_addr ipv4;
+    in6_addr ipv6;
+    uint8_t bits;
+};
+
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr);
std::string addr_to_string(const sockaddr_storage &addr);
+addr_mask_t cidr_parse(std::string mask);
+bool cidr_match(const in_addr &address, const in_addr &network, uint8_t bits);
+bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits);
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port);
@@ -62,7 +62,7 @@ int timerfd_manager_t::set_timer_us(uint64_t micros, bool repeat, std::function<
        .callback = callback,
    });
    inc_timer(timers[timers.size()-1]);
-   set_nearest();
+   set_nearest(false);
    return timer_id;
}

@@ -82,13 +82,13 @@ void timerfd_manager_t::clear_timer(int timer_id)
            {
                nearest--;
            }
-           set_nearest();
+           set_nearest(false);
            break;
        }
    }
}

-void timerfd_manager_t::set_nearest()
+void timerfd_manager_t::set_nearest(bool trigger_inline)
{
    if (onstack > 0)
    {
@@ -134,10 +134,13 @@ again:
        }
        if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0)
        {
-           // It already happened
-           // FIXME: Postpone to setImmediate/BH to avoid reenterability problems
-           trigger_nearest();
-           goto again;
+           // It already happened - set minimal timeout
+           if (trigger_inline)
+           {
+               trigger_nearest();
+               goto again;
+           }
+           exp.it_value = { .tv_sec = 0, .tv_nsec = 1 };
        }
        if (timerfd_settime(timerfd, 0, &exp, NULL))
        {
@@ -157,7 +160,7 @@ void timerfd_manager_t::handle_readable()
        trigger_nearest();
    }
    wait_state = 0;
-   set_nearest();
+   set_nearest(true);
}

void timerfd_manager_t::trigger_nearest()

@@ -26,7 +26,7 @@ class timerfd_manager_t
    std::vector<timerfd_timer_t> timers;

    void inc_timer(timerfd_timer_t & t);
-   void set_nearest();
+   void set_nearest(bool trigger_inline);
    void trigger_nearest();
    void handle_readable();
public:
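The trigger_inline flag addresses the reentrancy noted in the old FIXME: set_timer_us() and clear_timer() may themselves be called from inside a timer callback, so firing an already-expired timer synchronously from them could re-enter the manager while its timer list is being modified. A small sketch of the situation (tfd is a hypothetical timerfd_manager_t pointer, and the callback signature follows the set_timer_us() declaration above):

```cpp
// Sketch only: why expired timers are no longer triggered inline from set_timer_us()/clear_timer().
int id = tfd->set_timer_us(10, false, [&](int timer_id)
{
    // If this ran synchronously from within set_timer_us() or clear_timer(), it would
    // re-enter timerfd_manager_t while the caller is still manipulating the timer vector.
    tfd->clear_timer(timer_id);
});
// With set_nearest(false), an already-expired timer is instead armed with a 1 ns timerfd
// timeout, so the callback runs from handle_readable() on the next event loop iteration.
// Only handle_readable() itself calls set_nearest(true) and may trigger timers inline.
```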
@@ -68,6 +68,9 @@ TEST_NAME=csum_4k_dmj OSD_ARGS="--data_csum_type crc32c --inmemory_metadata fal
TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh

+./test_resize.sh
+./test_resize_auto.sh
+
./test_snapshot_pool2.sh

./test_osd_tags.sh
@@ -3,6 +3,7 @@
PG_COUNT=${PG_COUNT:-32}

. `dirname $0`/run_3osds.sh
+check_qemu

LD_PRELOAD="build/src/client/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bs=4M -direct=1 -iodepth=4 \
@@ -26,22 +27,22 @@ for i in $(seq 1 $OSD_COUNT); do
    offsets=$(build/src/disk_tool/vitastor-disk simple-offsets --format json ./testdata/bin/test_osd$i.bin)
    meta_offset=$(echo $offsets | jq -r .meta_offset)
    data_offset=$(echo $offsets | jq -r .data_offset)
-   build/src/disk_tool/vitastor-disk dump-journal --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json
-   build/src/disk_tool/vitastor-disk dump-meta ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json
-   build/src/disk_tool/vitastor-disk resize \
+   build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json
+   build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json
+   build/src/disk_tool/vitastor-disk raw-resize --io cached \
        $(build/src/disk_tool/vitastor-disk simple-offsets --format options ./testdata/bin/test_osd$i.bin 2>/dev/null) \
        --new_meta_offset 0 \
        --new_meta_len $((1024*1024)) \
        --new_journal_offset $((1024*1024)) \
-       --new_data_offset $((128*1024*1024))
-   build/src/disk_tool/vitastor-disk dump-journal --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json
-   build/src/disk_tool/vitastor-disk dump-meta ./testdata/bin/test_osd$i.bin 4096 0 $((1024*1024)) >./testdata/meta_after_resize.json
+       --new_data_offset $((128*1024*1024+32768))
+   build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json
+   build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 0 $((1024*1024)) >./testdata/meta_after_resize.json
    if ! (cat ./testdata/meta_before_resize.json ./testdata/meta_after_resize.json | \
        jq -e -s 'map([ .entries[] | del(.block) ] | sort_by(.pool, .inode, .stripe)) | .[0] == .[1] and (.[0] | length) > 1000'); then
        format_error "OSD $i metadata corrupted after resizing"
    fi
    if ! (cat ./testdata/journal_before_resize.json ./testdata/journal_after_resize.json | \
-       jq -e -s 'map([ .[].entries[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then
+       jq -e -s 'map([ .[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then
        format_error "OSD $i journal corrupted after resizing"
    fi
done
@@ -53,7 +54,7 @@ for i in $(seq 1 $OSD_COUNT); do
        --data_device ./testdata/bin/test_osd$i.bin \
        --meta_offset 0 \
        --journal_offset $((1024*1024)) \
-       --data_offset $((128*1024*1024)) >>./testdata/osd$i.log 2>&1 &
+       --data_offset $((128*1024*1024+32768)) >>./testdata/osd$i.log 2>&1 &
    eval OSD${i}_PID=$!
done
@ -0,0 +1,94 @@
#!/bin/bash -ex

ANTIETCD=1
. `dirname $0`/common.sh

[[ -e build/src/disk_tool/vitastor-disk-test ]] || ln -s vitastor-disk build/src/disk_tool/vitastor-disk-test

dd if=/dev/zero of=./testdata/bin/test_osd1.bin bs=1 count=1 seek=$((100*1024*1024*1024-1))
LOOP1=$(sudo losetup --show -f ./testdata/bin/test_osd1.bin)
trap "kill -9 $(jobs -p) || true; sudo losetup -d $LOOP1"' || true' EXIT
dd if=/dev/zero of=./testdata/bin/test_meta.bin bs=1 count=1 seek=$((1024*1024*1024-1))
LOOP2=$(sudo losetup --show -f ./testdata/bin/test_meta.bin)
trap "kill -9 $(jobs -p) || true; sudo losetup -d $LOOP1 $LOOP2"' || true' EXIT

# also test prepare --hybrid :)
# non-vitastor random type UUID to prevent udev activation
mount | grep '/dev type devtmpfs' || sudo mount udev /dev/ -t devtmpfs
sudo build/src/disk_tool/vitastor-disk-test prepare --no_init 1 --meta_reserve 1x,1M \
    --block_size 131072 --osd_num 987654 --part_type_uuid 0df42ae0-3695-4395-a957-7d5ff3645c56 \
    --hybrid --fast-devices $LOOP2 $LOOP1

# write almost empty journal
node <<EOF > ./testdata/journal.json
console.log(JSON.stringify([
    {"type":"start","start":"0x1000"},
    {"type":"big_write_instant","inode":"0x1000000000001","stripe":"0xc60000","ver":"10","offset":0,"len":131072,"loc":"0x18ffdc0000","bitmap":"ffffffff"}
]));
EOF
sudo build/src/disk_tool/vitastor-disk write-journal ${LOOP1}p1 < ./testdata/journal.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
diff ./testdata/j1.json ./testdata/j2.json

# write fake metadata items in the end
DATA_DEV_SIZE=$(sudo blockdev --getsize64 ${LOOP1}p1)
BLOCK_COUNT=$(((DATA_DEV_SIZE-4096)/128/1024))
node <<EOF > ./testdata/meta.json
console.log(JSON.stringify({
    version: "0.9",
    meta_block_size: 4096,
    data_block_size: 131072,
    bitmap_granularity: 4096,
    data_csum_type: "none",
    csum_block_size: 0,
    entries: [ ...new Array(100).keys() ].map(i => ({
        block: ($BLOCK_COUNT-100)+i,
        pool: 1,
        inode: "0x1",
        stripe: "0x"+Number(i*0x20000).toString(16),
        version: 10,
        bitmap: "ffffffff",
        ext_bitmap: "ffffffff",
    })),
}));
EOF

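# Editor's note (not part of the original script): the 100 generated entries
# occupy the last 100 data blocks, BLOCK_COUNT-100 .. BLOCK_COUNT-1, with
# stripes growing in 128 KiB (0x20000) steps; the last stripe is the same
# 0xc60000 referenced by the big_write_instant journal entry written above:
node -e 'console.log("0x" + (99*0x20000).toString(16))'   # prints 0xc60000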
# also test write & dump
|
||||||
|
sudo build/src/disk_tool/vitastor-disk write-meta ${LOOP1}p1 < ./testdata/meta.json
|
||||||
|
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 > ./testdata/compare.json
|
||||||
|
jq -S < ./testdata/meta.json > ./testdata/1.json
|
||||||
|
jq -S < ./testdata/compare.json > ./testdata/2.json
|
||||||
|
diff ./testdata/1.json ./testdata/2.json
|
||||||
|
|
||||||
|
# move journal & meta back, data will become smaller; end indexes should be shifted by -1251
|
||||||
|
sudo build/src/disk_tool/vitastor-disk-test resize --move-journal '' --move-meta '' ${LOOP1}p1
|
||||||
|
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
|
||||||
|
jq -S '. + {"entries": [ .entries[] | (. + { "block": (.block-1251) }) ]}' < ./testdata/meta.json > ./testdata/1.json
|
||||||
|
diff ./testdata/1.json ./testdata/2.json
|
||||||
|
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
|
||||||
|
jq -S '[ (.[] + {"valid":true}) | (if .type == "big_write_instant" then . + {"loc":"0x18f6160000"} else . end) ]' < ./testdata/journal.json > ./testdata/j1.json
|
||||||
|
diff ./testdata/j1.json ./testdata/j2.json
|
||||||
|
|
||||||
|
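# Editor's note (not part of the original script): the expected -1251 block
# shift agrees with the adjusted journal "loc" above -- moving the journal and
# metadata onto the data partition apparently consumes 1251 data blocks, and
# 1251 * 131072 bytes is exactly the distance between the two expected
# locations:
printf '0x%x\n' $(( 0x18ffdc0000 - 0x18f6160000 ))   # 0x9c60000
echo $(( 1251 * 131072 ))                            # 163971072 = 0x9c60000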
# move journal & meta out, data will become larger; end indexes should be shifted back by +1251
|
||||||
|
sudo build/src/disk_tool/vitastor-disk-test resize --move-journal ${LOOP2}p1 --move-meta ${LOOP2}p2 ${LOOP1}p1
|
||||||
|
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
|
||||||
|
jq -S < ./testdata/meta.json > ./testdata/1.json
|
||||||
|
diff ./testdata/1.json ./testdata/2.json
|
||||||
|
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
|
||||||
|
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
|
||||||
|
|
||||||
|
# reduce data device size by exactly 128k * 99 (occupied blocks); exactly 1 should be left in place :)
|
||||||
|
sudo build/src/disk_tool/vitastor-disk-test resize --data-size $((DATA_DEV_SIZE-128*1024*99)) ${LOOP1}p1
|
||||||
|
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
|
||||||
|
jq -S '. + {"entries": ([ .entries[] | (. + { "block": (.block | if . > '$BLOCK_COUNT'-100 then .-('$BLOCK_COUNT'-100+1) else '$BLOCK_COUNT'-100 end) }) ] | .[1:] + [ .[0] ])}' < ./testdata/meta.json > ./testdata/1.json
|
||||||
|
diff ./testdata/1.json ./testdata/2.json
|
||||||
|
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
|
||||||
|
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
|
||||||
|
|
||||||
|
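# Editor's note (not part of the original script): shrinking the data area by
# 99 blocks leaves only the first fake entry (block BLOCK_COUNT-100) in place;
# the other 99 entries must be relocated to free blocks 0..98 at the start of
# the device. Since dump-meta apparently lists entries in block order, the
# expected array is also rotated with .[1:] + [ .[0] ]. Toy illustration with
# an assumed BLOCK_COUNT of 1000 (fake entries at blocks 900..999):
jq -n '[range(900;1000)] | map(if . > 900 then . - 901 else 900 end)'
# prints [900, 0, 1, ..., 98]: block 900 stays, the rest move to the start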
# extend data device size to maximum
sudo build/src/disk_tool/vitastor-disk-test resize --data-size max ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
diff ./testdata/1.json ./testdata/2.json

format_green OK