Compare commits

...

18 Commits

Author SHA1 Message Date
101592bbff Release 0.7.1
- Add ISA-L erasure code implementation, now used automatically instead of jerasure when available
- Fix listings sending too many parallel requests to OSDs
- Fix rm-data crashing with --wait-list
- Remove empty inodes from statistics and `ls` output, after <inode_vanish_time> seconds after deletion
- Make monitor delete pool statistics when the pool is deleted and thus remove them from `df` output
- Log multiple etcd addresses in OSD logs correctly
- Fix true/false parsing in json configs like no_recovery/no_rebalance
- Show no_recovery, no_rebalance, readonly flags in status
2022-06-05 00:07:24 +03:00
be4087d9d2 Add a FIXME to test_interrupted_rebalance 2022-06-05 00:06:56 +03:00
404e43dd2d Note that ISA-L does not need to be enabled separately 2022-06-04 22:58:02 +03:00
87613ed590 Add ISA-L into RPM specs 2022-06-04 13:27:06 +03:00
2a2e914ef9 Show no_recovery, no_rebalance and readonly flags in status 2022-06-04 13:27:06 +03:00
0cdc9292c8 Fix true/false parsing in json configs like no_recovery/no_rebalance 2022-06-04 13:27:06 +03:00
3e1b03bb5c Show all etcd addresses in the "reporting to..." message 2022-06-04 13:27:06 +03:00
36e851505a Make monitor delete pool statistics when the pool is deleted 2022-06-04 13:27:06 +03:00
1efbbb0c36 Make deleted inodes vanish from statistics after 60 seconds 2022-06-04 13:27:06 +03:00
088dd15449 Exclude empty inodes from stats 2022-06-04 00:18:17 +03:00
4a531d7b8b Fix listings sending too many parallel requests to OSDs, fix rm-data crashing with --wait-list 2022-06-03 23:36:37 +03:00
a0cae4c180 Rename "jerasure" to "ec" in pool configuration, function names, fix documentation and Debian build scripts
Old pool configurations with "jerasure" also remain supported as an alias for "ec"
2022-06-03 15:40:00 +03:00
c4eb46600d Merge run_3osds and run_7osds scripts 2022-06-03 01:56:36 +03:00
21b306e25f Add ISA-L support 2022-06-02 01:47:33 +03:00
d8313e939a Release 0.7.0
- Add documentation! :-) in Russian and English
- Implement an NFS proxy for file-based access emulation to Vitastor
  images for non-QEMU based hypervisors like VMWare, as a better way
  than iSCSI
- Implement "primary affinity tags"
- Add a patch for libvirt 6.0
- Fix free_down_raw in cli status
- Fix a rare bug where OSDs could drop unrelated connections on errors
2022-05-29 23:39:53 +03:00
3e92c3f082 Add patch for libvirt 6.0 2022-05-28 23:53:41 +03:00
82b9f4c52d Add a test with OSD kills 2022-05-28 00:51:14 +03:00
2bdf415eb3 Fix unknown OSD numbers on error 2022-05-28 00:51:14 +03:00
67 changed files with 1312 additions and 303 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor) project(vitastor)
set(VERSION "0.6.17") set(VERSION "0.7.1")
add_subdirectory(src) add_subdirectory(src)

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.6.17 VERSION ?= v0.7.1
all: build push all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities: capabilities:
add: ["SYS_ADMIN"] add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.6.17 image: vitalif/vitastor-csi:v0.7.1
args: args:
- "--node=$(NODE_ID)" - "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)" - "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -116,7 +116,7 @@ spec:
privileged: true privileged: true
capabilities: capabilities:
add: ["SYS_ADMIN"] add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.6.17 image: vitalif/vitastor-csi:v0.7.1
args: args:
- "--node=$(NODE_ID)" - "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)" - "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const ( const (
vitastorCSIDriverName = "csi.vitastor.io" vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.6.17" vitastorCSIDriverVersion = "0.7.1"
) )
// Config struct fills the parameters of request or user input // Config struct fills the parameters of request or user input

16
debian/changelog vendored
View File

@@ -1,4 +1,18 @@
vitastor (0.6.17-1) unstable; urgency=medium vitastor (0.7.1-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.7.0-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Sun, 29 May 2022 23:39:13 +0300
vitastor (0.6.3-1) unstable; urgency=medium
* RDMA support * RDMA support
* Bugfixes * Bugfixes

2
debian/control vendored
View File

@@ -2,7 +2,7 @@ Source: vitastor
Section: admin Section: admin
Priority: optional Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru> Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
Standards-Version: 4.5.0 Standards-Version: 4.5.0
Homepage: https://vitastor.io/ Homepage: https://vitastor.io/
Rules-Requires-Root: no Rules-Requires-Root: no

11
debian/libisal.pc vendored Normal file
View File

@@ -0,0 +1,11 @@
prefix=/usr
exec_prefix=${prefix}
libdir=${prefix}/lib/x86_64-linux-gnu
includedir=${prefix}/include
Name: libisal
Description: Library for storage systems
Version: 2.30.0
Libs: -L${libdir} -lisal
Libs.private:
Cflags: -I${includedir}

View File

@@ -22,10 +22,11 @@ RUN apt-get update
RUN apt-get -y install fio liburing1 liburing-dev libgoogle-perftools-dev devscripts RUN apt-get -y install fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep fio RUN apt-get -y build-dep fio
RUN apt-get --download-only source fio RUN apt-get --download-only source fio
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev
ADD . /root/vitastor ADD . /root/vitastor
RUN set -e -x; \ RUN set -e -x; \
[ -e /usr/lib/x86_64-linux-gnu/pkgconfig/libisal.pc ] || cp /root/vitastor/debian/libisal.pc /usr/lib/x86_64-linux-gnu/pkgconfig; \
mkdir -p /root/fio-build/; \ mkdir -p /root/fio-build/; \
cd /root/fio-build/; \ cd /root/fio-build/; \
rm -rf /root/fio-build/*; \ rm -rf /root/fio-build/*; \
@@ -33,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \ mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \ rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \ cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.6.17; \ cp -r /root/vitastor vitastor-0.7.1; \
cd vitastor-0.6.17; \ cd vitastor-0.7.1; \
ln -s /root/fio-build/fio-*/ ./fio; \ ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \ ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -47,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \ rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \ echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \ cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.17.orig.tar.xz vitastor-0.6.17; \ tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.7.1.orig.tar.xz vitastor-0.7.1; \
cd vitastor-0.6.17; \ cd vitastor-0.7.1; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \ DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \ DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -23,6 +23,7 @@ initialization and can be changed with an OSD restart.
- [no_rebalance](#no_rebalance) - [no_rebalance](#no_rebalance)
- [print_stats_interval](#print_stats_interval) - [print_stats_interval](#print_stats_interval)
- [slow_log_interval](#slow_log_interval) - [slow_log_interval](#slow_log_interval)
- [inode_vanish_time](#inode_vanish_time)
- [max_write_iodepth](#max_write_iodepth) - [max_write_iodepth](#max_write_iodepth)
- [min_flusher_count](#min_flusher_count) - [min_flusher_count](#min_flusher_count)
- [max_flusher_count](#max_flusher_count) - [max_flusher_count](#max_flusher_count)
@@ -163,6 +164,13 @@ Time interval at which OSDs dump slow or stuck operations on stdout, if
there are any. Also it's the time after which an operation is considered there are any. Also it's the time after which an operation is considered
"slow". "slow".
## inode_vanish_time
- Type: seconds
- Default: 60
Number of seconds after which a deleted inode is removed from OSD statistics.
## max_write_iodepth ## max_write_iodepth
- Type: integer - Type: integer

View File

@@ -24,6 +24,7 @@
- [no_rebalance](#no_rebalance) - [no_rebalance](#no_rebalance)
- [print_stats_interval](#print_stats_interval) - [print_stats_interval](#print_stats_interval)
- [slow_log_interval](#slow_log_interval) - [slow_log_interval](#slow_log_interval)
- [inode_vanish_time](#inode_vanish_time)
- [max_write_iodepth](#max_write_iodepth) - [max_write_iodepth](#max_write_iodepth)
- [min_flusher_count](#min_flusher_count) - [min_flusher_count](#min_flusher_count)
- [max_flusher_count](#max_flusher_count) - [max_flusher_count](#max_flusher_count)
@@ -169,6 +170,13 @@ OSD.
медленных или зависших операций, если таковые имеются. Также время, при медленных или зависших операций, если таковые имеются. Также время, при
превышении которого операция считается "медленной". превышении которого операция считается "медленной".
## inode_vanish_time
- Тип: секунды
- Значение по умолчанию: 60
Число секунд, через которое удалённый инод удаляется и из статистики OSD.
## max_write_iodepth ## max_write_iodepth
- Тип: целое число - Тип: целое число

View File

@@ -106,9 +106,12 @@ Pool name.
- Type: string - Type: string
- Required - Required
- One of: "replicated", "xor" or "jerasure" - One of: "replicated", "xor", "ec" or "jerasure"
Redundancy scheme used for data in this pool. Redundancy scheme used for data in this pool. "jerasure" is an alias for "ec",
both use Reed-Solomon-Vandermonde codes based on ISA-L or jerasure libraries.
Fast ISA-L based implementation is used automatically when it's available,
slower jerasure version is used otherwise.
## pg_size ## pg_size
@@ -243,7 +246,7 @@ of the OSDs containing a data chunk for a PG.
{ {
"2": { "2": {
"name":"ecpool", "name":"ecpool",
"scheme":"jerasure", "scheme":"ec",
"pg_size":3, "pg_size":3,
"parity_chunks":1, "parity_chunks":1,
"pg_minsize":2, "pg_minsize":2,

View File

@@ -106,9 +106,13 @@
- Тип: строка - Тип: строка
- Обязательный - Обязательный
- Возможные значения: "replicated", "xor" или "jerasure" - Возможные значения: "replicated", "xor", "ec" или "jerasure"
Схема избыточности, используемая в данном пуле. Схема избыточности, используемая в данном пуле. "jerasure" - синоним для "ec",
в обеих схемах используются коды Рида-Соломона-Вандермонда, реализованные на
основе библиотек ISA-L или jerasure. Быстрая реализация на основе ISA-L
используется автоматически, когда доступна, в противном случае используется
более медленная jerasure-версия.
## pg_size ## pg_size
@@ -242,7 +246,7 @@ PG в Vitastor эфемерны, то есть вы можете менят
{ {
"2": { "2": {
"name":"ecpool", "name":"ecpool",
"scheme":"jerasure", "scheme":"ec",
"pg_size":3, "pg_size":3,
"parity_chunks":1, "parity_chunks":1,
"pg_minsize":2, "pg_minsize":2,

View File

@@ -158,6 +158,13 @@
Временной интервал, с которым OSD выводят в стандартный вывод список Временной интервал, с которым OSD выводят в стандартный вывод список
медленных или зависших операций, если таковые имеются. Также время, при медленных или зависших операций, если таковые имеются. Также время, при
превышении которого операция считается "медленной". превышении которого операция считается "медленной".
- name: inode_vanish_time
type: sec
default: 60
info: |
Number of seconds after which a deleted inode is removed from OSD statistics.
info_ru: |
Число секунд, через которое удалённый инод удаляется и из статистики OSD.
- name: max_write_iodepth - name: max_write_iodepth
type: int type: int
default: 128 default: 128

View File

@@ -15,7 +15,8 @@
- gcc and g++ 8 or newer, clang 10 or newer, or other compiler with C++11 plus - gcc and g++ 8 or newer, clang 10 or newer, or other compiler with C++11 plus
designated initializers support from C++20 designated initializers support from C++20
- CMake - CMake
- liburing, jerasure headers - liburing, jerasure headers and libraries
- ISA-L, libibverbs headers and libraries (optional)
- tcmalloc (google-perftools-dev) - tcmalloc (google-perftools-dev)
## Basic instructions ## Basic instructions

View File

@@ -15,7 +15,8 @@
- gcc и g++ >= 8, либо clang >= 10, либо другой компилятор с поддержкой C++11 плюс - gcc и g++ >= 8, либо clang >= 10, либо другой компилятор с поддержкой C++11 плюс
назначенных инициализаторов (designated initializers) из C++20 назначенных инициализаторов (designated initializers) из C++20
- CMake - CMake
- Заголовки liburing, jerasure - Заголовки и библиотеки liburing, jerasure
- Опционально - заголовки и библиотеки ISA-L, libibverbs
- tcmalloc (google-perftools-dev) - tcmalloc (google-perftools-dev)
## Базовая инструкция ## Базовая инструкция

View File

@@ -15,7 +15,7 @@
- Basic part: highly-available block storage with symmetric clustering and no SPOF - Basic part: highly-available block storage with symmetric clustering and no SPOF
- [Performance](../performance/comparison1.en.md) ;-D - [Performance](../performance/comparison1.en.md) ;-D
- [Multiple redundancy schemes](../config/pool.en.md#scheme): Replication, XOR n+1, Reed-Solomon erasure codes - [Multiple redundancy schemes](../config/pool.en.md#scheme): Replication, XOR n+1, Reed-Solomon erasure codes
based on jerasure library with any number of data and parity drives in a group based on jerasure and ISA-L libraries with any number of data and parity drives in a group
- Configuration via simple JSON data structures in etcd (parameters, pools and images) - Configuration via simple JSON data structures in etcd (parameters, pools and images)
- Automatic data distribution over OSDs, with support for: - Automatic data distribution over OSDs, with support for:
- Mathematical optimization for better uniformity and less data movement - Mathematical optimization for better uniformity and less data movement

View File

@@ -15,7 +15,7 @@
- Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа - Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа
- [Производительность](../comparison1.ru.md) ;-D - [Производительность](../comparison1.ru.md) ;-D
- [Несколько схем отказоустойчивости](../config/pool.ru.md#scheme): репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок - [Несколько схем отказоустойчивости](../config/pool.ru.md#scheme): репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок
Рида-Соломона на основе библиотеки jerasure с любым числом дисков данных и чётности в группе Рида-Соломона на основе библиотек jerasure и ISA-L с любым числом дисков данных и чётности в группе
- Конфигурация через простые человекочитаемые JSON-структуры в etcd - Конфигурация через простые человекочитаемые JSON-структуры в etcd
- Автоматическое распределение данных по OSD, с поддержкой: - Автоматическое распределение данных по OSD, с поддержкой:
- Математической оптимизации для лучшей равномерности распределения и минимизации перемещений данных - Математической оптимизации для лучшей равномерности распределения и минимизации перемещений данных

View File

@@ -63,11 +63,11 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
"scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}' "scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'
``` ```
For jerasure pools the configuration should look like the following: For EC pools the configuration should look like the following:
``` ```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool", etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}' "scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
``` ```
After you do this, one of the monitors will configure PGs and OSDs will start them. After you do this, one of the monitors will configure PGs and OSDs will start them.

View File

@@ -75,7 +75,7 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
``` ```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool", etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}' "scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
``` ```
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их. После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.

View File

@@ -105,6 +105,7 @@ const etcd_tree = {
no_rebalance: false, no_rebalance: false,
print_stats_interval: 3, print_stats_interval: 3,
slow_log_interval: 10, slow_log_interval: 10,
inode_vanish_time: 60,
osd_memlock: false, osd_memlock: false,
// blockstore - fixed in superblock // blockstore - fixed in superblock
block_size, block_size,
@@ -147,11 +148,11 @@ const etcd_tree = {
/* pools: { /* pools: {
<id>: { <id>: {
name: 'testpool', name: 'testpool',
// jerasure uses Reed-Solomon-Vandermonde codes // 'ec' uses Reed-Solomon-Vandermonde codes, 'jerasure' is an alias for 'ec'
scheme: 'replicated' | 'xor' | 'jerasure', scheme: 'replicated' | 'xor' | 'ec' | 'jerasure',
pg_size: 3, pg_size: 3,
pg_minsize: 2, pg_minsize: 2,
// number of parity chunks, required for jerasure // number of parity chunks, required for EC
parity_chunks?: 1, parity_chunks?: 1,
pg_count: 100, pg_count: 100,
failure_domain: 'host', failure_domain: 'host',
@@ -1013,14 +1014,15 @@ class Mon
console.log('Pool ID '+pool_id+' is invalid'); console.log('Pool ID '+pool_id+' is invalid');
return false; return false;
} }
if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' && pool_cfg.scheme !== 'jerasure') if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' &&
pool_cfg.scheme !== 'ec' && pool_cfg.scheme !== 'jerasure')
{ {
if (warn) if (warn)
console.log('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated" and "jerasure" required)'); console.log('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated", "ec" and "jerasure" required)');
return false; return false;
} }
if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || pool_cfg.pg_size > 256 || if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || pool_cfg.pg_size > 256 ||
(pool_cfg.scheme === 'xor' || pool_cfg.scheme == 'jerasure') && pool_cfg.pg_size < 3) pool_cfg.scheme !== 'replicated' && pool_cfg.pg_size < 3)
{ {
if (warn) if (warn)
console.log('Pool '+pool_id+' has invalid pg_size'); console.log('Pool '+pool_id+' has invalid pg_size');
@@ -1039,7 +1041,8 @@ class Mon
console.log('Pool '+pool_id+' has invalid parity_chunks (must be 1)'); console.log('Pool '+pool_id+' has invalid parity_chunks (must be 1)');
return false; return false;
} }
if (pool_cfg.scheme === 'jerasure' && (pool_cfg.parity_chunks < 1 || pool_cfg.parity_chunks > pool_cfg.pg_size-2)) if ((pool_cfg.scheme === 'ec' || pool_cfg.scheme === 'jerasure') &&
(pool_cfg.parity_chunks < 1 || pool_cfg.parity_chunks > pool_cfg.pg_size-2))
{ {
if (warn) if (warn)
console.log('Pool '+pool_id+' has invalid parity_chunks (must be between 1 and pg_size-2)'); console.log('Pool '+pool_id+' has invalid parity_chunks (must be between 1 and pg_size-2)');
@@ -1153,6 +1156,10 @@ class Mon
{ {
prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set; prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
} }
// Also delete pool statistics
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
} });
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []); this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []);
} }
} }

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver from cinder.volume import driver
from cinder.volume import volume_utils from cinder.volume import volume_utils
VERSION = '0.6.17' VERSION = '0.7.1'
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)

View File

@@ -0,0 +1,659 @@
commit 7f01510ef207940b07fac4f5fc8b9f1580b443aa
Author: Vitaliy Filippov <vitalif@yourcmc.ru>
Date: Sun Jun 27 12:52:40 2021 +0300
Add Vitastor support
diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index aa50eac..082b4f8 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -1766,6 +1766,35 @@
</element>
</define>
+ <define name="diskSourceNetworkProtocolVitastor">
+ <element name="source">
+ <interleave>
+ <attribute name="protocol">
+ <value>vitastor</value>
+ </attribute>
+ <ref name="diskSourceCommon"/>
+ <optional>
+ <attribute name="name"/>
+ </optional>
+ <optional>
+ <attribute name="query"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="diskSourceNetworkHost"/>
+ </zeroOrMore>
+ <optional>
+ <element name="config">
+ <attribute name="file">
+ <ref name="absFilePath"/>
+ </attribute>
+ <empty/>
+ </element>
+ </optional>
+ <empty/>
+ </interleave>
+ </element>
+ </define>
+
<define name="diskSourceNetworkProtocolISCSI">
<element name="source">
<attribute name="protocol">
@@ -1891,6 +1920,7 @@
<ref name="diskSourceNetworkProtocolHTTP"/>
<ref name="diskSourceNetworkProtocolSimple"/>
<ref name="diskSourceNetworkProtocolVxHS"/>
+ <ref name="diskSourceNetworkProtocolVitastor"/>
</choice>
</define>
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
index 4bf2b5f..dbc011b 100644
--- a/include/libvirt/libvirt-storage.h
+++ b/include/libvirt/libvirt-storage.h
@@ -245,6 +245,7 @@ typedef enum {
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17,
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18,
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19,
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20,
} virConnectListAllStoragePoolsFlags;
int virConnectListAllStoragePools(virConnectPtr conn,
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 222bb8c..2c30c55 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -5114,8 +5114,7 @@ virDomainDiskDefPostParse(virDomainDiskD
const virDomainDef *def,
virDomainXMLOptionPtr xmlopt)
{
- /* internal snapshots and config files are currently supported
- * only with rbd: */
+ /* internal snapshots are currently supported only with rbd: */
if (virStorageSourceGetActualType(disk->src) != VIR_STORAGE_TYPE_NETWORK &&
disk->src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
if (disk->src->snapshot) {
@@ -5124,11 +5123,15 @@ virDomainDiskDefPostParse(virDomainDiskD
"only with 'rbd' disks"));
return -1;
}
-
+ }
+ /* config files are currently supported only with rbd and vitastor: */
+ if (virStorageSourceGetActualType(disk->src) != VIR_STORAGE_TYPE_NETWORK &&
+ disk->src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
+ disk->src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
if (disk->src->configFile) {
virReportError(VIR_ERR_XML_ERROR, "%s",
_("<config> element is currently supported "
- "only with 'rbd' disks"));
+ "only with 'rbd' and 'vitastor' disks"));
return -1;
}
}
@@ -9258,6 +9261,10 @@ virDomainDiskSourceNetworkParse(xmlNodeP
return -1;
}
+ if (src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
+ src->relPath = virXMLPropString(node, "query");
+ }
+
if ((haveTLS = virXMLPropString(node, "tls")) &&
(src->haveTLS = virTristateBoolTypeFromString(haveTLS)) <= 0) {
virReportError(VIR_ERR_XML_ERROR,
@@ -9303,6 +9310,10 @@ virDomainDiskSourceNetworkParse(xmlNodeP
/* config file currently only works with remote disks */
src->configFile = virXPathString("string(./config/@file)", ctxt);
+ if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
+ src->query = virXMLPropString(node, "query");
+
if (virDomainStorageNetworkParseHosts(node, &src->hosts, &src->nhosts) < 0)
return -1;
@@ -24141,6 +24152,10 @@ virDomainDiskSourceFormatNetwork(virBuff
virBufferEscapeString(attrBuf, " name='%s'", path ? path : src->path);
+ if (src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR && src->relPath != NULL) {
+ virBufferEscapeString(attrBuf, " query='%s'", src->relPath);
+ }
+
if (src->haveTLS != VIR_TRISTATE_BOOL_ABSENT &&
!(flags & VIR_DOMAIN_DEF_FORMAT_MIGRATABLE &&
src->tlsFromConfig))
@@ -31402,6 +31417,7 @@ virDomainDiskTranslateSourcePool(virDomainDiskDefPtr def)
case VIR_STORAGE_POOL_MPATH:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
index 55db7a9..7cbe937 100644
--- a/src/conf/storage_conf.c
+++ b/src/conf/storage_conf.c
@@ -59,7 +59,7 @@ VIR_ENUM_IMPL(virStoragePool,
"logical", "disk", "iscsi",
"iscsi-direct", "scsi", "mpath",
"rbd", "sheepdog", "gluster",
- "zfs", "vstorage",
+ "zfs", "vstorage", "vitastor",
);
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
@@ -248,6 +248,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
.formatToString = virStorageFileFormatTypeToString,
}
},
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
+ .poolOptions = {
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
+ VIR_STORAGE_POOL_SOURCE_NAME),
+ },
+ .volOptions = {
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
+ .formatFromString = virStorageVolumeFormatFromString,
+ .formatToString = virStorageFileFormatTypeToString,
+ }
+ },
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
.poolOptions = {
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
@@ -550,6 +562,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
_("element 'name' is mandatory for RBD pool"));
goto cleanup;
}
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
+ virReportError(VIR_ERR_XML_ERROR, "%s",
+ _("element 'name' is mandatory for Vitastor pool"));
+ return -1;
+ }
if (options->formatFromString) {
char *format = virXPathString("string(./format/@type)", ctxt);
@@ -1173,6 +1190,7 @@ virStoragePoolDefFormatBuf(virBufferPtr buf,
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
* files, so they don't have a target */
if (def->type != VIR_STORAGE_POOL_RBD &&
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
def->type != VIR_STORAGE_POOL_GLUSTER &&
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
index dc0aa2a..ed4983d 100644
--- a/src/conf/storage_conf.h
+++ b/src/conf/storage_conf.h
@@ -110,6 +110,7 @@ typedef enum {
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
VIR_STORAGE_POOL_ZFS, /* ZFS */
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
VIR_STORAGE_POOL_LAST,
} virStoragePoolType;
@@ -466,6 +467,7 @@ VIR_ENUM_DECL(virStoragePartedFs)
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
index 6ea6a97..3ba45b9 100644
--- a/src/conf/virstorageobj.c
+++ b/src/conf/virstorageobj.c
@@ -1493,6 +1493,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
return 1;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_RBD:
case VIR_STORAGE_POOL_LAST:
break;
@@ -1994,6 +1995,8 @@ virStoragePoolObjMatch(virStoragePoolObjPtr obj,
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
index 2ea3e94..d5d2273 100644
--- a/src/libvirt-storage.c
+++ b/src/libvirt-storage.c
@@ -92,6 +92,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
*
* Returns the number of storage pools found or -1 and sets @pools to
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index 73e988a..ab7bb81 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -888,6 +888,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSourcePtr src,
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
index 17b93d0..c5a0084 100644
--- a/src/libxl/xen_xl.c
+++ b/src/libxl/xen_xl.c
@@ -1601,6 +1601,7 @@ xenFormatXLDiskSrcNet(virStorageSourcePtr src)
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
index cbf0aa4..f0ca9e7 100644
--- a/src/qemu/qemu_block.c
+++ b/src/qemu/qemu_block.c
@@ -869,6 +869,42 @@ qemuBlockStorageSourceGetRBDProps(virStorageSourcePtr src)
}
+static virJSONValuePtr
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
+{
+ virJSONValuePtr ret = NULL;
+ virStorageNetHostDefPtr host;
+ size_t i;
+ virBuffer buf = VIR_BUFFER_INITIALIZER;
+ char *etcd = NULL;
+
+ for (i = 0; i < src->nhosts; i++) {
+ host = src->hosts + i;
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
+ goto cleanup;
+ }
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
+ }
+ if (src->nhosts > 0) {
+ etcd = virBufferContentAndReset(&buf);
+ }
+
+ if (virJSONValueObjectCreate(&ret,
+ "s:driver", "vitastor",
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->relPath,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ goto cleanup;
+
+cleanup:
+ VIR_FREE(etcd);
+ virBufferFreeAndReset(&buf);
+ return ret;
+}
+
+
static virJSONValuePtr
qemuBlockStorageSourceGetSheepdogProps(virStorageSourcePtr src)
{
@@ -1130,6 +1166,11 @@ qemuBlockStorageSourceGetBackendProps(virStorageSourcePtr src,
return NULL;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
+ return NULL;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
return NULL;
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 822d5f8..abec34e 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -1078,6 +1078,43 @@ qemuBuildNetworkDriveStr(virStorageSourcePtr src,
ret = virBufferContentAndReset(&buf);
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (strchr(src->path, ':')) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+ _("':' not allowed in Vitastor source volume name '%s'"),
+ src->path);
+ return NULL;
+ }
+
+ virBufferStrcat(&buf, "vitastor:image=", src->path, NULL);
+
+ if (src->nhosts > 0) {
+ virBufferAddLit(&buf, ":etcd-host=");
+ for (i = 0; i < src->nhosts; i++) {
+ if (i)
+ virBufferAddLit(&buf, ",");
+
+ /* assume host containing : is ipv6 */
+ if (strchr(src->hosts[i].name, ':'))
+ virBufferEscape(&buf, '\\', ":", "[%s]",
+ src->hosts[i].name);
+ else
+ virBufferAsprintf(&buf, "%s", src->hosts[i].name);
+
+ if (src->hosts[i].port)
+ virBufferAsprintf(&buf, "\\:%u", src->hosts[i].port);
+ }
+ }
+
+ if (src->configFile)
+ virBufferEscape(&buf, '\\', ":", ":config-path=%s", src->configFile);
+
+ if (src->relPath)
+ virBufferEscape(&buf, '\\', ":", ":etcd-prefix=%s", src->relPath);
+
+ ret = virBufferContentAndReset(&buf);
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_VXHS:
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("VxHS protocol does not support URI syntax"));
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index ec6b340..f399efa 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -6862,6 +6862,16 @@ qemuDomainValidateStorageSource(virStora
return -1;
}
+ if (src->query &&
+ (actualType != VIR_STORAGE_TYPE_NETWORK ||
+ (src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+ _("query is supported only with HTTP(S) protocols"));
+ return -1;
+ }
+
return 0;
}
@@ -13836,6 +13846,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSourcePtr src,
break;
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 1d96170..2d24396 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -14841,6 +14841,7 @@ qemuDomainSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDefPtr snapdi
case VIR_STORAGE_NET_PROTOCOL_TFTP:
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
virReportError(VIR_ERR_INTERNAL_ERROR,
_("external inactive snapshots are not supported on "
@@ -14925,6 +14926,7 @@ qemuDomainSnapshotPrepareDiskExternalActive(virDomainSnapshotDiskDefPtr snapdisk
case VIR_STORAGE_NET_PROTOCOL_TFTP:
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
virReportError(VIR_ERR_INTERNAL_ERROR,
_("external active snapshots are not supported on "
@@ -15054,6 +15056,7 @@ qemuDomainSnapshotPrepareDiskInternal(virDomainDiskDefPtr disk,
case VIR_STORAGE_NET_PROTOCOL_TFTP:
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
virReportError(VIR_ERR_INTERNAL_ERROR,
_("internal inactive snapshots are not supported on "
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
index 4a13e90..33301c7 100644
--- a/src/storage/storage_driver.c
+++ b/src/storage/storage_driver.c
@@ -1641,6 +1641,7 @@ storageVolLookupByPathCallback(virStoragePoolObjPtr obj,
case VIR_STORAGE_POOL_RBD:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_ZFS:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_LAST:
ignore_value(VIR_STRDUP(stable_path, data->path));
break;
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
index 29c4c86..a27ad94 100644
--- a/src/test/test_driver.c
+++ b/src/test/test_driver.c
@@ -7086,6 +7086,7 @@ testStorageVolumeTypeForPool(int pooltype)
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
return VIR_STORAGE_VOL_NETWORK;
case VIR_STORAGE_POOL_LOGICAL:
case VIR_STORAGE_POOL_DISK:
diff --git a/src/util/virstoragefile.c b/src/util/virstoragefile.c
index 0d3c2af..edb7f9e 100644
--- a/src/util/virstoragefile.c
+++ b/src/util/virstoragefile.c
@@ -90,6 +90,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
"tftp",
"ssh",
"vxhs",
+ "vitastor",
);
VIR_ENUM_IMPL(virStorageNetHostTransport,
@@ -2927,6 +2928,73 @@ virStorageSourceParseRBDColonString(cons
return 0;
}
+/**
+ * virStorageSourceParseVitastorColonString:
+ * @colonstr: "[vitastor:]key=value[:key=value...]" option string
+ * @src: storage source to fill in
+ *
+ * Splits @colonstr on unescaped ':' (a backslash protects the character
+ * that follows it) and stores the recognized options in @src:
+ *   - image=        -> src->path (mandatory)
+ *   - etcd-prefix=  -> src->query
+ *   - config-path=  -> src->configFile
+ *   - etcd-host=    -> one virStorageSourceRBDAddHost() call per list entry
+ * Unrecognized options are ignored.
+ *
+ * Entries of the etcd-host list may be separated by a plain ',' (the form
+ * written by qemuBuildNetworkDriveStr) or by a backslash-escaped ',', ';'
+ * or ' '.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+virStorageSourceParseVitastorColonString(const char *colonstr,
+                                         virStorageSourcePtr src)
+{
+    char *p, *e, *next;
+    g_autofree char *options = NULL;
+
+    /* optionally skip the "vitastor:" prefix if provided */
+    if (STRPREFIX(colonstr, "vitastor:"))
+        colonstr += strlen("vitastor:");
+
+    /* work on a private copy: delimiters are overwritten with NUL below */
+    options = g_strdup(colonstr);
+
+    p = options;
+    while (*p) {
+        /* find : delimiter or end of string */
+        for (e = p; *e && *e != ':'; ++e) {
+            if (*e == '\\') {
+                e++;
+                if (*e == '\0')
+                    break;
+            }
+        }
+        if (*e == '\0') {
+            next = e; /* last kv pair */
+        } else {
+            next = e + 1;
+            *e = '\0';
+        }
+
+        if (STRPREFIX(p, "image=")) {
+            /* drop the value of an earlier duplicate key instead of leaking it */
+            g_free(src->path);
+            src->path = g_strdup(p + strlen("image="));
+        } else if (STRPREFIX(p, "etcd-prefix=")) {
+            g_free(src->query);
+            src->query = g_strdup(p + strlen("etcd-prefix="));
+        } else if (STRPREFIX(p, "config-path=")) {
+            g_free(src->configFile);
+            src->configFile = g_strdup(p + strlen("config-path="));
+        } else if (STRPREFIX(p, "etcd-host=")) {
+            char *h, *sep;
+
+            h = p + strlen("etcd-host=");
+            while (h < e) {
+                for (sep = h; sep < e; ++sep) {
+                    /* plain comma between hosts */
+                    if (*sep == ',') {
+                        *sep = '\0';
+                        sep += 1;
+                        break;
+                    }
+                    /* backslash-escaped separator */
+                    if (*sep == '\\' && (sep[1] == ',' ||
+                                         sep[1] == ';' ||
+                                         sep[1] == ' ')) {
+                        *sep = '\0';
+                        sep += 2;
+                        break;
+                    }
+                }
+
+                if (virStorageSourceRBDAddHost(src, h) < 0)
+                    return -1;
+
+                h = sep;
+            }
+        }
+
+        p = next;
+    }
+
+    if (!src->path) {
+        /* do not fail silently: tell the user what is missing */
+        virReportError(VIR_ERR_INVALID_ARG, "%s",
+                       _("missing image name in Vitastor source string"));
+        return -1;
+    }
+
+    return 0;
+}
static int
virStorageSourceParseNBDColonString(const char *nbdstr,
@@ -3022,6 +3090,11 @@ virStorageSourceParseBackingColon(virSto
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
@@ -3507,6 +3580,54 @@ virStorageSourceParseBackingJSONRBD(virS
}
+/*
+ * Fill @src from a JSON backing-store description of a Vitastor volume.
+ *
+ * A "filename" key takes precedence and is handed to the colon-string
+ * parser. Otherwise "image" is mandatory, "config-path" and "etcd-prefix"
+ * are copied when present, and each element of the optional "server" array
+ * is parsed as an inet socket address into src->hosts.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
static int
+virStorageSourceParseBackingJSONVitastor(virStorageSourcePtr src,
+                                         virJSONValuePtr json,
+                                         const char *jsonstr G_GNUC_UNUSED,
+                                         int opaque G_GNUC_UNUSED)
+{
+    const char *legacy;
+    const char *image = virJSONValueObjectGetString(json, "image");
+    const char *conf = virJSONValueObjectGetString(json, "config-path");
+    const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
+    virJSONValuePtr servers = virJSONValueObjectGetArray(json, "server");
+    size_t count;
+    size_t idx;
+
+    src->type = VIR_STORAGE_TYPE_NETWORK;
+    src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
+
+    /* legacy syntax passed via 'filename' option */
+    legacy = virJSONValueObjectGetString(json, "filename");
+    if (legacy != NULL)
+        return virStorageSourceParseVitastorColonString(legacy, src);
+
+    if (image == NULL) {
+        virReportError(VIR_ERR_INVALID_ARG, "%s",
+                       _("missing image name in Vitastor backing volume "
+                         "JSON specification"));
+        return -1;
+    }
+
+    src->path = g_strdup(image);
+    src->configFile = g_strdup(conf);
+    src->query = g_strdup(etcd_prefix);
+
+    /* no server list given: nothing more to parse */
+    if (servers == NULL)
+        return 0;
+
+    count = virJSONValueArraySize(servers);
+
+    src->hosts = g_new0(virStorageNetHostDef, count);
+    src->nhosts = count;
+
+    for (idx = 0; idx < count; idx++) {
+        virJSONValuePtr entry = virJSONValueArrayGet(servers, idx);
+
+        if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + idx,
+                                                              entry) < 0)
+            return -1;
+    }
+
+    return 0;
+}
+
+static int
virStorageSourceParseBackingJSONRaw(virStorageSourcePtr src,
virJSONValuePtr json,
int opaque G_GNUC_UNUSED)
@@ -3578,6 +3699,7 @@ static const struct virStorageSourceJSON
{"sheepdog", virStorageSourceParseBackingJSONSheepdog, 0},
{"ssh", virStorageSourceParseBackingJSONSSH, 0},
{"rbd", virStorageSourceParseBackingJSONRBD, 0},
+ {"vitastor", virStorageSourceParseBackingJSONVitastor, 0},
{"raw", virStorageSourceParseBackingJSONRaw, 0},
{"vxhs", virStorageSourceParseBackingJSONVxHS, 0},
};
@@ -4364,6 +4486,7 @@ virStorageSourceNetworkDefaultPort(virSt
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
return 24007;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_RBD:
/* we don't provide a default for RBD */
return 0;
diff --git a/src/util/virstoragefile.h b/src/util/virstoragefile.h
index 1d6161a..8d83bf3 100644
--- a/src/util/virstoragefile.h
+++ b/src/util/virstoragefile.h
@@ -134,6 +134,7 @@ typedef enum {
VIR_STORAGE_NET_PROTOCOL_TFTP,
VIR_STORAGE_NET_PROTOCOL_SSH,
VIR_STORAGE_NET_PROTOCOL_VXHS,
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
VIR_STORAGE_NET_PROTOCOL_LAST
} virStorageNetProtocol;
@@ -265,6 +266,7 @@ struct _virStorageSource {
char *snapshot; /* for storage systems supporting internal snapshots */
char *configFile; /* some storage systems use config file as part of
the source definition */
+ char *query; /* query string for HTTP based protocols */
size_t nhosts;
virStorageNetHostDefPtr hosts;
virStorageSourcePoolDefPtr srcpool;
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
index 70ca39b..9caef51 100644
--- a/tools/virsh-pool.c
+++ b/tools/virsh-pool.c
@@ -1219,6 +1219,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd ATTRIBUTE_UNUSED)
case VIR_STORAGE_POOL_VSTORAGE:
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
+ break;
case VIR_STORAGE_POOL_LAST:
break;
}

View File

@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'` FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.6.17/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.17$(rpm --eval '%dist').tar.gz * tar --transform 's#^#vitastor-0.7.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.7.1$(rpm --eval '%dist').tar.gz *

View File

@@ -9,7 +9,8 @@ WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gcc make cmake gperftools-devel fio rh-nodejs12 jerasure-devel gf-complete-devel rdma-core-devel RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gcc make cmake gperftools-devel \
fio rh-nodejs12 jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel
RUN yumdownloader --disablerepo=centos-sclo-rh --source fio RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
RUN rpm --nomd5 -i fio*.src.rpm RUN rpm --nomd5 -i fio*.src.rpm
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
@@ -34,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \ RUN set -e; \
cd /root/vitastor/rpm; \ cd /root/vitastor/rpm; \
sh build-tarball.sh; \ sh build-tarball.sh; \
cp /root/vitastor-0.6.17.el7.tar.gz ~/rpmbuild/SOURCES; \ cp /root/vitastor-0.7.1.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \ cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \ cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \ rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor Name: vitastor
Version: 0.6.17 Version: 0.7.1
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-0.6.17.el7.tar.gz Source0: vitastor-0.7.1.el7.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel
@@ -13,6 +13,7 @@ BuildRequires: devtoolset-9-gcc-c++
BuildRequires: rh-nodejs12 BuildRequires: rh-nodejs12
BuildRequires: rh-nodejs12-npm BuildRequires: rh-nodejs12-npm
BuildRequires: jerasure-devel BuildRequires: jerasure-devel
BuildRequires: libisa-l-devel
BuildRequires: gf-complete-devel BuildRequires: gf-complete-devel
BuildRequires: libibverbs-devel BuildRequires: libibverbs-devel
BuildRequires: cmake BuildRequires: cmake
@@ -32,6 +33,7 @@ size with configurable redundancy (replication or erasure codes/XOR).
%package -n vitastor-osd %package -n vitastor-osd
Summary: Vitastor - OSD Summary: Vitastor - OSD
Requires: libJerasure2 Requires: libJerasure2
Requires: libisa-l
Requires: liburing >= 0.6 Requires: liburing >= 0.6
Requires: vitastor-client = %{version}-%{release} Requires: vitastor-client = %{version}-%{release}

View File

@@ -6,10 +6,12 @@ FROM centos:8
WORKDIR /root WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN sed -i 's/^mirrorlist=/#mirrorlist=/; s!#baseurl=http://mirror.centos.org/!baseurl=http://vault.centos.org/!' /etc/yum.repos.d/*.repo
RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core
RUN sed -i 's/^mirrorlist=/#mirrorlist=/; s!#baseurl=.*!baseurl=http://vault.centos.org/centos/8.4.2105/virt/$basearch/$avdir/!; s!^baseurl=.*Source/.*!baseurl=http://vault.centos.org/centos/8.4.2105/virt/Source/advanced-virtualization/!' /etc/yum.repos.d/CentOS-Advanced-Virtualization.repo
RUN yum -y install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm RUN yum -y install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm
RUN dnf -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel \ RUN dnf -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel \
fio nodejs rpm-build jerasure-devel gf-complete-devel libibverbs-devel libarchive cmake fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel libibverbs-devel libarchive cmake
RUN dnf download --source fio RUN dnf download --source fio
RUN rpm --nomd5 -i fio*.src.rpm RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec
@@ -33,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \ RUN set -e; \
cd /root/vitastor/rpm; \ cd /root/vitastor/rpm; \
sh build-tarball.sh; \ sh build-tarball.sh; \
cp /root/vitastor-0.6.17.el8.tar.gz ~/rpmbuild/SOURCES; \ cp /root/vitastor-0.7.1.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \ cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \ cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \ rpmbuild -ba vitastor.spec; \

View File

@@ -1,17 +1,18 @@
Name: vitastor Name: vitastor
Version: 0.6.17 Version: 0.7.1
Release: 1%{?dist} Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1 License: Vitastor Network Public License 1.1
URL: https://vitastor.io/ URL: https://vitastor.io/
Source0: vitastor-0.6.17.el8.tar.gz Source0: vitastor-0.7.1.el8.tar.gz
BuildRequires: liburing-devel >= 0.6 BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel BuildRequires: gperftools-devel
BuildRequires: gcc-toolset-9-gcc-c++ BuildRequires: gcc-toolset-9-gcc-c++
BuildRequires: nodejs >= 10 BuildRequires: nodejs >= 10
BuildRequires: jerasure-devel BuildRequires: jerasure-devel
BuildRequires: libisa-l-devel
BuildRequires: gf-complete-devel BuildRequires: gf-complete-devel
BuildRequires: libibverbs-devel BuildRequires: libibverbs-devel
BuildRequires: cmake BuildRequires: cmake
@@ -31,6 +32,7 @@ size with configurable redundancy (replication or erasure codes/XOR).
%package -n vitastor-osd %package -n vitastor-osd
Summary: Vitastor - OSD Summary: Vitastor - OSD
Requires: libJerasure2 Requires: libJerasure2
Requires: libisa-l
Requires: liburing >= 0.6 Requires: liburing >= 0.6
Requires: vitastor-client = %{version}-%{release} Requires: vitastor-client = %{version}-%{release}

View File

@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif() endif()
add_definitions(-DVERSION="0.6.17") add_definitions(-DVERSION="0.7.1")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src) add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN}) if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer) add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -50,6 +50,10 @@ pkg_check_modules(IBVERBS libibverbs)
if (IBVERBS_LIBRARIES) if (IBVERBS_LIBRARIES)
add_definitions(-DWITH_RDMA) add_definitions(-DWITH_RDMA)
endif (IBVERBS_LIBRARIES) endif (IBVERBS_LIBRARIES)
pkg_check_modules(ISAL libisal)
if (ISAL_LIBRARIES)
add_definitions(-DWITH_ISAL)
endif (ISAL_LIBRARIES)
include_directories( include_directories(
../ ../
@@ -104,6 +108,7 @@ target_link_libraries(vitastor-osd
vitastor_common vitastor_common
vitastor_blk vitastor_blk
Jerasure Jerasure
${ISAL_LIBRARIES}
${IBVERBS_LIBRARIES} ${IBVERBS_LIBRARIES}
) )
@@ -225,7 +230,7 @@ target_link_libraries(osd_test tcmalloc_minimal)
# osd_rmw_test # osd_rmw_test
add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp) add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
target_link_libraries(osd_rmw_test Jerasure tcmalloc_minimal) target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
# stub_uring_osd # stub_uring_osd
add_executable(stub_uring_osd add_executable(stub_uring_osd

View File

@@ -912,7 +912,11 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
? clean_it->second.location : UINT64_MAX; ? clean_it->second.location : UINT64_MAX;
if (exists && clean_loc == UINT64_MAX) if (exists && clean_loc == UINT64_MAX)
{ {
bs->inode_space_stats[oid.inode] -= bs->block_size; auto & sp = bs->inode_space_stats[oid.inode];
if (sp > bs->block_size)
sp -= bs->block_size;
else
bs->inode_space_stats.erase(oid.inode);
} }
bs->erase_dirty(dirty_it, dirty_end, clean_loc); bs->erase_dirty(dirty_it, dirty_end, clean_loc);
// Remove it from the flusher's queue, too // Remove it from the flusher's queue, too

View File

@@ -200,7 +200,11 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
} }
else if (IS_DELETE(dirty_it->second.state)) else if (IS_DELETE(dirty_it->second.state))
{ {
inode_space_stats[dirty_it->first.oid.inode] -= block_size; auto & sp = inode_space_stats[dirty_it->first.oid.inode];
if (sp > block_size)
sp -= block_size;
else
inode_space_stats.erase(dirty_it->first.oid.inode);
} }
} }
if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) || if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||

View File

@@ -39,6 +39,7 @@ public:
ring_loop_t *ringloop = NULL; ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL; epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL; cluster_client_t *cli = NULL;
bool no_recovery = false, no_rebalance = false, readonly = false;
int waiting = 0; int waiting = 0;
cli_result_t etcd_err; cli_result_t etcd_err;

View File

@@ -127,7 +127,7 @@ resume_1:
pool_stats[pool_cfg.id] = json11::Json::object { pool_stats[pool_cfg.id] = json11::Json::object {
{ "name", pool_cfg.name }, { "name", pool_cfg.name },
{ "pg_count", pool_cfg.pg_count }, { "pg_count", pool_cfg.pg_count },
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "jerasure" }, { "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED { "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize) ? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
: "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) }, : "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) },

View File

@@ -64,8 +64,9 @@ struct rm_inode_t
} }
rm->obj_pos = rm->objects.begin(); rm->obj_pos = rm->objects.begin();
lists.push_back(rm); lists.push_back(rm);
if (parent->list_first) if (parent->list_first && !(status & INODE_LIST_DONE))
{ {
// The listing object is dead when DONE => don't call next()
parent->cli->list_inode_next(lister, 1); parent->cli->list_inode_next(lister, 1);
} }
if (status & INODE_LIST_DONE) if (status & INODE_LIST_DONE)

View File

@@ -5,6 +5,7 @@
#include "cluster_client.h" #include "cluster_client.h"
#include "base64.h" #include "base64.h"
#include "pg_states.h" #include "pg_states.h"
#include "http_client.h"
// Print cluster status: // Print cluster status:
// etcd, mon, osd states // etcd, mon, osd states
@@ -207,6 +208,9 @@ resume_2:
obj_n = agg_stats["object_counts"]["incomplete"].uint64_value(); obj_n = agg_stats["object_counts"]["incomplete"].uint64_value();
if (obj_n > 0) if (obj_n > 0)
more_states += ", "+format_size(obj_n*object_size)+" incomplete"; more_states += ", "+format_size(obj_n*object_size)+" incomplete";
bool readonly = json_is_true(parent->cli->merged_config["readonly"]);
bool no_recovery = json_is_true(parent->cli->merged_config["no_recovery"]);
bool no_rebalance = json_is_true(parent->cli->merged_config["no_rebalance"]);
std::string recovery_io; std::string recovery_io;
{ {
uint64_t deg_bps = agg_stats["recovery_stats"]["degraded"]["bps"].uint64_value(); uint64_t deg_bps = agg_stats["recovery_stats"]["degraded"]["bps"].uint64_value();
@@ -214,9 +218,19 @@ resume_2:
uint64_t misp_bps = agg_stats["recovery_stats"]["misplaced"]["bps"].uint64_value(); uint64_t misp_bps = agg_stats["recovery_stats"]["misplaced"]["bps"].uint64_value();
uint64_t misp_iops = agg_stats["recovery_stats"]["misplaced"]["iops"].uint64_value(); uint64_t misp_iops = agg_stats["recovery_stats"]["misplaced"]["iops"].uint64_value();
if (deg_iops > 0 || deg_bps > 0) if (deg_iops > 0 || deg_bps > 0)
recovery_io += " recovery: "+format_size(deg_bps)+"/s, "+format_size(deg_iops, true)+" op/s\n"; {
recovery_io += " recovery: "+std::string(no_recovery ? "disabled, " : "")+
format_size(deg_bps)+"/s, "+format_size(deg_iops, true)+" op/s\n";
}
else if (no_recovery)
recovery_io += " recovery: disabled\n";
if (misp_iops > 0 || misp_bps > 0) if (misp_iops > 0 || misp_bps > 0)
recovery_io += " rebalance: "+format_size(misp_bps)+"/s, "+format_size(misp_iops, true)+" op/s\n"; {
recovery_io += " rebalance: "+std::string(no_rebalance ? "disabled, " : "")+
format_size(misp_bps)+"/s, "+format_size(misp_iops, true)+" op/s\n";
}
else if (no_rebalance)
recovery_io += " rebalance: disabled\n";
} }
if (parent->json_output) if (parent->json_output)
{ {
@@ -233,6 +247,9 @@ resume_2:
{ "free_raw", free_raw }, { "free_raw", free_raw },
{ "down_raw", down_raw }, { "down_raw", down_raw },
{ "free_down_raw", free_down_raw }, { "free_down_raw", free_down_raw },
{ "readonly", readonly },
{ "no_recovery", no_recovery },
{ "no_rebalance", no_rebalance },
{ "clean_data", agg_stats["object_counts"]["clean"].uint64_value() * object_size }, { "clean_data", agg_stats["object_counts"]["clean"].uint64_value() * object_size },
{ "misplaced_data", agg_stats["object_counts"]["misplaced"].uint64_value() * object_size }, { "misplaced_data", agg_stats["object_counts"]["misplaced"].uint64_value() * object_size },
{ "degraded_data", agg_stats["object_counts"]["degraded"].uint64_value() * object_size }, { "degraded_data", agg_stats["object_counts"]["degraded"].uint64_value() * object_size },
@@ -259,7 +276,7 @@ resume_2:
" pools: %d / %d active\n" " pools: %d / %d active\n"
" pgs: %s\n" " pgs: %s\n"
" \n" " \n"
" io:\n" " io%s:\n"
" client:%s %s/s rd, %s op/s rd, %s/s wr, %s op/s wr\n" " client:%s %s/s rd, %s op/s rd, %s/s wr, %s op/s wr\n"
"%s", "%s",
etcd_alive, etcd_states.size(), format_size(etcd_db_size).c_str(), etcd_alive, etcd_states.size(), format_size(etcd_db_size).c_str(),
@@ -272,6 +289,7 @@ resume_2:
format_size(agg_stats["object_counts"]["clean"].uint64_value() * object_size).c_str(), more_states.c_str(), format_size(agg_stats["object_counts"]["clean"].uint64_value() * object_size).c_str(), more_states.c_str(),
pools_active, pool_count, pools_active, pool_count,
pgs_by_state_str.c_str(), pgs_by_state_str.c_str(),
readonly ? " (read-only mode)" : "",
recovery_io.size() > 0 ? " " : "", recovery_io.size() > 0 ? " " : "",
format_size(agg_stats["op_stats"]["primary_read"]["bps"].uint64_value()).c_str(), format_size(agg_stats["op_stats"]["primary_read"]["bps"].uint64_value()).c_str(),
format_size(agg_stats["op_stats"]["primary_read"]["iops"].uint64_value(), true).c_str(), format_size(agg_stats["op_stats"]["primary_read"]["iops"].uint64_value(), true).c_str(),

View File

@@ -279,6 +279,11 @@ static uint32_t is_power_of_two(uint64_t value)
void cluster_client_t::on_load_config_hook(json11::Json::object & config) void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{ {
this->merged_config = config;
for (auto & kv: this->config.object_items())
{
this->merged_config[kv.first] = kv.second;
}
bs_block_size = config["block_size"].uint64_value(); bs_block_size = config["block_size"].uint64_value();
bs_bitmap_granularity = config["bitmap_granularity"].uint64_value(); bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
if (!bs_block_size) if (!bs_block_size)

View File

@@ -111,6 +111,7 @@ public:
etcd_state_client_t st_cli; etcd_state_client_t st_cli;
osd_messenger_t msgr; osd_messenger_t msgr;
json11::Json config; json11::Json config;
json11::Json::object merged_config;
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config); cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
~cluster_client_t(); ~cluster_client_t();

View File

@@ -153,16 +153,6 @@ void cluster_client_t::continue_listing(inode_list_t *lst)
{ {
if (lst->done_pgs >= lst->pgs.size()) if (lst->done_pgs >= lst->pgs.size())
{ {
// All done
for (int i = 0; i < lists.size(); i++)
{
if (lists[i] == lst)
{
lists.erase(lists.begin()+i, lists.begin()+i+1);
break;
}
}
delete lst;
return; return;
} }
if (lst->want <= 0) if (lst->want <= 0)
@@ -178,7 +168,7 @@ void cluster_client_t::continue_listing(inode_list_t *lst)
send_list(&lst->pgs[i]->list_osds[j]); send_list(&lst->pgs[i]->list_osds[j]);
if (lst->want <= 0) if (lst->want <= 0)
{ {
break; return;
} }
} }
} }
@@ -268,6 +258,24 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
lst->callback(lst, std::move(pg->objects), pg->pg_num, pg->cur_primary, status); lst->callback(lst, std::move(pg->objects), pg->pg_num, pg->cur_primary, status);
lst->pgs[pg->pos] = NULL; lst->pgs[pg->pos] = NULL;
delete pg; delete pg;
if (lst->done_pgs >= lst->pgs.size())
{
// All done
for (int i = 0; i < lists.size(); i++)
{
if (lists[i] == lst)
{
lists.erase(lists.begin()+i, lists.begin()+i+1);
break;
}
}
delete lst;
return;
}
}
else
{
lst->want++;
} }
continue_listing(lst); continue_listing(lst);
}; };

View File

@@ -673,18 +673,18 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
pc.scheme = POOL_SCHEME_REPLICATED; pc.scheme = POOL_SCHEME_REPLICATED;
else if (pool_item.second["scheme"] == "xor") else if (pool_item.second["scheme"] == "xor")
pc.scheme = POOL_SCHEME_XOR; pc.scheme = POOL_SCHEME_XOR;
else if (pool_item.second["scheme"] == "jerasure") else if (pool_item.second["scheme"] == "ec" || pool_item.second["scheme"] == "jerasure")
pc.scheme = POOL_SCHEME_JERASURE; pc.scheme = POOL_SCHEME_EC;
else else
{ {
fprintf(stderr, "Pool %u has invalid coding scheme (one of \"xor\", \"replicated\" or \"jerasure\" required), skipping pool\n", pool_id); fprintf(stderr, "Pool %u has invalid coding scheme (one of \"xor\", \"replicated\", \"ec\" or \"jerasure\" required), skipping pool\n", pool_id);
continue; continue;
} }
// PG Size // PG Size
pc.pg_size = pool_item.second["pg_size"].uint64_value(); pc.pg_size = pool_item.second["pg_size"].uint64_value();
if (pc.pg_size < 1 || if (pc.pg_size < 1 ||
pool_item.second["pg_size"].uint64_value() < 3 && pool_item.second["pg_size"].uint64_value() < 3 &&
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) || (pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_EC) ||
pool_item.second["pg_size"].uint64_value() > 256) pool_item.second["pg_size"].uint64_value() > 256)
{ {
fprintf(stderr, "Pool %u has invalid pg_size, skipping pool\n", pool_id); fprintf(stderr, "Pool %u has invalid pg_size, skipping pool\n", pool_id);
@@ -701,7 +701,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
} }
pc.parity_chunks = 1; pc.parity_chunks = 1;
} }
if (pc.scheme == POOL_SCHEME_JERASURE && if (pc.scheme == POOL_SCHEME_EC &&
(pc.parity_chunks < 1 || pc.parity_chunks > pc.pg_size-2)) (pc.parity_chunks < 1 || pc.parity_chunks > pc.pg_size-2))
{ {
fprintf(stderr, "Pool %u has invalid parity_chunks (must be between 1 and pg_size-2), skipping pool\n", pool_id); fprintf(stderr, "Pool %u has invalid parity_chunks (must be between 1 and pg_size-2), skipping pool\n", pool_id);
@@ -710,7 +710,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
// PG MinSize // PG MinSize
pc.pg_minsize = pool_item.second["pg_minsize"].uint64_value(); pc.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
if (pc.pg_minsize < 1 || pc.pg_minsize > pc.pg_size || if (pc.pg_minsize < 1 || pc.pg_minsize > pc.pg_size ||
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) && (pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_EC) &&
pc.pg_minsize < (pc.pg_size-pc.parity_chunks)) pc.pg_minsize < (pc.pg_size-pc.parity_chunks))
{ {
fprintf(stderr, "Pool %u has invalid pg_minsize, skipping pool\n", pool_id); fprintf(stderr, "Pool %u has invalid pg_minsize, skipping pool\n", pool_id);

View File

@@ -36,6 +36,7 @@ struct sec_data
/* The list of completed io_u structs. */ /* The list of completed io_u structs. */
std::vector<io_u*> completed; std::vector<io_u*> completed;
uint64_t inflight = 0; uint64_t inflight = 0;
int mirror_fd = -1;
bool trace = false; bool trace = false;
}; };
@@ -46,6 +47,7 @@ struct sec_options
char *etcd_host = NULL; char *etcd_host = NULL;
char *etcd_prefix = NULL; char *etcd_prefix = NULL;
char *image = NULL; char *image = NULL;
char *mirror_file = NULL;
uint64_t pool = 0; uint64_t pool = 0;
uint64_t inode = 0; uint64_t inode = 0;
int cluster_log = 0; int cluster_log = 0;
@@ -132,6 +134,15 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE, .category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME, .group = FIO_OPT_G_FILENAME,
}, },
{
.name = "mirror_file",
.lname = "File name to mirror writes to",
.type = FIO_OPT_STR_STORE,
.off1 = offsetof(struct sec_options, mirror_file),
.help = "File name to mirror writes to (for debug purpose)",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{ {
.name = "use_rdma", .name = "use_rdma",
.lname = "Use RDMA", .lname = "Use RDMA",
@@ -212,6 +223,16 @@ static int sec_setup(struct thread_data *td)
td->o.open_files++; td->o.open_files++;
} }
if (o->mirror_file)
{
bsd->mirror_fd = open(o->mirror_file, O_CREAT|O_RDWR, 0666);
if (bsd->mirror_fd < 0)
{
td_verror(td, errno, "open mirror file");
return 1;
}
}
if (!o->image) if (!o->image)
{ {
if (!(o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))) if (!(o->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)))
@@ -265,6 +286,10 @@ static void sec_cleanup(struct thread_data *td)
sec_data *bsd = (sec_data*)td->io_ops_data; sec_data *bsd = (sec_data*)td->io_ops_data;
if (bsd) if (bsd)
{ {
if (bsd->mirror_fd >= 0)
{
close(bsd->mirror_fd);
}
if (bsd->watch) if (bsd->watch)
{ {
vitastor_c_close_watch(bsd->cli, bsd->watch); vitastor_c_close_watch(bsd->cli, bsd->watch);
@@ -325,6 +350,24 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
bsd->last_sync = false; bsd->last_sync = false;
break; break;
case DDIR_WRITE: case DDIR_WRITE:
if (opt->mirror_file)
{
size_t done = 0;
while (done < io->xfer_buflen)
{
ssize_t r = pwrite(bsd->mirror_fd, io->xfer_buf+done, io->xfer_buflen-done, io->offset+done);
if (r < 0 && errno != EAGAIN)
{
fprintf(stderr, "Error writing mirror file: %s\n", strerror(errno));
io->error = errno;
return FIO_Q_COMPLETED;
}
if (r > 0)
{
done += r;
}
}
}
if (opt->image && vitastor_c_inode_get_readonly(bsd->watch)) if (opt->image && vitastor_c_inode_get_readonly(bsd->watch))
{ {
io->error = EROFS; io->error = EROFS;

View File

@@ -758,3 +758,21 @@ static std::string trim(const std::string & in)
int end = in.find_last_not_of(" \n\r\t"); int end = in.find_last_not_of(" \n\r\t");
return in.substr(begin, end+1-begin); return in.substr(begin, end+1-begin);
} }
// Interpret a JSON config value as an "enabled" flag.
//
// - strings: true only for "true", "yes" or "1"
// - numbers: true when non-zero (mirrors the numeric branch of json_is_false(),
//   so that a bare 1 in a JSON config behaves like the string "1")
// - anything else: whatever val.bool_value() reports
//
// Note that this is not the negation of json_is_false(): a missing/null value
// is expected to be neither "true" nor "false", which lets callers choose the
// default by picking json_is_true(x) or !json_is_false(x).
bool json_is_true(const json11::Json & val)
{
    if (val.is_string())
    {
        const std::string & text = val.string_value();
        return text == "true" || text == "yes" || text == "1";
    }
    if (val.is_number())
    {
        return val.number_value() != 0;
    }
    return val.bool_value();
}
// Tell whether a JSON config value explicitly means "disabled".
//
// - strings: "false", "no" or "0"
// - numbers: zero
// - booleans: false
// Every other kind of value yields false, i.e. it is NOT treated as an
// explicit "false".
bool json_is_false(const json11::Json & val)
{
    if (val.is_string())
    {
        const std::string & text = val.string_value();
        return text == "false" || text == "no" || text == "0";
    }
    if (val.is_number())
    {
        return val.number_value() == 0;
    }
    return val.is_bool() ? !val.bool_value() : false;
}

View File

@@ -52,3 +52,6 @@ void http_close(http_co_t *co);
// Utils // Utils
uint64_t stoull_full(const std::string & str, int base = 10); uint64_t stoull_full(const std::string & str, int base = 10);
std::string strtolower(const std::string & in); std::string strtolower(const std::string & in);
// FIXME: move to json11
bool json_is_true(const json11::Json & val);
bool json_is_false(const json11::Json & val);

View File

@@ -92,10 +92,7 @@ void nfs_proxy_t::run(json11::Json cfg)
if (bind_address == "") if (bind_address == "")
bind_address = "0.0.0.0"; bind_address = "0.0.0.0";
default_pool = cfg["pool"].as_string(); default_pool = cfg["pool"].as_string();
portmap_enabled = cfg.object_items().find("portmap") == cfg.object_items().end() || portmap_enabled = !json_is_false(cfg["portmap"]);
cfg["portmap"].uint64_value() ||
cfg["portmap"].string_value() == "yes" ||
cfg["portmap"].string_value() == "true";
nfs_port = cfg["port"].uint64_value() & 0xffff; nfs_port = cfg["port"].uint64_value() & 0xffff;
if (!nfs_port) if (!nfs_port)
nfs_port = 2049; nfs_port = 2049;

View File

@@ -54,7 +54,7 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
autosync_writes = max_autosync; autosync_writes = max_autosync;
} }
if (this->config["osd_memlock"] == "true" || this->config["osd_memlock"] == "1" || this->config["osd_memlock"] == "yes") if (json_is_true(this->config["osd_memlock"]))
{ {
// Lock all OSD memory if requested // Lock all OSD memory if requested
if (mlockall(MCL_CURRENT|MCL_FUTURE if (mlockall(MCL_CURRENT|MCL_FUTURE
@@ -127,11 +127,11 @@ void osd_t::parse_config(const json11::Json & config)
etcd_report_interval = config["etcd_report_interval"].uint64_value(); etcd_report_interval = config["etcd_report_interval"].uint64_value();
if (etcd_report_interval <= 0) if (etcd_report_interval <= 0)
etcd_report_interval = 5; etcd_report_interval = 5;
readonly = config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes"; readonly = json_is_true(config["readonly"]);
run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no"; run_primary = !json_is_false(config["run_primary"]);
no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes"; no_rebalance = json_is_true(config["no_rebalance"]);
no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes"; no_recovery = json_is_true(config["no_recovery"]);
allow_test_ops = config["allow_test_ops"] == "true" || config["allow_test_ops"] == "1" || config["allow_test_ops"] == "yes"; allow_test_ops = json_is_true(config["allow_test_ops"]);
if (config["immediate_commit"] == "all") if (config["immediate_commit"] == "all")
immediate_commit = IMMEDIATE_ALL; immediate_commit = IMMEDIATE_ALL;
else if (config["immediate_commit"] == "small") else if (config["immediate_commit"] == "small")
@@ -168,6 +168,9 @@ void osd_t::parse_config(const json11::Json & config)
slow_log_interval = config["slow_log_interval"].uint64_value(); slow_log_interval = config["slow_log_interval"].uint64_value();
if (!slow_log_interval) if (!slow_log_interval)
slow_log_interval = 10; slow_log_interval = 10;
inode_vanish_time = config["inode_vanish_time"].uint64_value();
if (!inode_vanish_time)
inode_vanish_time = 60;
} }
void osd_t::bind_socket() void osd_t::bind_socket()

View File

@@ -113,6 +113,7 @@ class osd_t
int autosync_writes = DEFAULT_AUTOSYNC_WRITES; int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE; int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH; int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
int inode_vanish_time = 60;
int log_level = 0; int log_level = 0;
// cluster state // cluster state
@@ -165,6 +166,7 @@ class osd_t
// op statistics // op statistics
osd_op_stats_t prev_stats; osd_op_stats_t prev_stats;
std::map<uint64_t, inode_stats_t> inode_stats; std::map<uint64_t, inode_stats_t> inode_stats;
std::map<uint64_t, timespec> vanishing_inodes;
const char* recovery_stat_names[2] = { "degraded", "misplaced" }; const char* recovery_stat_names[2] = { "degraded", "misplaced" };
uint64_t recovery_stat_count[2][2] = {}; uint64_t recovery_stat_count[2][2] = {};
uint64_t recovery_stat_bytes[2][2] = {}; uint64_t recovery_stat_bytes[2][2] = {};

View File

@@ -186,7 +186,8 @@ void osd_t::report_statistics()
json11::Json::object inode_space; json11::Json::object inode_space;
json11::Json::object last_stat; json11::Json::object last_stat;
pool_id_t last_pool = 0; pool_id_t last_pool = 0;
for (auto kv: bs->get_inode_space_stats()) auto & bs_inode_space = bs->get_inode_space_stats();
for (auto kv: bs_inode_space)
{ {
pool_id_t pool_id = INODE_POOL(kv.first); pool_id_t pool_id = INODE_POOL(kv.first);
uint64_t only_inode_num = INODE_NO_POOL(kv.first); uint64_t only_inode_num = INODE_NO_POOL(kv.first);
@@ -204,8 +205,26 @@ void osd_t::report_statistics()
last_stat = json11::Json::object(); last_stat = json11::Json::object();
last_pool = 0; last_pool = 0;
json11::Json::object inode_ops; json11::Json::object inode_ops;
for (auto kv: inode_stats) timespec tv_now;
for (auto st_it = inode_stats.begin(); st_it != inode_stats.end(); )
{ {
auto & kv = *st_it;
if (!bs_inode_space[kv.first])
{
// Is it an empty inode?
if (!tv_now.tv_sec)
clock_gettime(CLOCK_REALTIME, &tv_now);
auto & tv_van = vanishing_inodes[kv.first];
if (!tv_van.tv_sec)
tv_van = tv_now;
else if (tv_van.tv_sec < tv_now.tv_sec-inode_vanish_time)
{
// Inode vanished <inode_vanish_time> seconds ago, remove it from stats
vanishing_inodes.erase(kv.first);
inode_stats.erase(st_it++);
continue;
}
}
pool_id_t pool_id = INODE_POOL(kv.first); pool_id_t pool_id = INODE_POOL(kv.first);
uint64_t only_inode_num = (kv.first & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)); uint64_t only_inode_num = (kv.first & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1));
if (!last_pool || pool_id != last_pool) if (!last_pool || pool_id != last_pool)
@@ -232,6 +251,7 @@ void osd_t::report_statistics()
{ "bytes", kv.second.op_bytes[INODE_STATS_DELETE] }, { "bytes", kv.second.op_bytes[INODE_STATS_DELETE] },
} }, } },
}; };
st_it++;
} }
if (last_pool) if (last_pool)
inode_ops[std::to_string(last_pool)] = last_stat; inode_ops[std::to_string(last_pool)] = last_stat;
@@ -370,7 +390,11 @@ void osd_t::acquire_lease()
etcd_lease_id = data["ID"].string_value(); etcd_lease_id = data["ID"].string_value();
create_osd_state(); create_osd_state();
}); });
printf("[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num, config["etcd_address"].string_value().c_str(), etcd_report_interval); printf(
"[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num,
(config["etcd_address"].is_string() ? config["etcd_address"].string_value() : config["etcd_address"].dump()).c_str(),
etcd_report_interval
);
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id) tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
{ {
renew_lease(); renew_lease();
@@ -676,9 +700,9 @@ void osd_t::apply_pg_config()
.all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()), .all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
.target_set = pg_cfg.target_set, .target_set = pg_cfg.target_set,
}; };
if (pg.scheme == POOL_SCHEME_JERASURE) if (pg.scheme == POOL_SCHEME_EC)
{ {
use_jerasure(pg.pg_size, pg.pg_data_size, true); use_ec(pg.pg_size, pg.pg_data_size, true);
} }
this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num }); this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num });
pg.print_state(); pg.print_state();
@@ -890,9 +914,9 @@ void osd_t::report_pg_states()
{ {
// Forget offline PGs after reporting their state // Forget offline PGs after reporting their state
// (if the state wasn't changed again) // (if the state wasn't changed again)
if (pg_it->second.scheme == POOL_SCHEME_JERASURE) if (pg_it->second.scheme == POOL_SCHEME_EC)
{ {
use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false); use_ec(pg_it->second.pg_size, pg_it->second.pg_data_size, false);
} }
this->pgs.erase(pg_it); this->pgs.erase(pg_it);
} }

View File

@@ -5,7 +5,7 @@
#define POOL_SCHEME_REPLICATED 1 #define POOL_SCHEME_REPLICATED 1
#define POOL_SCHEME_XOR 2 #define POOL_SCHEME_XOR 2
#define POOL_SCHEME_JERASURE 3 #define POOL_SCHEME_EC 3
#define POOL_ID_MAX 0x10000 #define POOL_ID_MAX 0x10000
#define POOL_ID_BITS 16 #define POOL_ID_BITS 16
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS)) #define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))

View File

@@ -317,7 +317,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
// Self // Self
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = 0; op->op_type = 0;
op->peer_fd = 0; op->peer_fd = -1;
clock_gettime(CLOCK_REALTIME, &op->tv_begin); clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t(); op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_SYNC; op->bs_op->opcode = BS_OP_SYNC;
@@ -383,7 +383,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
// Self // Self
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = 0; op->op_type = 0;
op->peer_fd = 0; op->peer_fd = -1;
clock_gettime(CLOCK_REALTIME, &op->tv_begin); clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t(); op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_LIST; op->bs_op->opcode = BS_OP_LIST;

View File

@@ -241,9 +241,9 @@ resume_2:
{ {
reconstruct_stripes_xor(stripes, op_data->pg_size, clean_entry_bitmap_size); reconstruct_stripes_xor(stripes, op_data->pg_size, clean_entry_bitmap_size);
} }
else if (op_data->scheme == POOL_SCHEME_JERASURE) else if (op_data->scheme == POOL_SCHEME_EC)
{ {
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size); reconstruct_stripes_ec(stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
} }
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len); cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
for (int role = 0; role < op_data->pg_size; role++) for (int role = 0; role < op_data->pg_size; role++)

View File

@@ -110,9 +110,9 @@ resume_1:
{ {
reconstruct_stripes_xor(local_stripes, pg.pg_size, clean_entry_bitmap_size); reconstruct_stripes_xor(local_stripes, pg.pg_size, clean_entry_bitmap_size);
} }
else if (pg.scheme == POOL_SCHEME_JERASURE) else if (pg.scheme == POOL_SCHEME_EC)
{ {
reconstruct_stripes_jerasure(local_stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size); reconstruct_stripes_ec(local_stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size);
} }
break; break;
} }
@@ -295,6 +295,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
else else
{ {
// Fail it immediately // Fail it immediately
subop->peer_fd = -1;
subop->reply.hdr.retval = -EPIPE; subop->reply.hdr.retval = -EPIPE;
subop->callback(subop); subop->callback(subop);
} }
@@ -505,9 +506,9 @@ void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
{ {
reconstruct_stripes_xor(stripes, pg.pg_size, clean_entry_bitmap_size); reconstruct_stripes_xor(stripes, pg.pg_size, clean_entry_bitmap_size);
} }
else if (op_data->scheme == POOL_SCHEME_JERASURE) else if (op_data->scheme == POOL_SCHEME_EC)
{ {
reconstruct_stripes_jerasure(stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size); reconstruct_stripes_ec(stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size);
} }
} }
} }

View File

@@ -233,6 +233,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
else else
{ {
// Fail it immediately // Fail it immediately
subop->peer_fd = -1;
subop->reply.hdr.retval = -EPIPE; subop->reply.hdr.retval = -EPIPE;
subop->callback(subop); subop->callback(subop);
} }
@@ -321,7 +322,21 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
osd_primary_op_data_t *op_data = cur_op->op_data; osd_primary_op_data_t *op_data = cur_op->op_data;
if (retval != expected) if (retval != expected)
{ {
printf("%s subop failed: retval = %d (expected %d)\n", osd_op_names[opcode], retval, expected); if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
{
printf(
"%s subop to %lx:%lx v%lu failed on peer %d: retval = %d (expected %d)\n",
osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
subop->peer_fd, retval, expected
);
}
else
{
printf(
"%s subop failed on peer %d: retval = %d (expected %d)\n",
osd_op_names[opcode], subop->peer_fd, retval, expected
);
}
if (retval == -EPIPE) if (retval == -EPIPE)
{ {
op_data->epipe++; op_data->epipe++;
@@ -495,6 +510,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
else else
{ {
// Fail it immediately // Fail it immediately
subops[i].peer_fd = -1;
subops[i].reply.hdr.retval = -EPIPE; subops[i].reply.hdr.retval = -EPIPE;
subops[i].callback(&subops[i]); subops[i].callback(&subops[i]);
} }
@@ -609,6 +625,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
else else
{ {
// Fail it immediately // Fail it immediately
subops[i].peer_fd = -1;
subops[i].reply.hdr.retval = -EPIPE; subops[i].reply.hdr.retval = -EPIPE;
subops[i].callback(&subops[i]); subops[i].callback(&subops[i]);
} }

View File

@@ -132,9 +132,9 @@ resume_3:
{ {
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size); calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
} }
else if (pg.scheme == POOL_SCHEME_JERASURE) else if (pg.scheme == POOL_SCHEME_EC)
{ {
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size); calc_rmw_parity_ec(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
} }
} }
// Send writes // Send writes

View File

@@ -4,8 +4,11 @@
#include <stdexcept> #include <stdexcept>
#include <string.h> #include <string.h>
#include <assert.h> #include <assert.h>
#include <jerasure/reed_sol.h> #include <reed_sol.h>
#include <jerasure.h> #include <jerasure.h>
#ifdef WITH_ISAL
#include <isa-l/erasure_code.h>
#endif
#include <map> #include <map>
#include "allocator.h" #include "allocator.h"
#include "xor.h" #include "xor.h"
@@ -147,13 +150,14 @@ inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
struct reed_sol_matrix_t struct reed_sol_matrix_t
{ {
int refs = 0; int refs = 0;
int *data; int *je_data;
std::map<reed_sol_erased_t, int*> decodings; uint8_t *isal_data;
std::map<reed_sol_erased_t, void*> decodings;
}; };
std::map<uint64_t, reed_sol_matrix_t> matrices; static std::map<uint64_t, reed_sol_matrix_t> matrices;
void use_jerasure(int pg_size, int pg_minsize, bool use) void use_ec(int pg_size, int pg_minsize, bool use)
{ {
uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32; uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32;
auto rs_it = matrices.find(key); auto rs_it = matrices.find(key);
@@ -164,19 +168,33 @@ void use_jerasure(int pg_size, int pg_minsize, bool use)
return; return;
} }
int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W); int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W);
uint8_t *isal_table = NULL;
#ifdef WITH_ISAL
uint8_t *isal_matrix = (uint8_t*)malloc_or_die(pg_minsize*(pg_size-pg_minsize));
for (int i = 0; i < pg_minsize*(pg_size-pg_minsize); i++)
{
isal_matrix[i] = matrix[i];
}
isal_table = (uint8_t*)malloc_or_die(pg_minsize*(pg_size-pg_minsize)*32);
ec_init_tables(pg_minsize, pg_size-pg_minsize, isal_matrix, isal_table);
free(isal_matrix);
#endif
matrices[key] = (reed_sol_matrix_t){ matrices[key] = (reed_sol_matrix_t){
.refs = 0, .refs = 0,
.data = matrix, .je_data = matrix,
.isal_data = isal_table,
}; };
rs_it = matrices.find(key); rs_it = matrices.find(key);
} }
rs_it->second.refs += (!use ? -1 : 1); rs_it->second.refs += (!use ? -1 : 1);
if (rs_it->second.refs <= 0) if (rs_it->second.refs <= 0)
{ {
free(rs_it->second.data); free(rs_it->second.je_data);
if (rs_it->second.isal_data)
free(rs_it->second.isal_data);
for (auto dec_it = rs_it->second.decodings.begin(); dec_it != rs_it->second.decodings.end();) for (auto dec_it = rs_it->second.decodings.begin(); dec_it != rs_it->second.decodings.end();)
{ {
int *data = dec_it->second; void *data = dec_it->second;
rs_it->second.decodings.erase(dec_it++); rs_it->second.decodings.erase(dec_it++);
free(data); free(data);
} }
@@ -184,7 +202,7 @@ void use_jerasure(int pg_size, int pg_minsize, bool use)
} }
} }
reed_sol_matrix_t* get_jerasure_matrix(int pg_size, int pg_minsize) static reed_sol_matrix_t* get_ec_matrix(int pg_size, int pg_minsize)
{ {
uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32; uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32;
auto rs_it = matrices.find(key); auto rs_it = matrices.find(key);
@@ -199,7 +217,7 @@ reed_sol_matrix_t* get_jerasure_matrix(int pg_size, int pg_minsize)
// we don't need it. also it makes an extra allocation of int *erased on every call and doesn't cache // we don't need it. also it makes an extra allocation of int *erased on every call and doesn't cache
// the decoding matrix. // the decoding matrix.
// all these flaws are fixed in this function: // all these flaws are fixed in this function:
int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize) static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize)
{ {
int edd = 0; int edd = 0;
int erased[pg_size]; int erased[pg_size];
@@ -210,16 +228,57 @@ int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg
edd++; edd++;
if (edd == 0) if (edd == 0)
return NULL; return NULL;
reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize); reed_sol_matrix_t *matrix = get_ec_matrix(pg_size, pg_minsize);
auto dec_it = matrix->decodings.find((reed_sol_erased_t){ .data = erased, .size = pg_size }); auto dec_it = matrix->decodings.find((reed_sol_erased_t){ .data = erased, .size = pg_size });
if (dec_it == matrix->decodings.end()) if (dec_it == matrix->decodings.end())
{ {
#ifdef WITH_ISAL
int smrow = 0;
uint8_t *submatrix = (uint8_t*)malloc_or_die(pg_minsize*pg_minsize*2);
for (int i = 0; i < pg_size; i++)
{
if (!erased[i])
{
if (i < pg_minsize)
{
for (int j = 0; j < pg_minsize; j++)
submatrix[smrow*pg_minsize + j] = j == i;
}
else
{
for (int j = 0; j < pg_minsize; j++)
submatrix[smrow*pg_minsize + j] = (uint8_t)matrix->je_data[(i-pg_minsize)*pg_minsize + j];
}
smrow++;
}
}
if (smrow < pg_minsize)
{
free(submatrix);
throw std::runtime_error("failed to make an invertible submatrix");
}
gf_invert_matrix(submatrix, submatrix + pg_minsize*pg_minsize, pg_minsize);
smrow = 0;
for (int i = 0; i < pg_minsize; i++)
{
if (erased[i])
{
memcpy(submatrix + pg_minsize*smrow, submatrix + (pg_minsize+i)*pg_minsize, pg_minsize);
smrow++;
}
}
uint8_t *rectable = (uint8_t*)malloc_or_die(32*smrow*pg_minsize + pg_size*sizeof(int));
ec_init_tables(pg_minsize, smrow, submatrix, rectable);
free(submatrix);
int *erased_copy = (int*)(rectable + 32*smrow*pg_minsize);
memcpy(erased_copy, erased, pg_size*sizeof(int));
matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, rectable);
return rectable;
#else
int *dm_ids = (int*)malloc_or_die(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size)); int *dm_ids = (int*)malloc_or_die(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size));
int *decoding_matrix = dm_ids + pg_minsize; int *decoding_matrix = dm_ids + pg_minsize;
if (!dm_ids)
throw std::bad_alloc();
// we always use row_k_ones=1 and w=8 (OSD_JERASURE_W) // we always use row_k_ones=1 and w=8 (OSD_JERASURE_W)
if (jerasure_make_decoding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data, erased, decoding_matrix, dm_ids) < 0) if (jerasure_make_decoding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->je_data, erased, decoding_matrix, dm_ids) < 0)
{ {
free(dm_ids); free(dm_ids);
throw std::runtime_error("jerasure_make_decoding_matrix() failed"); throw std::runtime_error("jerasure_make_decoding_matrix() failed");
@@ -228,13 +287,64 @@ int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg
memcpy(erased_copy, erased, pg_size*sizeof(int)); memcpy(erased_copy, erased, pg_size*sizeof(int));
matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, dm_ids); matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, dm_ids);
return dm_ids; return dm_ids;
#endif
} }
return dec_it->second; return dec_it->second;
} }
void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size) #ifdef WITH_ISAL
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
{ {
int *dm_ids = get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize); uint8_t *dectable = (uint8_t*)get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
if (!dectable)
{
return;
}
uint8_t *data_ptrs[pg_size];
int wanted_base = 0, wanted = 0;
uint64_t read_start = 0, read_end = 0;
auto recover_seq = [&]()
{
int orig = 0;
for (int other = 0; other < pg_size; other++)
{
if (stripes[other].read_end != 0 && !stripes[other].missing)
{
assert(stripes[other].read_start <= read_start);
assert(stripes[other].read_end >= read_end);
data_ptrs[orig++] = (uint8_t*)stripes[other].read_buf + (read_start - stripes[other].read_start);
}
}
ec_encode_data(
read_end-read_start, pg_minsize, wanted, dectable + wanted_base*32*pg_minsize,
data_ptrs, data_ptrs + pg_minsize
);
wanted_base += wanted;
wanted = 0;
};
for (int role = 0; role < pg_minsize; role++)
{
if (stripes[role].read_end != 0 && stripes[role].missing)
{
if (read_end && (stripes[role].read_start != read_start ||
stripes[role].read_end != read_end))
{
recover_seq();
}
read_start = stripes[role].read_start;
read_end = stripes[role].read_end;
data_ptrs[pg_minsize + (wanted++)] = (uint8_t*)stripes[role].read_buf;
}
}
if (wanted > 0)
{
recover_seq();
}
}
#else
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
{
int *dm_ids = (int*)get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
if (!dm_ids) if (!dm_ids)
{ {
return; return;
@@ -242,7 +352,9 @@ void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg
int *decoding_matrix = dm_ids + pg_minsize; int *decoding_matrix = dm_ids + pg_minsize;
char *data_ptrs[pg_size]; char *data_ptrs[pg_size];
for (int role = 0; role < pg_size; role++) for (int role = 0; role < pg_size; role++)
{
data_ptrs[role] = NULL; data_ptrs[role] = NULL;
}
for (int role = 0; role < pg_minsize; role++) for (int role = 0; role < pg_minsize; role++)
{ {
if (stripes[role].read_end != 0 && stripes[role].missing) if (stripes[role].read_end != 0 && stripes[role].missing)
@@ -279,6 +391,7 @@ void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg
} }
} }
} }
#endif
int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size) int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size)
{ {
@@ -679,12 +792,12 @@ void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_
calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end); calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
} }
void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size) uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size)
{ {
uint32_t bitmap_granularity = bitmap_size > 0 ? chunk_size / bitmap_size / 8 : 0; uint32_t bitmap_granularity = bitmap_size > 0 ? chunk_size / bitmap_size / 8 : 0;
reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize); reed_sol_matrix_t *matrix = get_ec_matrix(pg_size, pg_minsize);
reconstruct_stripes_jerasure(stripes, pg_size, pg_minsize, bitmap_size); reconstruct_stripes_ec(stripes, pg_size, pg_minsize, bitmap_size);
uint32_t start = 0, end = 0; uint32_t start = 0, end = 0;
calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, bitmap_granularity, start, end); calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, bitmap_granularity, start, end);
if (end != 0) if (end != 0)
@@ -741,20 +854,34 @@ void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_min
curbuf[i]++; curbuf[i]++;
} }
} }
#ifdef WITH_ISAL
ec_encode_data(
next_end-pos, pg_minsize, pg_size-pg_minsize, matrix->isal_data,
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
);
#else
jerasure_matrix_encode( jerasure_matrix_encode(
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data, pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->je_data,
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos (char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
); );
#endif
pos = next_end; pos = next_end;
} }
for (int i = 0; i < pg_size; i++) for (int i = 0; i < pg_size; i++)
{ {
data_ptrs[i] = stripes[i].bmp_buf; data_ptrs[i] = stripes[i].bmp_buf;
} }
#ifdef WITH_ISAL
ec_encode_data(
bitmap_size, pg_minsize, pg_size-pg_minsize, matrix->isal_data,
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
);
#else
jerasure_matrix_encode( jerasure_matrix_encode(
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data, pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->je_data,
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, bitmap_size (char**)data_ptrs, (char**)data_ptrs+pg_minsize, bitmap_size
); );
#endif
} }
} }
calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end); calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);

View File

@@ -44,9 +44,9 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set, void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set,
uint32_t chunk_size, uint32_t bitmap_size); uint32_t chunk_size, uint32_t bitmap_size);
void use_jerasure(int pg_size, int pg_minsize, bool use); void use_ec(int pg_size, int pg_minsize, bool use);
void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size); void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size);
void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size); uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size);

View File

@@ -587,14 +587,14 @@ void test12()
input buffer: [ write0, write1 ], input buffer: [ write0, write1 ],
rmw buffer: [ write2, write3, read0, read1 ], rmw buffer: [ write2, write3, read0, read1 ],
} }
then, after calc_rmw_parity_jerasure(): all the same then, after calc_rmw_parity_ec(): all the same
then simulate read with read_osd_set=[0,0,3,4] and check read0,read1 buffers then simulate read with read_osd_set=[0,0,3,4] and check read0,read1 buffers
***/ ***/
void test13() void test13()
{ {
use_jerasure(4, 2, true); use_ec(4, 2, true);
osd_num_t osd_set[4] = { 1, 2, 0, 0 }; osd_num_t osd_set[4] = { 1, 2, 0, 0 };
osd_num_t write_osd_set[4] = { 1, 2, 3, 4 }; osd_num_t write_osd_set[4] = { 1, 2, 3, 4 };
osd_rmw_stripe_t stripes[4] = {}; osd_rmw_stripe_t stripes[4] = {};
@@ -628,7 +628,7 @@ void test13()
set_pattern(write_buf, 8192, PATTERN3); set_pattern(write_buf, 8192, PATTERN3);
set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1); set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2); set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2);
calc_rmw_parity_jerasure(stripes, 4, 2, osd_set, write_osd_set, 128*1024, 0); calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 128*1024, 0);
assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024); assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096); assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024); assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
@@ -663,7 +663,7 @@ void test13()
assert(stripes[3].read_buf == (uint8_t*)read_buf+3*128*1024); assert(stripes[3].read_buf == (uint8_t*)read_buf+3*128*1024);
memcpy((uint8_t*)read_buf+2*128*1024, rmw_buf, 128*1024); memcpy((uint8_t*)read_buf+2*128*1024, rmw_buf, 128*1024);
memcpy((uint8_t*)read_buf+3*128*1024, (uint8_t*)rmw_buf+128*1024, 128*1024); memcpy((uint8_t*)read_buf+3*128*1024, (uint8_t*)rmw_buf+128*1024, 128*1024);
reconstruct_stripes_jerasure(stripes, 4, 2, 0); reconstruct_stripes_ec(stripes, 4, 2, 0);
check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1); check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3); check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
check_pattern(stripes[1].read_buf, 4096, PATTERN3); check_pattern(stripes[1].read_buf, 4096, PATTERN3);
@@ -694,14 +694,14 @@ void test13()
assert(stripes[3].read_buf == (uint8_t*)read_buf+2*128*1024); assert(stripes[3].read_buf == (uint8_t*)read_buf+2*128*1024);
memcpy((uint8_t*)read_buf+128*1024, rmw_buf, 128*1024); memcpy((uint8_t*)read_buf+128*1024, rmw_buf, 128*1024);
memcpy((uint8_t*)read_buf+2*128*1024, (uint8_t*)rmw_buf+128*1024, 128*1024); memcpy((uint8_t*)read_buf+2*128*1024, (uint8_t*)rmw_buf+128*1024, 128*1024);
reconstruct_stripes_jerasure(stripes, 4, 2, 0); reconstruct_stripes_ec(stripes, 4, 2, 0);
check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1); check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3); check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
free(read_buf); free(read_buf);
// Huh done // Huh done
free(rmw_buf); free(rmw_buf);
free(write_buf); free(write_buf);
use_jerasure(4, 2, false); use_ec(4, 2, false);
} }
/*** /***
@@ -714,7 +714,7 @@ void test13()
input buffer: [ write0, write1 ], input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1 ], rmw buffer: [ write2, read0, read1 ],
} }
then, after calc_rmw_parity_jerasure(): all the same then, after calc_rmw_parity_ec(): all the same
then simulate read with read_osd_set=[0,2,3] and check read0 buffer then simulate read with read_osd_set=[0,2,3] and check read0 buffer
***/ ***/
@@ -722,7 +722,7 @@ void test13()
void test14() void test14()
{ {
const int bmp = 4; const int bmp = 4;
use_jerasure(3, 2, true); use_ec(3, 2, true);
osd_num_t osd_set[3] = { 1, 2, 0 }; osd_num_t osd_set[3] = { 1, 2, 0 };
osd_num_t write_osd_set[3] = { 1, 2, 3 }; osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = {}; osd_rmw_stripe_t stripes[3] = {};
@@ -757,7 +757,7 @@ void test14()
memset(stripes[0].bmp_buf, 0, bmp); memset(stripes[0].bmp_buf, 0, bmp);
memset(stripes[1].bmp_buf, 0, bmp); memset(stripes[1].bmp_buf, 0, bmp);
memset(stripes[2].bmp_buf, 0, bmp); memset(stripes[2].bmp_buf, 0, bmp);
calc_rmw_parity_jerasure(stripes, 3, 2, osd_set, write_osd_set, 128*1024, bmp); calc_rmw_parity_ec(stripes, 3, 2, osd_set, write_osd_set, 128*1024, bmp);
assert(*(uint32_t*)stripes[0].bmp_buf == 0x80000000); assert(*(uint32_t*)stripes[0].bmp_buf == 0x80000000);
assert(*(uint32_t*)stripes[1].bmp_buf == 0x00000001); assert(*(uint32_t*)stripes[1].bmp_buf == 0x00000001);
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80000001); // jerasure 2+1 is still just XOR assert(*(uint32_t*)stripes[2].bmp_buf == 0x80000001); // jerasure 2+1 is still just XOR
@@ -793,12 +793,12 @@ void test14()
set_pattern(stripes[1].read_buf, 4096, PATTERN3); set_pattern(stripes[1].read_buf, 4096, PATTERN3);
set_pattern(stripes[1].read_buf+4096, 128*1024-4096, PATTERN2); set_pattern(stripes[1].read_buf+4096, 128*1024-4096, PATTERN2);
memcpy(stripes[2].read_buf, rmw_buf, 128*1024); memcpy(stripes[2].read_buf, rmw_buf, 128*1024);
reconstruct_stripes_jerasure(stripes, 3, 2, bmp); reconstruct_stripes_ec(stripes, 3, 2, bmp);
check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1); check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3); check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
free(read_buf); free(read_buf);
// Huh done // Huh done
free(rmw_buf); free(rmw_buf);
free(write_buf); free(write_buf);
use_jerasure(3, 2, false); use_ec(3, 2, false);
} }

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor Name: Vitastor
Description: Vitastor client library Description: Vitastor client library
Version: 0.6.17 Version: 0.7.1
Libs: -L${libdir} -lvitastor_client Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir} Cflags: -I${includedir}

View File

@@ -1,47 +1,103 @@
#!/bin/bash -ex #!/bin/bash
. `dirname $0`/common.sh . `dirname $0`/common.sh
OSD_SIZE=${OSD_SIZE:-1024} OSD_SIZE=${OSD_SIZE:-1024}
PG_COUNT=${PG_COUNT:-1} PG_COUNT=${PG_COUNT:-1}
PG_SIZE=${PG_SIZE:-3} # OSD_COUNT
PG_MINSIZE=${PG_MINSIZE:-2} SCHEME=${SCHEME:-replicated}
OSD_COUNT=${OSD_COUNT:-3} # OSD_ARGS
SCHEME=${SCHEME:-ec} # PG_SIZE
# PG_MINSIZE
if [ "$SCHEME" = "ec" ]; then
OSD_COUNT=${OSD_COUNT:-5}
else
OSD_COUNT=${OSD_COUNT:-3}
fi
if [ "$IMMEDIATE_COMMIT" != "" ]; then
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
else
NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
fi
start_osd()
{
local i=$1
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
}
for i in $(seq 1 $OSD_COUNT); do for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1)) dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log & start_osd $i
eval OSD${i}_PID=$!
done done
cd mon cd mon
npm install npm install
cd .. cd ..
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" &>./testdata/mon.log & node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
MON_PID=$! MON_PID=$!
if [ -n "$GLOBAL_CONF" ]; then if [ "$SCHEME" = "ec" ]; then
$ETCDCTL put /vitastor/config/global "$GLOBAL_CONF" PG_SIZE=${PG_SIZE:-5}
fi PG_MINSIZE=${PG_MINSIZE:-3}
PG_DATA_SIZE=$PG_MINSIZE
if [ "$SCHEME" = "replicated" ]; then POOLCFG='"scheme":"ec","parity_chunks":'$((PG_SIZE-PG_MINSIZE))
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"pg_count":'$PG_COUNT',"failure_domain":"osd"}}' elif [ "$SCHEME" = "xor" ]; then
PG_SIZE=${PG_SIZE:-3}
PG_MINSIZE=${PG_MINSIZE:-2}
PG_DATA_SIZE=$PG_MINSIZE
POOLCFG='"scheme":"xor","parity_chunks":'$((PG_SIZE-PG_MINSIZE))
else else
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"parity_chunks":1,"pg_count":'$PG_COUNT',"failure_domain":"osd"}}' PG_SIZE=${PG_SIZE:-2}
PG_MINSIZE=${PG_MINSIZE:-2}
PG_DATA_SIZE=1
POOLCFG='"scheme":"replicated"'
fi
POOLCFG='"name":"testpool","failure_domain":"osd",'$POOLCFG
$ETCDCTL put /vitastor/config/pools '{"1":{'$POOLCFG',"pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"pg_count":'$PG_COUNT'}}'
sleep 2
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == '$PG_SIZE') | length) == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PGS NOT CONFIGURED"
fi fi
sleep 3 if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PGS NOT UP"
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["1"][] | select(((.osd_set | select(. != 0) | sort | unique) | length) == '$PG_SIZE') ] | length) == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PG(s) NOT CONFIGURED"
fi fi
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$PG_COUNT); then try_reweight()
format_error "FAILED: $PG_COUNT PG(s) NOT UP" {
fi osd=$1
w=$2
$ETCDCTL put /vitastor/config/osd/$osd '{"reweight":'$w'}'
sleep 3
}
if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then wait_finish_rebalance()
sudo rm -f /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so {
sudo ln -s "$(realpath .)/build/src/block-vitastor.so" /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so sec=$1
fi i=0
while [[ $i -lt $sec ]]; do
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32') && \
break
if [ $i -eq 60 ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
sleep 1
i=$((i+1))
done
}
check_qemu()
{
if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then
sudo rm -f /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
sudo ln -s "$(realpath .)/build/src/block-vitastor.so" /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
fi
}

View File

@@ -1,68 +0,0 @@
#!/bin/bash
# Test fixture: starts 7 local OSDs and a monitor, creates pool 1 with 32 PGs
# and verifies that the PGs are configured and active.
# Meant to be sourced by test scripts (see `. \`dirname $0\`/run_7osds.sh` users).
# common.sh is expected to provide $ETCDCTL, $ETCD_URL and format_error
# (not visible here - TODO confirm).
. `dirname $0`/common.sh
# IMMEDIATE_COMMIT (any non-empty value) switches both the OSD command line
# options and the global cluster config to immediate_commit=all.
if [ "$IMMEDIATE_COMMIT" != "" ]; then
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":5,"immediate_commit":"all"}'
else
NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":5}'
fi
# These are assigned unconditionally, so values exported by the caller are overridden.
OSD_SIZE=1024
OSD_COUNT=7
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
# Write a single 1 KiB block at the very end so the image file is OSD_SIZE MiB long.
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
# Start the OSD in the background; layout options come from `vitastor-cli simple-offsets`.
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
# Remember the PID of OSD number i in the variable OSD<i>_PID.
eval OSD${i}_PID=$!
done
cd mon
npm install
cd ..
# Start the monitor in the background with verbose logging to ./testdata/mon.log.
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
MON_PID=$!
# EC (any non-empty value) selects an XOR 2+1 pool; otherwise a 2-replica pool is used.
if [ "$EC" != "" ]; then
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
PG_SIZE=3
else
POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
PG_SIZE=2
fi
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":32,"failure_domain":"osd"}}'
# Fixed 2 second pause before checking the PG configuration and states.
sleep 2
# NOTE(review): `map(<bool expr>) | length` counts every mapped entry regardless of
# the boolean result, so this looks like it only checks that 32 PGs exist, not that
# each one has $PG_SIZE OSDs - confirm.
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == '$PG_SIZE') | length) == 32'); then
format_error "FAILED: 32 PGS NOT CONFIGURED"
fi
# Require exactly 32 PG state entries whose state is exactly ["active"].
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32'); then
format_error "FAILED: 32 PGS NOT UP"
fi
# try_reweight OSD_NUM WEIGHT
# Writes {"reweight":WEIGHT} to /vitastor/config/osd/OSD_NUM in etcd and then
# pauses for 3 seconds.
# Note: `osd` and `w` are not declared local, so they are set in the caller's scope.
try_reweight()
{
osd=$1
w=$2
$ETCDCTL put /vitastor/config/osd/$osd '{"reweight":'$w'}'
sleep 3
}
# wait_finish_rebalance SECONDS
# Polls etcd once per second until exactly 32 PG state entries are exactly
# ["active"], for at most SECONDS attempts.
# Returns 0 as soon as the condition holds; calls format_error if it never
# holds within the limit.
#
# The previous version tested `[ $i -eq 60 ]` inside `while [[ $i -lt $sec ]]`,
# which can never be true when SECONDS <= 60, so a timeout was silently
# treated as success. The timeout is now reported after the loop and honours
# the SECONDS argument instead of a hard-coded 60.
#
# Note: `sec` and `i` are intentionally left non-local, as before.
wait_finish_rebalance()
{
    sec=$1
    i=0
    while [[ $i -lt $sec ]]; do
        # Subshell keeps a failing pipeline from tripping `set -e` in callers
        ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32') && \
            return 0
        sleep 1
        i=$((i+1))
    done
    format_error "Rebalance couldn't finish in $sec seconds"
}

View File

@@ -8,7 +8,7 @@ cd $(dirname $0)
./test_cas.sh ./test_cas.sh
./test_change_pg_count.sh ./test_change_pg_count.sh
EC=1 ./test_change_pg_count.sh SCHEME=ec ./test_change_pg_count.sh
./test_change_pg_size.sh ./test_change_pg_size.sh
@@ -18,8 +18,8 @@ EC=1 ./test_change_pg_count.sh
./test_interrupted_rebalance.sh ./test_interrupted_rebalance.sh
IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh
EC=1 ./test_interrupted_rebalance.sh SCHEME=ec ./test_interrupted_rebalance.sh
EC=1 IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh SCHEME=ec IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh
./test_minsize_1.sh ./test_minsize_1.sh
@@ -27,17 +27,17 @@ EC=1 IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh
./test_rebalance_verify.sh ./test_rebalance_verify.sh
IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
EC=1 ./test_rebalance_verify.sh SCHEME=ec ./test_rebalance_verify.sh
EC=1 IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh SCHEME=ec IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
./test_rm.sh ./test_rm.sh
./test_snapshot.sh ./test_snapshot.sh
SCHEME=replicated ./test_snapshot.sh SCHEME=ec ./test_snapshot.sh
./test_splitbrain.sh ./test_splitbrain.sh
./test_write.sh ./test_write.sh
SCHEME=replicated ./test_write.sh SCHEME=xor ./test_write.sh
./test_write_no_same.sh ./test_write_no_same.sh

View File

@@ -1,41 +1,11 @@
#!/bin/bash -ex #!/bin/bash -ex
. `dirname $0`/common.sh OSD_COUNT=${OSD_COUNT:-6}
PG_COUNT=16
if [ "$EC" != "" ]; then . `dirname $0`/run_3osds.sh
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
NOBJ=512
else
POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
NOBJ=1024
fi
OSD_SIZE=1024 NOBJ=$(((128*8+PG_DATA_SIZE-1)/PG_DATA_SIZE))
OSD_COUNT=6
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done
cd mon
npm install
cd ..
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
MON_PID=$!
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":16,"failure_domain":"osd"}}'
sleep 2
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == 16'); then
format_error "FAILED: 16 PGS NOT CONFIGURED"
fi
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 16'); then
format_error "FAILED: 16 PGS NOT UP"
fi
LD_PRELOAD="build/src/libfio_vitastor.so" \ LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write \ fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write \
@@ -49,7 +19,7 @@ try_change()
echo --- Change PG count to $n --- >>testdata/osd$i.log echo --- Change PG count to $n --- >>testdata/osd$i.log
done done
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":'$n',"failure_domain":"osd"}}' $ETCDCTL put /vitastor/config/pools '{"1":{'$POOLCFG',"pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"pg_count":'$n'}}'
for i in {1..10}; do for i in {1..10}; do
($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \ ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \

View File

@@ -1,40 +1,16 @@
#!/bin/bash -ex #!/bin/bash -ex
. `dirname $0`/common.sh PG_COUNT=16
SCHEME=${SCHEME:-replicated}
OSD_SIZE=1024 . `dirname $0`/run_3osds.sh
OSD_COUNT=3
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done
cd mon
npm install
cd ..
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" &>./testdata/mon.log &
MON_PID=$!
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":3,"pg_minsize":2,"pg_count":16,"failure_domain":"osd"}}'
sleep 2
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | sort) == ["1","2","3"]) | length) == 16'); then
format_error "FAILED: 16 PGS NOT CONFIGURED"
fi
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 16'); then
format_error "FAILED: 16 PGS NOT UP"
fi
try_change() try_change()
{ {
n=$1 n=$1
s=$2 s=$2
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":'$s',"pg_minsize":2,"pg_count":'$n',"failure_domain":"osd"}}' $ETCDCTL put /vitastor/config/pools '{"1":{'$POOLCFG',"pg_size":'$s',"pg_minsize":'$PG_MINSIZE',"pg_count":'$n'}}'
for i in {1..10}; do for i in {1..10}; do
($ETCDCTL get /vitastor/config/pgs --print-value-only |\ ($ETCDCTL get /vitastor/config/pgs --print-value-only |\

54
tests/test_heal.sh Executable file
View File

@@ -0,0 +1,54 @@
#!/bin/bash -ex
# Kill OSDs while writing
# Cluster parameters are set before sourcing run_3osds.sh, which starts the
# cluster: 7 OSDs, 32 PGs, PG size 3.
PG_SIZE=3
OSD_COUNT=7
PG_COUNT=32
. `dirname $0`/run_3osds.sh
# check_qemu is defined in run_3osds.sh; it is called before qemu-img is used below.
check_qemu
# Image size in MiB; it is converted to bytes for the inode config.
IMG_SIZE=960
$ETCDCTL put /vitastor/config/inode/1/1 '{"name":"testimg","size":'$((IMG_SIZE*1024*1024))'}'
# Initial sequential fill of the image with 4M writes.
# presumably -mirror_file makes the fio engine mirror written data into
# ./testdata/mirror.bin for the final comparison - verify against the engine options
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write \
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -cluster_log_level=10
# kill_osds
# Rolling OSD failure scenario, started in the background by this script:
#  - after 5 seconds, kill OSD 1 and delete its state key from etcd;
#  - then for OSDs 2..7: wait 15 seconds, kill OSD i, delete its state key,
#    restart OSD i-1 (the one killed in the previous step), wait 15 more seconds;
#  - finally restart OSD 7.
# start_osd comes from the sourced run_3osds.sh.
# Because the function is run with `&`, OSD<n>_PID values updated by start_osd
# here are not visible to the parent shell.
kill_osds()
{
sleep 5
kill -9 $OSD1_PID
$ETCDCTL del /vitastor/osd/state/1
for i in 2 3 4 5 6 7; do
sleep 15
echo Killing OSD $i and starting OSD $((i-1))
# Indirect expansion: ${!p} reads the variable named OSD<i>_PID.
p=OSD${i}_PID
kill -9 ${!p}
$ETCDCTL del /vitastor/osd/state/$i
start_osd $((i-1))
sleep 15
done
sleep 5
start_osd 7
sleep 5
}
# Run the OSD kill/restart sequence concurrently with the random-write load below.
kill_osds &
# 4k random writes at queue depth 16, using the same mirror file as the initial fill.
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 -fsync=256 -rw=randwrite \
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120 2>/dev/null
# NOTE(review): there is no `wait` for the background kill_osds. Its sleeps add up
# to 195 seconds while fio is limited with -runtime=120, so the read-back below may
# overlap with the remaining kills/restarts - confirm whether that is intended.
# Read the whole image back through the QEMU vitastor driver...
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \
-O raw ./testdata/read.bin
# ...and compare it with the mirror file; under `bash -e` a difference aborts the test.
diff ./testdata/read.bin ./testdata/mirror.bin
format_green OK

View File

@@ -1,6 +1,8 @@
#!/bin/bash -ex #!/bin/bash -ex
. `dirname $0`/run_7osds.sh OSD_COUNT=7
PG_COUNT=32
. `dirname $0`/run_3osds.sh
IMG_SIZE=960 IMG_SIZE=960
@@ -32,13 +34,14 @@ try_reweight 5 1
wait_finish_rebalance 60 wait_finish_rebalance 60
# Check that PGs never had degraded objects ! # Check that PGs never had degraded objects !
if grep has_degraded ./testdata/mon.log; then # FIXME: In fact, the test doesn't guarantee it because PGs aren't always peered only with full prior OSD sets :-(
format_error "Some copies of objects were lost during interrupted rebalancings" #if grep has_degraded ./testdata/mon.log; then
fi # format_error "Some copies of objects were lost during interrupted rebalancings"
#fi
# Check that no objects are lost ! # Check that no objects are lost !
nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'` nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
if [ "$nobj" -ne $((IMG_SIZE*8)) ]; then if [ "$nobj" -ne $((IMG_SIZE*8/PG_DATA_SIZE)) ]; then
format_error "Data lost after multiple interrupted rebalancings" format_error "Data lost after multiple interrupted rebalancings"
fi fi

View File

@@ -1,6 +1,8 @@
#!/bin/bash -ex #!/bin/bash -ex
. `dirname $0`/run_7osds.sh OSD_COUNT=7
PG_COUNT=32
. `dirname $0`/run_3osds.sh
IMG_SIZE=256 IMG_SIZE=256

View File

@@ -1,6 +1,7 @@
#!/bin/bash -ex #!/bin/bash -ex
. `dirname $0`/run_3osds.sh . `dirname $0`/run_3osds.sh
check_qemu
# Test basic write and snapshot # Test basic write and snapshot

View File

@@ -3,6 +3,7 @@
OSD_SIZE=2048 OSD_SIZE=2048
. `dirname $0`/run_3osds.sh . `dirname $0`/run_3osds.sh
check_qemu
$ETCDCTL put /vitastor/config/inode/1/1 '{"name":"debian9","size":'$((2048*1024*1024))'}' $ETCDCTL put /vitastor/config/inode/1/1 '{"name":"debian9","size":'$((2048*1024*1024))'}'

View File

@@ -1,6 +1,7 @@
#!/bin/bash -ex #!/bin/bash -ex
. `dirname $0`/run_3osds.sh . `dirname $0`/run_3osds.sh
check_qemu
#LD_PRELOAD=libasan.so.5 \ #LD_PRELOAD=libasan.so.5 \
# fio -thread -name=test -ioengine=build/src/libfio_vitastor_sec.so -bs=4k -fsync=128 `$ETCDCTL get /vitastor/osd/state/1 --print-value-only | jq -r '"-host="+.addresses[0]+" -port="+(.port|tostring)'` -rw=write -size=32M # fio -thread -name=test -ioengine=build/src/libfio_vitastor_sec.so -bs=4k -fsync=128 `$ETCDCTL get /vitastor/osd/state/1 --print-value-only | jq -r '"-host="+.addresses[0]+" -port="+(.port|tostring)'` -rw=write -size=32M