Compare commits


21 Commits

Author SHA1 Message Date
42ae8f91ee WIP/experimental LRC matrix generator 2022-06-10 18:50:09 +03:00
dac12d8a4c Implement metadata dump tool 2022-06-10 18:50:09 +03:00
1eec4407ab Fix inode creation when /index/maxid is out of sync 2022-06-06 16:35:51 +03:00
huy 3b7c6dcac2 Fix volume creation from snapshots in Cinder driver 2022-06-06 15:46:13 +03:00
342517d126 Fix typo 2022-06-05 00:45:02 +03:00
675bc12a13 Add extern "C" for systems like Gentoo which miss it in jerasure includes 2022-06-05 00:33:38 +03:00
101592bbff Release 0.7.1
- Add ISA-L erasure code implementation, now used automatically instead of jerasure when available
- Fix listings sending too many parallel requests to OSDs
- Fix rm-data crashing with --wait-list
- Remove empty inodes from statistics and `ls` output <inode_vanish_time> seconds after deletion
- Make monitor delete pool statistics when the pool is deleted and thus remove them from `df` output
- Log multiple etcd addresses in OSD logs correctly
- Fix true/false parsing in json configs like no_recovery/no_rebalance
- Show no_recovery, no_rebalance, readonly flags in status
2022-06-05 00:07:24 +03:00
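As a sketch of the fixed true/false parsing and the new status flags from the 0.7.1 notes above (assuming the default /vitastor etcd prefix), a global configuration like the following should now be interpreted consistently whether the flags are JSON booleans or strings such as "yes"/"1", and the enabled flags appear in `vitastor-cli status`:

```
etcdctl --endpoints=... put /vitastor/config/global \
    '{"no_recovery": true, "no_rebalance": false, "readonly": false}'
```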
be4087d9d2 Add a FIXME to test_interrupted_rebalance 2022-06-05 00:06:56 +03:00
404e43dd2d Note that ISA-L does not need to be enabled separately 2022-06-04 22:58:02 +03:00
87613ed590 Add ISA-L into RPM specs 2022-06-04 13:27:06 +03:00
2a2e914ef9 Show no_recovery, no_rebalance and readonly flags in status 2022-06-04 13:27:06 +03:00
0cdc9292c8 Fix true/false parsing in json configs like no_recovery/no_rebalance 2022-06-04 13:27:06 +03:00
3e1b03bb5c Show all etcd addresses in the "reporting to..." message 2022-06-04 13:27:06 +03:00
36e851505a Make monitor delete pool statistics when the pool is deleted 2022-06-04 13:27:06 +03:00
1efbbb0c36 Make deleted inodes vanish from statistics after 60 seconds 2022-06-04 13:27:06 +03:00
088dd15449 Exclude empty inodes from stats 2022-06-04 00:18:17 +03:00
4a531d7b8b Fix listings sending too many parallel requests to OSDs, fix rm-data crashing with --wait-list 2022-06-03 23:36:37 +03:00
a0cae4c180 Rename "jerasure" to "ec" in pool configuration, function names, fix documentation and Debian build scripts
Old pool configurations with "jerasure" also remain supported as an alias for "ec"
2022-06-03 15:40:00 +03:00
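For reference, a pool created under the new naming looks like the following sketch (mirroring the QuickStart example changed further below); "jerasure" is still accepted in place of "ec":

```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
    "scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
```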
c4eb46600d Merge run_3osds and run_7osds scripts 2022-06-03 01:56:36 +03:00
21b306e25f Add ISA-L support 2022-06-02 01:47:33 +03:00
d8313e939a Release 0.7.0
- Add documentation! :-) in Russian and English
- Implement an NFS proxy that emulates file-based access to Vitastor images
  for non-QEMU hypervisors like VMware, as a better alternative to iSCSI
- Implement "primary affinity tags"
- Add a patch for libvirt 6.0
- Fix free_down_raw in cli status
- Fix a rare bug where OSDs could drop unrelated connections on errors
2022-05-29 23:39:53 +03:00
83 changed files with 1160 additions and 855 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor)
set(VERSION "0.6.17")
set(VERSION "0.7.1")
add_subdirectory(src)

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.6.17
VERSION ?= v0.7.1
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.6.17
image: vitalif/vitastor-csi:v0.7.1
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.6.17
image: vitalif/vitastor-csi:v0.7.1
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.6.17"
vitastorCSIDriverVersion = "0.7.1"
)
// Config struct fills the parameters of request or user input

debian/changelog
View File

@@ -1,4 +1,18 @@
vitastor (0.6.17-1) unstable; urgency=medium
vitastor (0.7.1-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.7.0-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Sun, 29 May 2022 23:39:13 +0300
vitastor (0.6.3-1) unstable; urgency=medium
* RDMA support
* Bugfixes

debian/control
View File

@@ -2,7 +2,7 @@ Source: vitastor
Section: admin
Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
Standards-Version: 4.5.0
Homepage: https://vitastor.io/
Rules-Requires-Root: no

debian/libisal.pc (new file)
View File

@@ -0,0 +1,11 @@
prefix=/usr
exec_prefix=${prefix}
libdir=${prefix}/lib/x86_64-linux-gnu
includedir=${prefix}/include
Name: libisal
Description: Library for storage systems
Version: 2.30.0
Libs: -L${libdir} -lisal
Libs.private:
Cflags: -I${includedir}

View File

@@ -1,2 +1,3 @@
usr/bin/vitastor-osd
usr/bin/vitastor-dump-journal
usr/bin/vitastor-dump-meta

View File

@@ -22,10 +22,11 @@ RUN apt-get update
RUN apt-get -y install fio liburing1 liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep fio
RUN apt-get --download-only source fio
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev
ADD . /root/vitastor
RUN set -e -x; \
[ -e /usr/lib/x86_64-linux-gnu/pkgconfig/libisal.pc ] || cp /root/vitastor/debian/libisal.pc /usr/lib/x86_64-linux-gnu/pkgconfig; \
mkdir -p /root/fio-build/; \
cd /root/fio-build/; \
rm -rf /root/fio-build/*; \
@@ -33,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.6.17; \
cd vitastor-0.6.17; \
cp -r /root/vitastor vitastor-0.7.1; \
cd vitastor-0.7.1; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -47,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.6.17.orig.tar.xz vitastor-0.6.17; \
cd vitastor-0.6.17; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.7.1.orig.tar.xz vitastor-0.7.1; \
cd vitastor-0.7.1; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -23,6 +23,7 @@ initialization and can be changed with an OSD restart.
- [no_rebalance](#no_rebalance)
- [print_stats_interval](#print_stats_interval)
- [slow_log_interval](#slow_log_interval)
- [inode_vanish_time](#inode_vanish_time)
- [max_write_iodepth](#max_write_iodepth)
- [min_flusher_count](#min_flusher_count)
- [max_flusher_count](#max_flusher_count)
@@ -163,6 +164,13 @@ Time interval at which OSDs dump slow or stuck operations on stdout, if
there are any. It is also the time after which an operation is considered
"slow".
## inode_vanish_time
- Type: seconds
- Default: 60
Number of seconds after which a deleted inode is removed from OSD statistics.
## max_write_iodepth
- Type: integer
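To illustrate the new parameter documented above: it can be overridden cluster-wide roughly like this (a sketch assuming the default /vitastor etcd prefix; 120 is an arbitrary example value, and note that `put` replaces the whole key, so other global options would need to be included too):

```
etcdctl --endpoints=... put /vitastor/config/global '{"inode_vanish_time": 120}'
```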

View File

@@ -24,6 +24,7 @@
- [no_rebalance](#no_rebalance)
- [print_stats_interval](#print_stats_interval)
- [slow_log_interval](#slow_log_interval)
- [inode_vanish_time](#inode_vanish_time)
- [max_write_iodepth](#max_write_iodepth)
- [min_flusher_count](#min_flusher_count)
- [max_flusher_count](#max_flusher_count)
@@ -169,6 +170,13 @@ OSD.
медленных или зависших операций, если таковые имеются. Также время, при
превышении которого операция считается "медленной".
## inode_vanish_time
- Тип: секунды
- Значение по умолчанию: 60
Число секунд, через которое удалённый инод удаляется из статистики OSD.
## max_write_iodepth
- Тип: целое число

View File

@@ -106,9 +106,12 @@ Pool name.
- Type: string
- Required
- One of: "replicated", "xor" or "jerasure"
- One of: "replicated", "xor", "ec" or "jerasure"
Redundancy scheme used for data in this pool.
Redundancy scheme used for data in this pool. "jerasure" is an alias for "ec",
both use Reed-Solomon-Vandermonde codes based on ISA-L or jerasure libraries.
The fast ISA-L based implementation is used automatically when available; the
slower jerasure version is used otherwise.
## pg_size
@@ -243,7 +246,7 @@ of the OSDs containing a data chunk for a PG.
{
"2": {
"name":"ecpool",
"scheme":"jerasure",
"scheme":"ec",
"pg_size":3,
"parity_chunks":1,
"pg_minsize":2,

View File

@@ -106,9 +106,13 @@
- Тип: строка
- Обязательный
- Возможные значения: "replicated", "xor" или "jerasure"
- Возможные значения: "replicated", "xor", "ec" или "jerasure"
Схема избыточности, используемая в данном пуле.
Схема избыточности, используемая в данном пуле. "jerasure" - синоним для "ec",
в обеих схемах используются коды Рида-Соломона-Вандермонда, реализованные на
основе библиотек ISA-L или jerasure. Быстрая реализация на основе ISA-L
используется автоматически, когда доступна, в противном случае используется
более медленная jerasure-версия.
## pg_size
@@ -242,7 +246,7 @@ PG в Vitastor эферемерны, то есть вы можете менят
{
"2": {
"name":"ecpool",
"scheme":"jerasure",
"scheme":"ec",
"pg_size":3,
"parity_chunks":1,
"pg_minsize":2,

View File

@@ -158,6 +158,13 @@
Временной интервал, с которым OSD выводят в стандартный вывод список
медленных или зависших операций, если таковые имеются. Также время, при
превышении которого операция считается "медленной".
- name: inode_vanish_time
type: sec
default: 60
info: |
Number of seconds after which a deleted inode is removed from OSD statistics.
info_ru: |
Число секунд, через которое удалённый инод удаляется из статистики OSD.
- name: max_write_iodepth
type: int
default: 128

View File

@@ -15,7 +15,8 @@
- gcc and g++ 8 or newer, clang 10 or newer, or other compiler with C++11 plus
designated initializers support from C++20
- CMake
- liburing, jerasure headers
- liburing, jerasure headers and libraries
- ISA-L, libibverbs headers and libraries (optional)
- tcmalloc (google-perftools-dev)
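As a rough illustration only (not the project's build instructions), on Debian-based systems the requirements above correspond approximately to the packages listed in Build-Depends of debian/control in this change:

```
apt-get install g++ cmake liburing-dev libjerasure-dev libgf-complete-dev \
    libisal-dev libibverbs-dev libgoogle-perftools-dev
```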
## Basic instructions

View File

@@ -15,7 +15,8 @@
- gcc и g++ >= 8, либо clang >= 10, либо другой компилятор с поддержкой C++11 плюс
назначенных инициализаторов (designated initializers) из C++20
- CMake
- Заголовки liburing, jerasure
- Заголовки и библиотеки liburing, jerasure
- Опционально - заголовки и библиотеки ISA-L, libibverbs
- tcmalloc (google-perftools-dev)
## Базовая инструкция

View File

@@ -15,7 +15,7 @@
- Basic part: highly-available block storage with symmetric clustering and no SPOF
- [Performance](../performance/comparison1.en.md) ;-D
- [Multiple redundancy schemes](../config/pool.en.md#scheme): Replication, XOR n+1, Reed-Solomon erasure codes
based on jerasure library with any number of data and parity drives in a group
based on jerasure and ISA-L libraries with any number of data and parity drives in a group
- Configuration via simple JSON data structures in etcd (parameters, pools and images)
- Automatic data distribution over OSDs, with support for:
- Mathematical optimization for better uniformity and less data movement

View File

@@ -15,7 +15,7 @@
- Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа
- [Производительность](../comparison1.ru.md) ;-D
- [Несколько схем отказоустойчивости](../config/pool.ru.md#scheme): репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок
Рида-Соломона на основе библиотеки jerasure с любым числом дисков данных и чётности в группе
Рида-Соломона на основе библиотек jerasure и ISA-L с любым числом дисков данных и чётности в группе
- Конфигурация через простые человекочитаемые JSON-структуры в etcd
- Автоматическое распределение данных по OSD, с поддержкой:
- Математической оптимизации для лучшей равномерности распределения и минимизации перемещений данных

View File

@@ -63,11 +63,11 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
"scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'
```
For jerasure pools the configuration should look like the following:
For EC pools the configuration should look like the following:
```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
```
After you do this, one of the monitors will configure PGs and OSDs will start them.

View File

@@ -75,7 +75,7 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}`
```
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.

View File

@@ -10,25 +10,18 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
if (!new_pg_history[new_pg])
{
new_pg_history[new_pg] = {
osd_set_epochs: {},
osd_sets: {},
all_peers: {},
epoch: 0,
};
}
const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg];
nh.osd_set_epochs[prev_pgs[old_pg].join(' ')] = { osd_set: prev_pgs[old_pg] };
nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg];
if (oh && oh.osd_sets && oh.osd_sets.length)
{
for (const pg of oh.osd_sets)
{
nh.osd_set_epochs[pg.join(' ')] = { osd_set: pg };
}
}
if (oh && oh.osd_set_epochs && oh.osd_set_epochs.length)
{
for (const pg of oh.osd_set_epochs)
{
nh.osd_set_epochs[pg.osd_set.join(' ')] = { osd_set: pg.osd_set };
nh.osd_sets[pg.join(' ')] = pg;
}
}
if (oh && oh.all_peers && oh.all_peers.length)
@@ -46,7 +39,7 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
function finish_pg_history(merged_history)
{
merged_history.osd_set_epochs = Object.values(merged_history.osd_set_epochs);
merged_history.osd_sets = Object.values(merged_history.osd_sets);
merged_history.all_peers = Object.values(merged_history.all_peers);
}

View File

@@ -105,6 +105,7 @@ const etcd_tree = {
no_rebalance: false,
print_stats_interval: 3,
slow_log_interval: 10,
inode_vanish_time: 60,
osd_memlock: false,
// blockstore - fixed in superblock
block_size,
@@ -147,11 +148,11 @@ const etcd_tree = {
/* pools: {
<id>: {
name: 'testpool',
// jerasure uses Reed-Solomon-Vandermonde codes
scheme: 'replicated' | 'xor' | 'jerasure',
// 'ec' uses Reed-Solomon-Vandermonde codes, 'jerasure' is an alias for 'ec'
scheme: 'replicated' | 'xor' | 'ec' | 'jerasure',
pg_size: 3,
pg_minsize: 2,
// number of parity chunks, required for jerasure
// number of parity chunks, required for EC
parity_chunks?: 1,
pg_count: 100,
failure_domain: 'host',
@@ -276,12 +277,7 @@ const etcd_tree = {
history: {
/* <pool_id>: {
<pg_id>: {
osd_set_epochs: {
osd_set: osd_num_t[],
min_epoch: uint64_t,
max_epoch: uint64_t,
}[],
osd_sets: osd_num_t[][], // outdated
osd_sets: osd_num_t[][],
all_peers: osd_num_t[],
epoch: uint64_t,
},
@@ -952,6 +948,18 @@ class Mon
osd_set,
primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds),
};
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
prev_pgs[i].filter(osd_num => osd_num).length > 0)
{
pg_history[i] = pg_history[i] || {};
pg_history[i].osd_sets = pg_history[i].osd_sets || [];
pg_history[i].osd_sets.push(prev_pgs[i]);
}
if (pg_history[i] && pg_history[i].osd_sets)
{
pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
.reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
}
});
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
{
@@ -1006,14 +1014,15 @@ class Mon
console.log('Pool ID '+pool_id+' is invalid');
return false;
}
if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' && pool_cfg.scheme !== 'jerasure')
if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' &&
pool_cfg.scheme !== 'ec' && pool_cfg.scheme !== 'jerasure')
{
if (warn)
console.log('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated" and "jerasure" required)');
console.log('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated", "ec" and "jerasure" required)');
return false;
}
if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || pool_cfg.pg_size > 256 ||
(pool_cfg.scheme === 'xor' || pool_cfg.scheme == 'jerasure') && pool_cfg.pg_size < 3)
pool_cfg.scheme !== 'replicated' && pool_cfg.pg_size < 3)
{
if (warn)
console.log('Pool '+pool_id+' has invalid pg_size');
@@ -1032,7 +1041,8 @@ class Mon
console.log('Pool '+pool_id+' has invalid parity_chunks (must be 1)');
return false;
}
if (pool_cfg.scheme === 'jerasure' && (pool_cfg.parity_chunks < 1 || pool_cfg.parity_chunks > pool_cfg.pg_size-2))
if ((pool_cfg.scheme === 'ec' || pool_cfg.scheme === 'jerasure') &&
(pool_cfg.parity_chunks < 1 || pool_cfg.parity_chunks > pool_cfg.pg_size-2))
{
if (warn)
console.log('Pool '+pool_id+' has invalid parity_chunks (must be between 1 and pg_size-2)');
@@ -1146,6 +1156,10 @@ class Mon
{
prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set;
}
// Also delete pool statistics
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
} });
this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []);
}
}

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.6.17'
VERSION = '0.7.1'
LOG = logging.getLogger(__name__)
@@ -464,7 +464,7 @@ class VitastorDriver(driver.CloneableImageVD,
vol_name = utils.convert_str(volume.name)
snap_name = utils.convert_str(snapshot.name)
snap = self._get_image(vol_name+'@'+snap_name)
snap = self._get_image('volume-'+snapshot.volume_id+'@'+snap_name)
if not snap:
raise exception.SnapshotNotFound(snapshot_id = snap_name)
snap_inode_id = int(resp['responses'][0]['kvs'][0]['value']['id'])

View File

@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.6.17/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.6.17$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.7.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.7.1$(rpm --eval '%dist').tar.gz *

View File

@@ -9,7 +9,8 @@ WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gcc make cmake gperftools-devel fio rh-nodejs12 jerasure-devel gf-complete-devel rdma-core-devel
RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gcc make cmake gperftools-devel \
fio rh-nodejs12 jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel
RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
@@ -34,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.6.17.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.7.1.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.6.17
Version: 0.7.1
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.6.17.el7.tar.gz
Source0: vitastor-0.7.1.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@@ -13,6 +13,7 @@ BuildRequires: devtoolset-9-gcc-c++
BuildRequires: rh-nodejs12
BuildRequires: rh-nodejs12-npm
BuildRequires: jerasure-devel
BuildRequires: libisa-l-devel
BuildRequires: gf-complete-devel
BuildRequires: libibverbs-devel
BuildRequires: cmake
@@ -32,6 +33,7 @@ size with configurable redundancy (replication or erasure codes/XOR).
%package -n vitastor-osd
Summary: Vitastor - OSD
Requires: libJerasure2
Requires: libisa-l
Requires: liburing >= 0.6
Requires: vitastor-client = %{version}-%{release}
@@ -111,6 +113,7 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-osd
%_bindir/vitastor-osd
%_bindir/vitastor-dump-journal
%_bindir/vitastor-dump-meta
%files -n vitastor-mon

View File

@@ -6,10 +6,12 @@ FROM centos:8
WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN sed -i 's/^mirrorlist=/#mirrorlist=/; s!#baseurl=http://mirror.centos.org/!baseurl=http://vault.centos.org/!' /etc/yum.repos.d/*.repo
RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugins-core
RUN sed -i 's/^mirrorlist=/#mirrorlist=/; s!#baseurl=.*!baseurl=http://vault.centos.org/centos/8.4.2105/virt/$basearch/$avdir/!; s!^baseurl=.*Source/.*!baseurl=http://vault.centos.org/centos/8.4.2105/virt/Source/advanced-virtualization/!' /etc/yum.repos.d/CentOS-Advanced-Virtualization.repo
RUN yum -y install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm
RUN dnf -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel \
fio nodejs rpm-build jerasure-devel gf-complete-devel libibverbs-devel libarchive cmake
fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel libibverbs-devel libarchive cmake
RUN dnf download --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec
@@ -33,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.6.17.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.7.1.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,17 +1,18 @@
Name: vitastor
Version: 0.6.17
Version: 0.7.1
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.6.17.el8.tar.gz
Source0: vitastor-0.7.1.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: gcc-toolset-9-gcc-c++
BuildRequires: nodejs >= 10
BuildRequires: jerasure-devel
BuildRequires: libisa-l-devel
BuildRequires: gf-complete-devel
BuildRequires: libibverbs-devel
BuildRequires: cmake
@@ -31,6 +32,7 @@ size with configurable redundancy (replication or erasure codes/XOR).
%package -n vitastor-osd
Summary: Vitastor - OSD
Requires: libJerasure2
Requires: libisa-l
Requires: liburing >= 0.6
Requires: vitastor-client = %{version}-%{release}
@@ -108,6 +110,7 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-osd
%_bindir/vitastor-osd
%_bindir/vitastor-dump-journal
%_bindir/vitastor-dump-meta
%files -n vitastor-mon

View File

@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.6.17")
add_definitions(-DVERSION="0.7.1")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -50,6 +50,10 @@ pkg_check_modules(IBVERBS libibverbs)
if (IBVERBS_LIBRARIES)
add_definitions(-DWITH_RDMA)
endif (IBVERBS_LIBRARIES)
pkg_check_modules(ISAL libisal)
if (ISAL_LIBRARIES)
add_definitions(-DWITH_ISAL)
endif (ISAL_LIBRARIES)
include_directories(
../
@@ -104,6 +108,7 @@ target_link_libraries(vitastor-osd
vitastor_common
vitastor_blk
Jerasure
${ISAL_LIBRARIES}
${IBVERBS_LIBRARIES}
)
@@ -193,6 +198,11 @@ add_executable(vitastor-dump-journal
dump_journal.cpp crc32c.c
)
# vitastor-dump-meta
add_executable(vitastor-dump-meta
dump_meta.cpp
)
if (${WITH_QEMU})
# qemu_driver.so
add_library(qemu_vitastor SHARED
@@ -225,7 +235,7 @@ target_link_libraries(osd_test tcmalloc_minimal)
# osd_rmw_test
add_executable(osd_rmw_test osd_rmw_test.cpp allocator.cpp)
target_link_libraries(osd_rmw_test Jerasure tcmalloc_minimal)
target_link_libraries(osd_rmw_test Jerasure ${ISAL_LIBRARIES} tcmalloc_minimal)
# stub_uring_osd
add_executable(stub_uring_osd
@@ -270,7 +280,7 @@ target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mo
### Install
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-dump-meta vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
install(

View File

@@ -19,7 +19,11 @@
#include "timerfd_manager.h"
// Memory alignment for direct I/O (usually 512 bytes)
// All other alignments must be a multiple of this one
#ifndef DIRECT_IO_ALIGNMENT
#define DIRECT_IO_ALIGNMENT 512
#endif
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif

View File

@@ -912,7 +912,11 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
? clean_it->second.location : UINT64_MAX;
if (exists && clean_loc == UINT64_MAX)
{
bs->inode_space_stats[oid.inode] -= bs->block_size;
auto & sp = bs->inode_space_stats[oid.inode];
if (sp > bs->block_size)
sp -= bs->block_size;
else
bs->inode_space_stats.erase(oid.inode);
}
bs->erase_dirty(dirty_it, dirty_end, clean_loc);
// Remove it from the flusher's queue, too

View File

@@ -109,25 +109,25 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
disk_alignment = 4096;
}
else if (disk_alignment % MEM_ALIGNMENT)
else if (disk_alignment % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (!journal_block_size)
{
journal_block_size = 4096;
}
else if (journal_block_size % MEM_ALIGNMENT)
else if (journal_block_size % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (!meta_block_size)
{
meta_block_size = 4096;
}
else if (meta_block_size % MEM_ALIGNMENT)
else if (meta_block_size % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (data_offset % disk_alignment)
{

View File

@@ -200,7 +200,11 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
}
else if (IS_DELETE(dirty_it->second.state))
{
inode_space_stats[dirty_it->first.oid.inode] -= block_size;
auto & sp = inode_space_stats[dirty_it->first.oid.inode];
if (sp > block_size)
sp -= block_size;
else
inode_space_stats.erase(dirty_it->first.oid.inode);
}
}
if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||

View File

@@ -39,6 +39,7 @@ public:
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
bool no_recovery = false, no_rebalance = false, readonly = false;
int waiting = 0;
cli_result_t etcd_err;

View File

@@ -276,7 +276,8 @@ resume_4:
new_id = 1+INODE_NO_POOL(kv.value.uint64_value());
max_id_mod_rev = kv.mod_revision;
}
auto ino_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(new_pool_id, 0));
// Also check existing inodes - for the case when some inodes are created without changing /index/maxid
auto ino_it = parent->cli->st_cli.inode_config.lower_bound(INODE_WITH_POOL(new_pool_id+1, 0));
if (ino_it != parent->cli->st_cli.inode_config.begin())
{
ino_it--;

View File

@@ -127,7 +127,7 @@ resume_1:
pool_stats[pool_cfg.id] = json11::Json::object {
{ "name", pool_cfg.name },
{ "pg_count", pool_cfg.pg_count },
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "jerasure" },
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
: "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) },

View File

@@ -64,8 +64,9 @@ struct rm_inode_t
}
rm->obj_pos = rm->objects.begin();
lists.push_back(rm);
if (parent->list_first)
if (parent->list_first && !(status & INODE_LIST_DONE))
{
// The listing object is dead when DONE => don't call next()
parent->cli->list_inode_next(lister, 1);
}
if (status & INODE_LIST_DONE)

View File

@@ -14,6 +14,11 @@
std::function<bool(cli_result_t &)> cli_tool_t::simple_offsets(json11::Json cfg)
{
std::string device = cfg["device"].string_value();
if (device == "")
{
fprintf(stderr, "Device path is missing\n");
exit(1);
}
uint64_t object_size = parse_size(cfg["object_size"].string_value());
uint64_t bitmap_granularity = parse_size(cfg["bitmap_granularity"].string_value());
uint64_t journal_size = parse_size(cfg["journal_size"].string_value());

View File

@@ -5,6 +5,7 @@
#include "cluster_client.h"
#include "base64.h"
#include "pg_states.h"
#include "http_client.h"
// Print cluster status:
// etcd, mon, osd states
@@ -207,6 +208,9 @@ resume_2:
obj_n = agg_stats["object_counts"]["incomplete"].uint64_value();
if (obj_n > 0)
more_states += ", "+format_size(obj_n*object_size)+" incomplete";
bool readonly = json_is_true(parent->cli->merged_config["readonly"]);
bool no_recovery = json_is_true(parent->cli->merged_config["no_recovery"]);
bool no_rebalance = json_is_true(parent->cli->merged_config["no_rebalance"]);
std::string recovery_io;
{
uint64_t deg_bps = agg_stats["recovery_stats"]["degraded"]["bps"].uint64_value();
@@ -214,9 +218,19 @@ resume_2:
uint64_t misp_bps = agg_stats["recovery_stats"]["misplaced"]["bps"].uint64_value();
uint64_t misp_iops = agg_stats["recovery_stats"]["misplaced"]["iops"].uint64_value();
if (deg_iops > 0 || deg_bps > 0)
recovery_io += " recovery: "+format_size(deg_bps)+"/s, "+format_size(deg_iops, true)+" op/s\n";
{
recovery_io += " recovery: "+std::string(no_recovery ? "disabled, " : "")+
format_size(deg_bps)+"/s, "+format_size(deg_iops, true)+" op/s\n";
}
else if (no_recovery)
recovery_io += " recovery: disabled\n";
if (misp_iops > 0 || misp_bps > 0)
recovery_io += " rebalance: "+format_size(misp_bps)+"/s, "+format_size(misp_iops, true)+" op/s\n";
{
recovery_io += " rebalance: "+std::string(no_rebalance ? "disabled, " : "")+
format_size(misp_bps)+"/s, "+format_size(misp_iops, true)+" op/s\n";
}
else if (no_rebalance)
recovery_io += " rebalance: disabled\n";
}
if (parent->json_output)
{
@@ -233,6 +247,9 @@ resume_2:
{ "free_raw", free_raw },
{ "down_raw", down_raw },
{ "free_down_raw", free_down_raw },
{ "readonly", readonly },
{ "no_recovery", no_recovery },
{ "no_rebalance", no_rebalance },
{ "clean_data", agg_stats["object_counts"]["clean"].uint64_value() * object_size },
{ "misplaced_data", agg_stats["object_counts"]["misplaced"].uint64_value() * object_size },
{ "degraded_data", agg_stats["object_counts"]["degraded"].uint64_value() * object_size },
@@ -259,7 +276,7 @@ resume_2:
" pools: %d / %d active\n"
" pgs: %s\n"
" \n"
" io:\n"
" io%s:\n"
" client:%s %s/s rd, %s op/s rd, %s/s wr, %s op/s wr\n"
"%s",
etcd_alive, etcd_states.size(), format_size(etcd_db_size).c_str(),
@@ -272,6 +289,7 @@ resume_2:
format_size(agg_stats["object_counts"]["clean"].uint64_value() * object_size).c_str(), more_states.c_str(),
pools_active, pool_count,
pgs_by_state_str.c_str(),
readonly ? " (read-only mode)" : "",
recovery_io.size() > 0 ? " " : "",
format_size(agg_stats["op_stats"]["primary_read"]["bps"].uint64_value()).c_str(),
format_size(agg_stats["op_stats"]["primary_read"]["iops"].uint64_value(), true).c_str(),

View File

@@ -279,6 +279,11 @@ static uint32_t is_power_of_two(uint64_t value)
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
this->merged_config = config;
for (auto & kv: this->config.object_items())
{
this->merged_config[kv.first] = kv.second;
}
bs_block_size = config["block_size"].uint64_value();
bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
if (!bs_block_size)

View File

@@ -111,6 +111,7 @@ public:
etcd_state_client_t st_cli;
osd_messenger_t msgr;
json11::Json config;
json11::Json::object merged_config;
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
~cluster_client_t();

View File

@@ -95,7 +95,7 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
}
for (auto & hist_item: pg.target_history)
{
for (auto pg_osd: hist_item.osd_set)
for (auto pg_osd: hist_item)
{
if (pg_osd != 0)
{
@@ -105,14 +105,11 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
}
for (osd_num_t peer_osd: all_peers)
{
if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
{
r->list_osds.push_back((inode_list_osd_t){
.pg = r,
.osd_num = peer_osd,
.sent = false,
});
}
r->list_osds.push_back((inode_list_osd_t){
.pg = r,
.osd_num = peer_osd,
.sent = false,
});
}
}
else
@@ -156,16 +153,6 @@ void cluster_client_t::continue_listing(inode_list_t *lst)
{
if (lst->done_pgs >= lst->pgs.size())
{
// All done
for (int i = 0; i < lists.size(); i++)
{
if (lists[i] == lst)
{
lists.erase(lists.begin()+i, lists.begin()+i+1);
break;
}
}
delete lst;
return;
}
if (lst->want <= 0)
@@ -181,7 +168,7 @@ void cluster_client_t::continue_listing(inode_list_t *lst)
send_list(&lst->pgs[i]->list_osds[j]);
if (lst->want <= 0)
{
break;
return;
}
}
}
@@ -271,6 +258,24 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
lst->callback(lst, std::move(pg->objects), pg->pg_num, pg->cur_primary, status);
lst->pgs[pg->pos] = NULL;
delete pg;
if (lst->done_pgs >= lst->pgs.size())
{
// All done
for (int i = 0; i < lists.size(); i++)
{
if (lists[i] == lst)
{
lists.erase(lists.begin()+i, lists.begin()+i+1);
break;
}
}
delete lst;
return;
}
}
else
{
lst->want++;
}
continue_listing(lst);
};

View File

@@ -52,7 +52,7 @@ int main(int argc, char *argv[])
self.journal_block = strtoul(argv[b+1], NULL, 10);
self.journal_offset = strtoull(argv[b+2], NULL, 10);
self.journal_len = strtoull(argv[b+3], NULL, 10);
if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
if (self.journal_block < DIRECT_IO_ALIGNMENT || (self.journal_block % DIRECT_IO_ALIGNMENT) ||
self.journal_block > 128*1024)
{
printf("Invalid journal block size\n");

src/dump_meta.cpp (new file)
View File

@@ -0,0 +1,173 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#define _LARGEFILE64_SOURCE
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <malloc.h>
#include <linux/fs.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <stdio.h>
#include "object_id.h"
#include "osd_id.h"
// "VITAstor"
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_VERSION_V1 1
#define DIRECT_IO_ALIGNMENT 512
#define MEM_ALIGNMENT 4096
struct __attribute__((__packed__)) clean_disk_entry_v0_t
{
object_id oid;
uint64_t version;
uint8_t bitmap[];
};
struct __attribute__((__packed__)) blockstore_meta_header_v1_t
{
uint64_t zero;
uint64_t magic;
uint64_t version;
uint32_t meta_block_size;
uint32_t data_block_size;
uint32_t bitmap_granularity;
};
struct meta_dumper_t
{
char *meta_device;
uint32_t meta_block_size;
uint64_t meta_offset;
uint64_t meta_len;
uint64_t meta_pos;
int fd;
};
int main(int argc, char *argv[])
{
meta_dumper_t self = { 0 };
int b = 1;
if (argc < b+4)
{
printf("USAGE: %s <meta_file> <meta_block_size> <offset> <size>\n", argv[0]);
return 1;
}
self.meta_device = argv[b];
self.meta_block_size = strtoul(argv[b+1], NULL, 10);
self.meta_offset = strtoull(argv[b+2], NULL, 10);
self.meta_len = strtoull(argv[b+3], NULL, 10);
if (self.meta_block_size % DIRECT_IO_ALIGNMENT)
{
printf("Invalid metadata block size\n");
return 1;
}
self.fd = open(self.meta_device, O_DIRECT|O_RDONLY);
if (self.fd == -1)
{
printf("Failed to open metadata device\n");
return 1;
}
// Read all metadata into memory
void *data = memalign(MEM_ALIGNMENT, self.meta_len);
if (!data)
{
printf("Failed to allocate %lu MB of memory\n", self.meta_len/1024/1024);
close(self.fd);
return 1;
}
while (self.meta_pos < self.meta_len)
{
int r = pread(self.fd, data+self.meta_pos, self.meta_len-self.meta_pos, self.meta_offset+self.meta_pos);
assert(r > 0);
self.meta_pos += r;
}
close(self.fd);
// Check superblock
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
if (hdr->zero == 0 &&
hdr->magic == BLOCKSTORE_META_MAGIC_V1 &&
hdr->version == BLOCKSTORE_META_VERSION_V1)
{
// Vitastor 0.6-0.7 - static array of clean_disk_entry_v0_t with bitmaps
if (hdr->meta_block_size != self.meta_block_size)
{
printf("Using block size %u bytes based on information from the superblock\n", hdr->meta_block_size);
self.meta_block_size = hdr->meta_block_size;
}
uint64_t clean_entry_bitmap_size = hdr->data_block_size / hdr->bitmap_granularity / 8;
uint64_t clean_entry_size = sizeof(clean_disk_entry_v0_t) + 2*clean_entry_bitmap_size;
uint64_t block_num = 0;
printf(
"{\"version\":\"0.6\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,\"entries\":[\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity
);
bool first = true;
for (uint64_t meta_pos = self.meta_block_size; meta_pos < self.meta_len; meta_pos += self.meta_block_size)
{
for (uint64_t ioff = 0; ioff < self.meta_block_size-clean_entry_size; ioff += clean_entry_size, block_num++)
{
clean_disk_entry_v0_t *entry = (clean_disk_entry_v0_t*)(data + meta_pos + ioff);
if (entry->oid.inode)
{
printf(
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu,\"bitmap\":\""
(first ? (",\n" ENTRY_FMT) : ENTRY_FMT),
#undef ENTRY_FMT
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
entry->oid.stripe, entry->version
);
first = false;
for (uint64_t i = 0; i < clean_entry_bitmap_size; i++)
{
printf("%02x", entry->bitmap[i]);
}
printf("\",\"ext_bitmap\":\"");
for (uint64_t i = 0; i < clean_entry_bitmap_size; i++)
{
printf("%02x", entry->bitmap[clean_entry_bitmap_size + i]);
}
printf("\"}");
}
}
}
printf("]}\n");
}
else
{
// Vitastor 0.4-0.5 - static array of clean_disk_entry_v0_t
uint64_t clean_entry_size = sizeof(clean_disk_entry_v0_t);
uint64_t block_num = 0;
printf("{\"version\":\"0.5\",\"meta_block_size\":%u,\"entries\":[\n", self.meta_block_size);
bool first = true;
for (uint64_t meta_pos = 0; meta_pos < self.meta_len; meta_pos += self.meta_block_size)
{
for (uint64_t ioff = 0; ioff < self.meta_block_size-clean_entry_size; ioff += clean_entry_size, block_num++)
{
clean_disk_entry_v0_t *entry = (clean_disk_entry_v0_t*)(data + meta_pos + ioff);
if (entry->oid.inode)
{
printf(
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu}"
(first ? (",\n" ENTRY_FMT) : ENTRY_FMT),
#undef ENTRY_FMT
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
entry->oid.stripe, entry->version
);
first = false;
}
}
}
printf("]}\n");
}
free(data);
}
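For reference, a possible invocation of the new tool, following the usage string printed above (the device path, offset and size are illustrative placeholders; the dump is JSON on stdout):

```
vitastor-dump-meta /dev/sdb2 4096 0 268435456 > meta_dump.json
```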

View File

@@ -673,18 +673,18 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
pc.scheme = POOL_SCHEME_REPLICATED;
else if (pool_item.second["scheme"] == "xor")
pc.scheme = POOL_SCHEME_XOR;
else if (pool_item.second["scheme"] == "jerasure")
pc.scheme = POOL_SCHEME_JERASURE;
else if (pool_item.second["scheme"] == "ec" || pool_item.second["scheme"] == "jerasure")
pc.scheme = POOL_SCHEME_EC;
else
{
fprintf(stderr, "Pool %u has invalid coding scheme (one of \"xor\", \"replicated\" or \"jerasure\" required), skipping pool\n", pool_id);
fprintf(stderr, "Pool %u has invalid coding scheme (one of \"xor\", \"replicated\", \"ec\" or \"jerasure\" required), skipping pool\n", pool_id);
continue;
}
// PG Size
pc.pg_size = pool_item.second["pg_size"].uint64_value();
if (pc.pg_size < 1 ||
pool_item.second["pg_size"].uint64_value() < 3 &&
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) ||
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_EC) ||
pool_item.second["pg_size"].uint64_value() > 256)
{
fprintf(stderr, "Pool %u has invalid pg_size, skipping pool\n", pool_id);
@@ -701,7 +701,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
}
pc.parity_chunks = 1;
}
if (pc.scheme == POOL_SCHEME_JERASURE &&
if (pc.scheme == POOL_SCHEME_EC &&
(pc.parity_chunks < 1 || pc.parity_chunks > pc.pg_size-2))
{
fprintf(stderr, "Pool %u has invalid parity_chunks (must be between 1 and pg_size-2), skipping pool\n", pool_id);
@@ -710,7 +710,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
// PG MinSize
pc.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
if (pc.pg_minsize < 1 || pc.pg_minsize > pc.pg_size ||
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_JERASURE) &&
(pc.scheme == POOL_SCHEME_XOR || pc.scheme == POOL_SCHEME_EC) &&
pc.pg_minsize < (pc.pg_size-pc.parity_chunks))
{
fprintf(stderr, "Pool %u has invalid pg_minsize, skipping pool\n", pool_id);
@@ -845,27 +845,7 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
{
history_set.push_back(pg_osd.uint64_value());
}
pg_cfg.target_history.push_back((pg_history_set_t){ .osd_set = history_set });
}
// Newer format with epochs
for (auto hist_item: value["osd_set_epochs"].array_items())
{
pg_history_set_t history_set;
history_set.min_epoch = hist_item["min_epoch"].uint64_value();
history_set.max_epoch = hist_item["max_epoch"].uint64_value();
if (history_set.max_epoch < history_set.min_epoch)
{
history_set.max_epoch = 0;
history_set.min_epoch = 0;
}
for (auto pg_osd: hist_item["osd_set"].array_items())
{
history_set.osd_set.push_back(pg_osd.uint64_value());
}
if (history_set.max_epoch || history_set.osd_set.size())
{
pg_cfg.target_history.push_back(history_set);
}
pg_cfg.target_history.push_back(history_set);
}
// Include these additional OSDs when peering the PG
for (auto pg_osd: value["all_peers"].array_items())

View File

@@ -26,7 +26,7 @@ struct pg_config_t
bool exists;
osd_num_t primary;
std::vector<osd_num_t> target_set;
std::vector<pg_history_set_t> target_history;
std::vector<std::vector<osd_num_t>> target_history;
std::vector<osd_num_t> all_peers;
bool pause;
osd_num_t cur_primary;

View File

@@ -758,3 +758,21 @@ static std::string trim(const std::string & in)
int end = in.find_last_not_of(" \n\r\t");
return in.substr(begin, end+1-begin);
}
bool json_is_true(const json11::Json & val)
{
if (val.is_string())
return val == "true" || val == "yes" || val == "1";
return val.bool_value();
}
bool json_is_false(const json11::Json & val)
{
if (val.is_string())
return val.string_value() == "false" || val.string_value() == "no" || val.string_value() == "0";
if (val.is_number())
return val.number_value() == 0;
if (val.is_bool())
return !val.bool_value();
return false;
}

View File

@@ -52,3 +52,6 @@ void http_close(http_co_t *co);
// Utils
uint64_t stoull_full(const std::string & str, int base = 10);
std::string strtolower(const std::string & in);
// FIXME: move to json11
bool json_is_true(const json11::Json & val);
bool json_is_false(const json11::Json & val);

src/lrc/Makefile (new file)
View File

@@ -0,0 +1,2 @@
mat: mat.c
gcc -O3 -I/usr/include/jerasure -o mat mat.c -lJerasure

src/lrc/mat.c (new file)
View File

@@ -0,0 +1,291 @@
#include <jerasure/reed_sol.h>
#include <jerasure.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
// Generate LRC matrix: (groups*local + global) code rows with (data_drives) columns
// w should be >= log2(data_drives + groups*local + global), but not necessary 8/16/32
int* reed_sol_vandermonde_lrc_matrix(int data_drives, int groups, int local, int global, int w)
{
if (w < 0 || w > 32 || data_drives + groups*local + global > (1<<w))
{
return NULL;
}
int *lrc_matrix = (int*)malloc(sizeof(int) * (local*groups+global) * data_drives);
int *matrix = reed_sol_vandermonde_coding_matrix(data_drives, local+global, w);
// Enough to transform LRC 8+2+2 GF(8) matrix into MR-LRC
//for (int i = 0; i < local+global; i++)
//{
// int t = matrix[i*data_drives + 3];
// matrix[i*data_drives + 3] = matrix[i*data_drives + 7];
// matrix[i*data_drives + 7] = t;
//}
for (int gr = 0; gr < groups; gr++)
{
for (int l = 0; l < local; l++)
{
for (int j = 0; j < data_drives; j++)
{
lrc_matrix[(gr*local+l)*data_drives + j] = (j / (data_drives/groups)) == gr ? matrix[l*data_drives + j] : 0;
}
}
}
for (int i = 0; i < global; i++)
{
for (int j = 0; j < data_drives; j++)
{
lrc_matrix[(groups*local+i)*data_drives + j] = matrix[(local+i)*data_drives + j];
}
}
free(matrix);
return lrc_matrix;
}
struct lrc_test_result_t
{
int success, impossible, failures;
};
// Check if the generated LRC with given parameters is Maximally Reconstructible (MR-LRC)
// Example of a MR-LRC: (8, 2, 1, 2, 6, 8)
struct lrc_test_result_t check_mr_lrc(int *lrc_matrix, int data_drives, int groups, int local, int global, int w, int log_level)
{
int n = data_drives;
int total_rows = n + groups*local + global;
int impossible = 0, success = 0, failures = 0;
int *lost_per_group = (int*)malloc(sizeof(int) * groups);
int *recovered_per_group = (int*)malloc(sizeof(int) * groups);
int *selected_inverted = (int*)malloc(sizeof(int) * data_drives);
// global+1 is always recoverable
for (int lost = global+2; lost <= groups*local+global; lost++)
{
int *erased_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
int *inverted_matrix = (int*)malloc(sizeof(int) * (total_rows-lost)*n);
int *p = (int*)malloc(sizeof(int) * (total_rows-lost));
for (int i = 0; i < n; i++)
p[i] = i;
int *p2 = (int*)malloc(sizeof(int) * n);
if (total_rows-lost > n)
{
p[n-1] = n; // skip combinations with all N data disks (0..n-1)
for (int i = n; i < total_rows-lost; i++)
p[i] = i+1;
p[total_rows-lost-1]--; // will be incremented on the first step
}
int inc = total_rows-lost-1;
while (1)
{
p[inc]++;
if (p[inc] >= n+groups*local+global)
{
if (inc == 0)
break;
inc--;
}
else if (inc+1 < total_rows-lost)
{
p[inc+1] = p[inc];
inc++;
}
else
{
// Check if it should be recoverable
// Calculate count of data chunks lost in each group
int nsel = 0;
for (int gr = 0; gr < groups; gr++)
{
lost_per_group[gr] = ((gr+1)*(n/groups) > n ? (n - gr*(n/groups)) : n/groups);
recovered_per_group[gr] = 0;
}
for (int j = 0; j < total_rows-lost; j++)
{
if (p[j] < n)
{
lost_per_group[(p[j] / (n/groups))]--;
selected_inverted[nsel++] = j;
}
}
// Every local parity chunk is supposed to restore 1 missing chunk inside its group
// So, subtract local parity chunk counts from each group lost chunk count
for (int j = 0; j < total_rows-lost; j++)
{
if (p[j] >= n && p[j] < n+groups*local)
{
int gr = (p[j]-n)/local;
if (lost_per_group[gr] > recovered_per_group[gr] && nsel < n)
{
selected_inverted[nsel++] = j;
}
recovered_per_group[gr]++;
}
}
// Every global parity chunk is supposed to restore 1 chunk of all that are still missing
int still_missing = 0;
for (int gr = 0; gr < groups; gr++)
{
int non_fixed = lost_per_group[gr] - recovered_per_group[gr];
still_missing += (non_fixed > 0 ? non_fixed : 0);
}
for (int j = 0; j < total_rows-lost; j++)
{
if (p[j] >= n+groups*local)
{
if (still_missing > 0 && nsel < n)
{
selected_inverted[nsel++] = j;
}
still_missing--;
}
}
if (still_missing <= 0)
{
// We hope it can be recoverable. Try to invert it
assert(nsel == n);
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
erased_matrix[i*n+j] = lrc_matrix[p[selected_inverted[i]]*n+j];
}
}
int invert_ok = jerasure_invert_matrix(erased_matrix, inverted_matrix, n, w);
if (invert_ok < 0)
{
failures++;
if (log_level > 0)
{
printf("\nFAIL: ");
for (int i = 0; i < total_rows-lost; i++)
{
printf("%d ", p[i]);
}
printf("\nDIRECT:\n");
for (int i = 0; i < total_rows-lost; i++)
{
for (int j = 0; j < n; j++)
printf("%d ", lrc_matrix[p[i]*n+j]);
printf("\n");
}
printf("INVERSE:\n");
for (int i = 0; i < total_rows-lost; i++)
{
for (int j = 0; j < n; j++)
printf("%d ", inverted_matrix[i*n+j]);
printf("\n");
}
}
}
else
{
success++;
if (log_level > 2)
{
printf("OK: ");
for (int i = 0; i < total_rows-lost; i++)
{
printf("%d ", p[i]);
}
printf("\n");
}
}
}
else
{
impossible++;
if (log_level > 1)
{
printf("IMPOSSIBLE: ");
for (int i = 0; i < total_rows-lost; i++)
{
printf("%d ", p[i]);
}
printf("\n");
}
}
}
}
free(p2);
free(p);
free(inverted_matrix);
free(erased_matrix);
}
free(lost_per_group);
free(recovered_per_group);
return (struct lrc_test_result_t){
.success = success,
.impossible = impossible,
.failures = failures,
};
}
int main()
{
int W = 8, MATRIX_W = 8;
int n = 8, groups = 2, local = 1, global = 2;
//n = 4, groups = 2, local = 1, global = 1;
int total_rows = n+groups*local+global;
int *matrix = reed_sol_vandermonde_lrc_matrix(n, groups, local, global, MATRIX_W);
int *lrc_matrix = (int*)malloc(sizeof(int) * total_rows*n);
// Fill identity+LRC matrix
for (int i = 0; i < n; i++)
for (int j = 0; j < n; j++)
lrc_matrix[i*n + j] = j == i ? 1 : 0;
memcpy(lrc_matrix + n*n, matrix, (total_rows-n)*n*sizeof(int));
free(matrix);
matrix = NULL;
// Print LRC matrix
for (int i = 0; i < total_rows; i++)
{
for (int j = 0; j < n; j++)
{
printf("%d ", lrc_matrix[i*n+j]);
}
printf("\n");
}
struct lrc_test_result_t t = check_mr_lrc(lrc_matrix, n, groups, local, global, W, 1);
printf("\n%d recovered, %d impossible, %d failures\n", t.success, t.impossible, t.failures);
return 0;
}
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
//
// Can't recover
// 1 2 4 5 8 9 10 11 -1
// 2 3 4 6 8 9 10 11 -1
// FULL:
// 1 0 0 0 0 0 0 0
// 0 1 0 0 0 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 1 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 1 0 0
// 0 0 0 0 0 0 1 0
// 0 0 0 0 0 0 0 1
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// FIRST UNRECOVERABLE:
// 0 1 0 0 0 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 1 0 0
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// SECOND UNRECOVERABLE:
// 0 0 1 0 0 0 0 0
// 0 0 0 1 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 0 1 0
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// Ho ho ho
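The experimental generator can be built and run with the Makefile added alongside it (assuming the jerasure headers and library are installed):

```
cd src/lrc
make        # gcc -O3 -I/usr/include/jerasure -o mat mat.c -lJerasure
./mat
```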

View File

@@ -92,10 +92,7 @@ void nfs_proxy_t::run(json11::Json cfg)
if (bind_address == "")
bind_address = "0.0.0.0";
default_pool = cfg["pool"].as_string();
portmap_enabled = cfg.object_items().find("portmap") == cfg.object_items().end() ||
cfg["portmap"].uint64_value() ||
cfg["portmap"].string_value() == "yes" ||
cfg["portmap"].string_value() == "true";
portmap_enabled = !json_is_false(cfg["portmap"]);
nfs_port = cfg["port"].uint64_value() & 0xffff;
if (!nfs_port)
nfs_port = 2049;

View File

@@ -54,7 +54,7 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
autosync_writes = max_autosync;
}
if (this->config["osd_memlock"] == "true" || this->config["osd_memlock"] == "1" || this->config["osd_memlock"] == "yes")
if (json_is_true(this->config["osd_memlock"]))
{
// Lock all OSD memory if requested
if (mlockall(MCL_CURRENT|MCL_FUTURE
@@ -127,11 +127,11 @@ void osd_t::parse_config(const json11::Json & config)
etcd_report_interval = config["etcd_report_interval"].uint64_value();
if (etcd_report_interval <= 0)
etcd_report_interval = 5;
readonly = config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes";
run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes";
no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes";
allow_test_ops = config["allow_test_ops"] == "true" || config["allow_test_ops"] == "1" || config["allow_test_ops"] == "yes";
readonly = json_is_true(config["readonly"]);
run_primary = !json_is_false(config["run_primary"]);
no_rebalance = json_is_true(config["no_rebalance"]);
no_recovery = json_is_true(config["no_recovery"]);
allow_test_ops = json_is_true(config["allow_test_ops"]);
if (config["immediate_commit"] == "all")
immediate_commit = IMMEDIATE_ALL;
else if (config["immediate_commit"] == "small")
@@ -168,6 +168,9 @@ void osd_t::parse_config(const json11::Json & config)
slow_log_interval = config["slow_log_interval"].uint64_value();
if (!slow_log_interval)
slow_log_interval = 10;
inode_vanish_time = config["inode_vanish_time"].uint64_value();
if (!inode_vanish_time)
inode_vanish_time = 60;
}
void osd_t::bind_socket()

View File

@@ -113,6 +113,7 @@ class osd_t
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
int inode_vanish_time = 60;
int log_level = 0;
// cluster state
@@ -165,6 +166,7 @@ class osd_t
// op statistics
osd_op_stats_t prev_stats;
std::map<uint64_t, inode_stats_t> inode_stats;
std::map<uint64_t, timespec> vanishing_inodes;
const char* recovery_stat_names[2] = { "degraded", "misplaced" };
uint64_t recovery_stat_count[2][2] = {};
uint64_t recovery_stat_bytes[2][2] = {};

View File

@@ -186,7 +186,8 @@ void osd_t::report_statistics()
json11::Json::object inode_space;
json11::Json::object last_stat;
pool_id_t last_pool = 0;
for (auto kv: bs->get_inode_space_stats())
auto & bs_inode_space = bs->get_inode_space_stats();
for (auto kv: bs_inode_space)
{
pool_id_t pool_id = INODE_POOL(kv.first);
uint64_t only_inode_num = INODE_NO_POOL(kv.first);
@@ -204,8 +205,26 @@ void osd_t::report_statistics()
last_stat = json11::Json::object();
last_pool = 0;
json11::Json::object inode_ops;
for (auto kv: inode_stats)
timespec tv_now = {};
for (auto st_it = inode_stats.begin(); st_it != inode_stats.end(); )
{
auto & kv = *st_it;
if (!bs_inode_space[kv.first])
{
// Is it an empty inode?
if (!tv_now.tv_sec)
clock_gettime(CLOCK_REALTIME, &tv_now);
auto & tv_van = vanishing_inodes[kv.first];
if (!tv_van.tv_sec)
tv_van = tv_now;
else if (tv_van.tv_sec < tv_now.tv_sec-inode_vanish_time)
{
// Inode vanished <inode_vanish_time> seconds ago, remove it from stats
vanishing_inodes.erase(kv.first);
inode_stats.erase(st_it++);
continue;
}
}
pool_id_t pool_id = INODE_POOL(kv.first);
uint64_t only_inode_num = (kv.first & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1));
if (!last_pool || pool_id != last_pool)
@@ -232,6 +251,7 @@ void osd_t::report_statistics()
{ "bytes", kv.second.op_bytes[INODE_STATS_DELETE] },
} },
};
st_it++;
}
if (last_pool)
inode_ops[std::to_string(last_pool)] = last_stat;
@@ -333,10 +353,7 @@ void osd_t::on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num)
{
oid = op.first;
first = false;
if (op.second->req.hdr.opcode == OSD_OP_DELETE)
continue_primary_del(op.second);
else
continue_primary_write(op.second);
continue_primary_write(op.second);
}
}
}
@@ -373,7 +390,11 @@ void osd_t::acquire_lease()
etcd_lease_id = data["ID"].string_value();
create_osd_state();
});
printf("[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num, config["etcd_address"].string_value().c_str(), etcd_report_interval);
printf(
"[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num,
(config["etcd_address"].is_string() ? config["etcd_address"].string_value() : config["etcd_address"].dump()).c_str(),
etcd_report_interval
);
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
{
renew_lease();
@@ -611,7 +632,7 @@ void osd_t::apply_pg_config()
}
for (auto & hist_item: pg_cfg.target_history)
{
for (auto pg_osd: hist_item.osd_set)
for (auto pg_osd: hist_item)
{
if (pg_osd != 0)
{
@@ -679,9 +700,9 @@ void osd_t::apply_pg_config()
.all_peers = std::vector<osd_num_t>(all_peers.begin(), all_peers.end()),
.target_set = pg_cfg.target_set,
};
if (pg.scheme == POOL_SCHEME_JERASURE)
if (pg.scheme == POOL_SCHEME_EC)
{
use_jerasure(pg.pg_size, pg.pg_data_size, true);
use_ec(pg.pg_size, pg.pg_data_size, true);
}
this->pg_state_dirty.insert({ .pool_id = pool_id, .pg_num = pg_num });
pg.print_state();
@@ -802,40 +823,11 @@ void osd_t::report_pg_states()
// Prevent race conditions (for the case when the monitor is updating this key at the same time)
pg.history_changed = false;
std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
json11::Json::array target_history;
for (auto & pgh: pg.target_history)
{
target_history.push_back(json11::Json::object {
{ "osd_set", pgh.osd_set },
{ "min_epoch", pgh.min_epoch },
{ "max_epoch", pgh.max_epoch },
});
}
std::vector<osd_num_t> all_peers;
for (auto peer_osd: pg.all_peers)
{
bool found = false;
for (auto target_peer: pg.target_set)
{
if (target_peer == peer_osd)
{
found = true;
break;
}
}
if (!found)
{
all_peers.push_back(peer_osd);
}
}
json11::Json::object history_value = {
{ "epoch", pg.epoch },
{ "osd_set_epochs", target_history },
{ "all_peers", pg.all_peers },
{ "osd_sets", pg.target_history },
};
if (all_peers.size())
{
history_value["all_peers"] = all_peers;
}
checks.push_back(json11::Json::object {
{ "target", "MOD" },
{ "key", history_key },
@@ -922,9 +914,9 @@ void osd_t::report_pg_states()
{
// Forget offline PGs after reporting their state
// (if the state wasn't changed again)
if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
if (pg_it->second.scheme == POOL_SCHEME_EC)
{
use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false);
use_ec(pg_it->second.pg_size, pg_it->second.pg_data_size, false);
}
this->pgs.erase(pg_it);
}
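For reference, the simplified PG history value written by report_pg_states() above now only carries the epoch, the full peer list and the list of previous target OSD sets. A hypothetical value, built with json11 the same way the hunk does (the keys come from the hunk, the numbers are made up, and the json11 include path is an assumption):

#include <stdio.h>
#include "json11/json11.hpp"

int main()
{
    json11::Json::object history_value = {
        { "epoch", 5 },
        { "all_peers", json11::Json::array{ 1, 2, 3, 4 } },
        { "osd_sets", json11::Json::array{
            json11::Json::array{ 1, 2, 3 },
            json11::Json::array{ 1, 4, 3 },
        } },
    };
    // Prints {"all_peers": [1, 2, 3, 4], "epoch": 5, "osd_sets": [[1, 2, 3], [1, 4, 3]]}
    printf("%s\n", json11::Json(history_value).dump().c_str());
    return 0;
}

Compared to the removed code there are no per-set min_epoch/max_epoch fields any more, and all_peers is written as-is instead of being filtered against the target set first.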

View File

@@ -268,25 +268,6 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{
// Check if the object is deleted
bool is_deleted = false;
pool_id_t pool_id = INODE_POOL(op->oid.inode);
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
if (pool_cfg_it != st_cli.pool_config.end())
{
pg_num_t pg_num = (op->oid.stripe/pool_cfg_it->second.pg_stripe_size) % pg_counts[pool_id] + 1; // like map_to_pg()
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
if (pg_it != pgs.end())
{
pg_osd_set_state_t *object_state;
get_object_osd_set(pg_it->second, op->oid, pg_it->second.cur_set.data(), &object_state);
if (object_state && (object_state->state & OBJ_DELETED))
{
// Object is deleted, but not from all OSDs - delete remaining copies
is_deleted = true;
}
}
}
op->osd_op = new osd_op_t();
op->osd_op->op_type = OSD_OP_OUT;
op->osd_op->req = (osd_any_op_t){
@@ -294,7 +275,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = 1,
.opcode = (uint64_t)(is_deleted ? OSD_OP_DELETE : OSD_OP_WRITE),
.opcode = OSD_OP_WRITE,
},
.inode = op->oid.inode,
.offset = op->oid.stripe,

View File

@@ -3,11 +3,9 @@
#pragma once
#include <vector>
#define POOL_SCHEME_REPLICATED 1
#define POOL_SCHEME_XOR 2
#define POOL_SCHEME_JERASURE 3
#define POOL_SCHEME_EC 3
#define POOL_ID_MAX 0x10000
#define POOL_ID_BITS 16
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
@@ -30,9 +28,3 @@ inline bool operator < (const pool_pg_num_t & a, const pool_pg_num_t & b)
{
return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
}
struct pg_history_set_t
{
std::vector<osd_num_t> osd_set;
uint64_t min_epoch, max_epoch;
};
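Since INODE_POOL() and POOL_ID_BITS appear throughout this diff, a short worked example of the inode ID layout may help: the top 16 bits of the 64-bit inode ID hold the pool ID, the remaining 48 bits hold the inode number. INODE_NO_POOL() is spelled out below the same way the report_statistics() hunk expands it, and pool_id_t is assumed to be a plain 32-bit unsigned integer:

#include <stdint.h>
#include <assert.h>

typedef uint32_t pool_id_t;
#define POOL_ID_BITS 16
#define INODE_POOL(inode) (pool_id_t)((inode) >> (64 - POOL_ID_BITS))
#define INODE_NO_POOL(inode) ((inode) & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1))

int main()
{
    // Pool 1, inode 120 - the same image that tests/test_create_nomaxid.sh creates
    uint64_t inode_id = ((uint64_t)1 << (64-POOL_ID_BITS)) | 120;
    assert(INODE_POOL(inode_id) == 1);
    assert(INODE_NO_POOL(inode_id) == 120);
    return 0;
}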

View File

@@ -30,13 +30,19 @@
#define OSD_OP_PING 15
#define OSD_OP_SEC_READ_BMP 16
#define OSD_OP_MAX 16
// Alignment & limit for read/write operations
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 512
#endif
#define OSD_RW_MAX 64*1024*1024
#define OSD_PROTOCOL_VERSION 1
// Memory alignment for direct I/O (usually 512 bytes)
#ifndef DIRECT_IO_ALIGNMENT
#define DIRECT_IO_ALIGNMENT 512
#endif
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif
// common request and reply headers
struct __attribute__((__packed__)) osd_op_header_t
{

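The two constants above separate two different requirements: buffers passed to direct (O_DIRECT) reads and writes only need to be aligned to the logical sector size, 512 bytes by default, while allocating them on page (4096 byte) boundaries is simply the better default, as the comment says. A hypothetical helper showing the distinction (not Vitastor code; posix_memalign() is used for brevity):

#include <stdint.h>
#include <stdlib.h>
#include <new>

#define DIRECT_IO_ALIGNMENT 512
#define MEM_ALIGNMENT 4096

// Allocate a buffer usable for O_DIRECT I/O of <len> bytes:
// round the length up to the sector size, align the address to the page size
static void* alloc_io_buffer(uint64_t len)
{
    len = (len + DIRECT_IO_ALIGNMENT - 1) / DIRECT_IO_ALIGNMENT * DIRECT_IO_ALIGNMENT;
    void *buf = NULL;
    if (posix_memalign(&buf, MEM_ALIGNMENT, len) != 0)
        throw std::bad_alloc();
    return buf;
}

int main()
{
    void *buf = alloc_io_buffer(1000); // length rounded up to 1024, address page-aligned
    free(buf);
    return 0;
}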
View File

@@ -228,7 +228,7 @@ void osd_t::start_pg_peering(pg_t & pg)
for (auto & history_set: pg.target_history)
{
bool found = true;
for (auto history_osd: history_set.osd_set)
for (auto history_osd: history_set)
{
if (history_osd != 0)
{
@@ -539,114 +539,47 @@ void osd_t::finish_stop_pg(pg_t & pg)
report_pg_state(pg);
}
static int count_nonzero_osds(const std::vector<osd_num_t> & v)
{
int n = 0;
for (auto & osd_num: v)
{
if (osd_num != 0)
{
n++;
}
}
return n;
}
void osd_t::report_pg_state(pg_t & pg)
{
pg.print_state();
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if ((pg.state == PG_ACTIVE || pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD)) &&
(pg.target_history.size() != 1 ||
pg.target_history[0].osd_set != pg.target_set ||
pg.target_history[0].min_epoch != 0 ||
pg.target_history[0].max_epoch != pg.epoch ||
pg.all_peers.size() > count_nonzero_osds(pg.target_set)))
if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size()))
{
// Clear history of active+clean PGs
pg.history_changed = true;
pg.target_history.clear();
pg.target_history.push_back((pg_history_set_t){
.osd_set = pg.cur_set,
.min_epoch = 0,
.max_epoch = pg.epoch,
});
if (pg.state == PG_ACTIVE)
pg.all_peers = pg.target_set;
pg.cur_peers = pg.target_set;
}
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
{
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
pg.history_changed = true;
pg.target_history.clear();
std::set<osd_num_t> dead_peers;
for (auto pg_osd: pg.all_peers)
{
pg.all_peers.clear();
for (auto pg_osd: pg.target_set)
dead_peers.insert(pg_osd);
}
for (auto pg_osd: pg.cur_peers)
{
dead_peers.erase(pg_osd);
}
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
{
if (pg_osd)
pg.all_peers.push_back(pg_osd);
dead_peers.insert(pg_osd);
}
}
else
{
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
std::set<osd_num_t> dead_peers(pg.all_peers.begin(), pg.all_peers.end());
for (auto pg_osd: pg.cur_peers)
{
dead_peers.erase(pg_osd);
}
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
dead_peers.insert(pg_osd);
}
pg.all_peers.clear();
pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
}
pg.all_peers.clear();
pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
pg.cur_peers.clear();
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
{
pg.cur_peers.push_back(pg_osd);
}
}
if (pg.history_changed)
{
bool epoch_already_reported = false;
int max_epoch_pos = -1;
for (int i = pg.target_history.size()-1; i >= 0; i--)
{
if (pg.target_history[i].min_epoch > pg.epoch)
{
printf("[PG %u/%u] Invalid PG history: there is an entry with min_epoch (%lu) > current epoch (%lu)\n",
pg.pool_id, pg.pg_num, pg.target_history[i].min_epoch, pg.epoch);
force_stop(1);
return;
}
if (max_epoch_pos < 0 || pg.target_history[i].max_epoch > pg.target_history[max_epoch_pos].max_epoch)
{
max_epoch_pos = i;
}
if (pg.target_history[i].min_epoch <= pg.epoch &&
pg.target_history[i].max_epoch >= pg.epoch)
{
if (pg.target_history[i].osd_set != pg.cur_set)
{
printf("[PG %u/%u] Invalid target_history: epoch %lu has another OSD set already registered\n", pg.pool_id, pg.pg_num, pg.epoch);
force_stop(1);
return;
}
// Already reported
epoch_already_reported = true;
break;
}
}
if (!epoch_already_reported)
{
if (max_epoch_pos >= 0 && pg.target_history[max_epoch_pos].osd_set == pg.cur_set)
{
pg.target_history[max_epoch_pos].max_epoch = pg.epoch;
}
else
{
pg.target_history.push_back((pg_history_set_t){
.osd_set = pg.cur_set,
.min_epoch = pg.epoch,
.max_epoch = pg.epoch,
});
}
}
}

View File

@@ -52,7 +52,6 @@ struct pg_obj_state_check_t
void walk();
void start_object();
void recheck_version_osd_set();
void handle_version();
void finish_object();
};
@@ -85,19 +84,27 @@ void pg_obj_state_check_t::walk()
pg->state = PG_INCOMPLETE | PG_HAS_INVALID;
return;
}
// Activate PG
if (pg->pg_cursize < pg->pg_size)
{
// History will be reported on first write
// Report PG history and activate
pg->state |= PG_DEGRADED | PG_PEERED;
std::vector<osd_num_t> history_set;
for (auto peer_osd: pg->cur_set)
{
if (peer_osd != 0)
{
history_set.push_back(peer_osd);
}
}
pg->target_history.push_back(history_set);
pg->history_changed = true;
}
else
{
// Just activate
pg->state |= PG_ACTIVE;
// Clear history
pg->history_changed = true;
}
if (pg->cur_peers.size() < pg->all_peers.size())
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
{
pg->state |= PG_LEFT_ON_DEAD;
}
@@ -114,82 +121,13 @@ void pg_obj_state_check_t::start_object()
n_unstable = n_invalid = 0;
}
void pg_obj_state_check_t::recheck_version_osd_set()
{
uint64_t epoch = (last_ver >> (64-PG_EPOCH_BITS));
if (!pg->epoch_sizes_differ && n_copies >= pg->pg_size)
{
// Enough copies
return;
}
auto epoch_it = pg->target_by_epoch.lower_bound(epoch);
if (epoch_it == pg->target_by_epoch.end() || epoch_it->second.min_epoch > epoch)
{
// Epoch info not found
return;
}
if (pg->epoch_sizes_differ && n_copies >= epoch_it->second.osd_set.size())
{
// For the (unlikely) case of PG size change - enough copies
return;
}
// Recheck version against the OSD set corresponding to epoch if it's known
if (epoch_it != pg->target_by_epoch.end() && epoch_it->second.min_epoch <= epoch)
{
for (int j = 0; j < epoch_it->second.osd_set.size(); j++)
{
osd_num_t cur_osd = epoch_it->second.osd_set[j];
bool found = false;
for (int i = ver_start; i < ver_end; i++)
{
if (cur_osd == list[i].osd_num)
{
found = true;
break;
}
}
if (!found)
{
// Check if a newer version is present on the same OSD and masks the older one
// It happens for overwritten replicas in the following case:
// Version 1 is present on OSD 1,2,3
// Client tries to write Version 2
// OSD 3 succeeds to write Version 2, others don't. OSD 3 crashes, then starts again
// OSD 1 sees: version 1 on OSD 1,2 and version 2 on OSD 3
// (version 1 on OSD 3 is already masked/removed)
// Version 1 is not present on a full set, but it must not be removed
if (replicated)
{
for (int i = obj_start; i < ver_start; i++)
{
if (cur_osd == list[i].osd_num)
{
found = true;
break;
}
}
}
if (!found)
{
// Object is missing from one of the OSDs of that set.
// This means it's deleted or moved and we can safely drop this version.
target_ver = 0;
break;
}
}
}
}
}
void pg_obj_state_check_t::handle_version()
{
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size))
{
// Version is either stable or recoverable
ver_end = list_pos;
target_ver = last_ver;
// Skip versions that are not present on any of OSDs for the corresponding PG epoch
recheck_version_osd_set();
ver_end = list_pos;
}
if (!target_ver)
{
@@ -253,8 +191,6 @@ void pg_obj_state_check_t::finish_object()
// Version is either stable or recoverable
target_ver = last_ver;
ver_end = list_pos;
// Skip versions that are not present on any of OSDs for the corresponding PG epoch
recheck_version_osd_set();
}
obj_end = list_pos;
// Remember the decision
@@ -308,23 +244,11 @@ void pg_obj_state_check_t::finish_object()
}
}
}
if (!target_ver && (n_unstable >= obj_end-obj_start))
if (!target_ver)
{
return;
}
if (!target_ver)
{
// Object is present, but should not be :) i.e. it's a deleted object that reappeared
if (log_level > 1)
{
printf("Object is deleted: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
}
state = OBJ_DELETED;
pg->state = pg->state | PG_HAS_MISPLACED;
// To record all versions as outdated:
ver_end = obj_start;
}
else if (!replicated && n_roles < pg->pg_data_size)
if (!replicated && n_roles < pg->pg_data_size)
{
if (log_level > 1)
{
@@ -352,7 +276,7 @@ void pg_obj_state_check_t::finish_object()
pg->state = pg->state | PG_HAS_MISPLACED;
}
if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) ||
log_level > 2 && (state & (OBJ_MISPLACED | OBJ_DELETED)))
log_level > 2 && (state & OBJ_MISPLACED))
{
for (int i = obj_start; i < obj_end; i++)
{
@@ -361,9 +285,9 @@ void pg_obj_state_check_t::finish_object()
}
}
pg->total_count++;
osd_set.clear();
if (target_ver != 0 && (state != 0 || ver_end < obj_end))
if (state != 0 || ver_end < obj_end)
{
osd_set.clear();
for (int i = ver_start; i < ver_end; i++)
{
osd_set.push_back((pg_obj_loc_t){
@@ -386,8 +310,7 @@ void pg_obj_state_check_t::finish_object()
break;
}
}
if (j >= osd_set.size() && ((state & OBJ_DELETED) ||
pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num))
if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)
{
osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK),
@@ -402,11 +325,7 @@ void pg_obj_state_check_t::finish_object()
}
}
}
if (state & OBJ_DELETED)
{
pg->ver_override[oid] = max_ver;
}
else if (target_ver < max_ver)
if (target_ver < max_ver)
{
pg->ver_override[oid] = target_ver;
}
@@ -460,7 +379,6 @@ void pg_obj_state_check_t::finish_object()
}
else
{
assert(it->second.state == state);
it->second.object_count++;
}
if (state & OBJ_INCOMPLETE)
@@ -481,34 +399,6 @@ void pg_obj_state_check_t::finish_object()
// FIXME: Write at least some tests for this function
void pg_t::calc_object_states(int log_level)
{
// Calculate intersections of target_history with cur_peers
for (auto & history_item: target_history)
{
if (history_item.max_epoch)
{
pg_history_set_t & set_copy = target_by_epoch[history_item.max_epoch];
set_copy.min_epoch = history_item.min_epoch;
set_copy.max_epoch = history_item.max_epoch;
for (int i = 0; i < history_item.osd_set.size(); i++)
{
if (history_item.osd_set[i] != 0)
{
for (int j = 0; j < cur_set.size(); j++)
{
if (cur_set[j] == history_item.osd_set[i])
{
set_copy.osd_set.push_back(history_item.osd_set[i]);
break;
}
}
}
}
if (set_copy.osd_set.size() != pg_size)
{
epoch_sizes_differ = true;
}
}
}
// Copy all object lists into one array
pg_obj_state_check_t st;
st.log_level = log_level;
@@ -545,18 +435,10 @@ void pg_t::calc_object_states(int log_level)
std::sort(st.list.begin(), st.list.end());
// Walk over it and check object states
st.walk();
target_by_epoch.clear();
if (this->state != PG_ACTIVE)
if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD))
{
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
epoch++;
for (auto & pgh: target_history)
{
if (epoch <= pgh.max_epoch)
{
epoch = pgh.max_epoch+1;
}
}
}
}

View File

@@ -89,9 +89,7 @@ struct pg_t
// epoch number - should increase with each non-clean activation of the PG
uint64_t epoch = 0, reported_epoch = 0;
// target history and all potential peers
std::vector<pg_history_set_t> target_history;
std::map<uint64_t, pg_history_set_t> target_by_epoch;
bool epoch_sizes_differ = false;
std::vector<std::vector<osd_num_t>> target_history;
std::vector<osd_num_t> all_peers;
bool history_changed = false;
// peer list from the last peering event

View File

@@ -199,21 +199,6 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
{
// PG may be degraded or have misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED))
{
// Object is deleted, just return zeroes
cur_op->reply.rw.version = 0;
cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size;
uint64_t zero_len = cur_op->reply.rw.bitmap_len + cur_op->req.rw.len;
while (zero_len >= 0)
{
uint64_t cur_zero_len = zero_buffer_size > zero_len ? zero_len : zero_buffer_size;
cur_op->iov.push_back(zero_buffer, cur_zero_len);
zero_len -= cur_zero_len;
}
finish_op(cur_op, cur_op->req.rw.len);
return;
}
}
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{
@@ -256,9 +241,9 @@ resume_2:
{
reconstruct_stripes_xor(stripes, op_data->pg_size, clean_entry_bitmap_size);
}
else if (op_data->scheme == POOL_SCHEME_JERASURE)
else if (op_data->scheme == POOL_SCHEME_EC)
{
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
reconstruct_stripes_ec(stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
}
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
for (int role = 0; role < op_data->pg_size; role++)
@@ -305,7 +290,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
report_pg_state(pg);
}
}
else if (object_state->state & (OBJ_MISPLACED | OBJ_DELETED))
else if (object_state->state & OBJ_MISPLACED)
{
this->misplaced_objects--;
pg.misplaced_objects.erase(oid);
@@ -344,6 +329,12 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5;
assert(op_data->st == 0);
// Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
{
finish_op(cur_op, -EBUSY);
return;
}
if (!check_write_queue(cur_op, pg))
{
return;
@@ -351,18 +342,11 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
resume_1:
// Determine which OSDs contain this object and delete it
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED))
{
op_data->fact_ver = pg.ver_override[op_data->oid];
}
else
{
// Submit 1 read to determine the actual version number
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
// Submit 1 read to determine the actual version number
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
resume_2:
op_data->st = 2;
return;
}
op_data->st = 2;
return;
resume_3:
if (op_data->errors > 0)
{

View File

@@ -110,9 +110,9 @@ resume_1:
{
reconstruct_stripes_xor(local_stripes, pg.pg_size, clean_entry_bitmap_size);
}
else if (pg.scheme == POOL_SCHEME_JERASURE)
else if (pg.scheme == POOL_SCHEME_EC)
{
reconstruct_stripes_jerasure(local_stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size);
reconstruct_stripes_ec(local_stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size);
}
break;
}
@@ -133,12 +133,6 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
pg_osd_set_state_t *object_state;
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
if (object_state && (object_state->state & OBJ_DELETED))
{
// Object is deleted, zero out the bitmap
memset((uint8_t*)op_data->snapshot_bitmaps + chain_num*clean_entry_bitmap_size, 0, clean_entry_bitmap_size);
continue;
}
if (pg.scheme == POOL_SCHEME_REPLICATED)
{
osd_num_t read_target = 0;
@@ -512,9 +506,9 @@ void osd_t::send_chained_read_results(pg_t & pg, osd_op_t *cur_op)
{
reconstruct_stripes_xor(stripes, pg.pg_size, clean_entry_bitmap_size);
}
else if (op_data->scheme == POOL_SCHEME_JERASURE)
else if (op_data->scheme == POOL_SCHEME_EC)
{
reconstruct_stripes_jerasure(stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size);
reconstruct_stripes_ec(stripes, pg.pg_size, pg.pg_data_size, clean_entry_bitmap_size);
}
}
}

View File

@@ -116,19 +116,17 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0))
n_subops++;
}
if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep) && zero_read >= 0)
if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep))
n_subops = 1;
else
zero_read = -1;
osd_op_t *subops = new osd_op_t[n_subops];
op_data->fact_ver = 0;
op_data->done = op_data->errors = 0;
op_data->n_subops = n_subops;
if (n_subops > 0)
{
op_data->subops = new osd_op_t[n_subops];
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
assert(sent == n_subops);
}
op_data->subops = subops;
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
assert(sent == n_subops);
}
int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,

View File

@@ -132,9 +132,9 @@ resume_3:
{
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
}
else if (pg.scheme == POOL_SCHEME_JERASURE)
else if (pg.scheme == POOL_SCHEME_EC)
{
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
calc_rmw_parity_ec(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
}
}
// Send writes
@@ -154,9 +154,10 @@ resume_3:
if (pg.epoch > pg.reported_epoch)
{
// Report newer epoch before writing
// FIXME: We don't have to report all changed PG states here
// FIXME: We may report only one PG state here...
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
pg.history_changed = true;
report_pg_state(pg);
report_pg_states();
resume_10:
if (pg.epoch > pg.reported_epoch)
{

View File

@@ -4,8 +4,13 @@
#include <stdexcept>
#include <string.h>
#include <assert.h>
#include <jerasure/reed_sol.h>
extern "C" {
#include <reed_sol.h>
#include <jerasure.h>
#ifdef WITH_ISAL
#include <isa-l/erasure_code.h>
#endif
}
#include <map>
#include "allocator.h"
#include "xor.h"
@@ -147,13 +152,14 @@ inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
struct reed_sol_matrix_t
{
int refs = 0;
int *data;
std::map<reed_sol_erased_t, int*> decodings;
int *je_data;
uint8_t *isal_data;
std::map<reed_sol_erased_t, void*> decodings;
};
std::map<uint64_t, reed_sol_matrix_t> matrices;
static std::map<uint64_t, reed_sol_matrix_t> matrices;
void use_jerasure(int pg_size, int pg_minsize, bool use)
void use_ec(int pg_size, int pg_minsize, bool use)
{
uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32;
auto rs_it = matrices.find(key);
@@ -164,19 +170,33 @@ void use_jerasure(int pg_size, int pg_minsize, bool use)
return;
}
int *matrix = reed_sol_vandermonde_coding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W);
uint8_t *isal_table = NULL;
#ifdef WITH_ISAL
uint8_t *isal_matrix = (uint8_t*)malloc_or_die(pg_minsize*(pg_size-pg_minsize));
for (int i = 0; i < pg_minsize*(pg_size-pg_minsize); i++)
{
isal_matrix[i] = matrix[i];
}
isal_table = (uint8_t*)malloc_or_die(pg_minsize*(pg_size-pg_minsize)*32);
ec_init_tables(pg_minsize, pg_size-pg_minsize, isal_matrix, isal_table);
free(isal_matrix);
#endif
matrices[key] = (reed_sol_matrix_t){
.refs = 0,
.data = matrix,
.je_data = matrix,
.isal_data = isal_table,
};
rs_it = matrices.find(key);
}
rs_it->second.refs += (!use ? -1 : 1);
if (rs_it->second.refs <= 0)
{
free(rs_it->second.data);
free(rs_it->second.je_data);
if (rs_it->second.isal_data)
free(rs_it->second.isal_data);
for (auto dec_it = rs_it->second.decodings.begin(); dec_it != rs_it->second.decodings.end();)
{
int *data = dec_it->second;
void *data = dec_it->second;
rs_it->second.decodings.erase(dec_it++);
free(data);
}
@@ -184,7 +204,7 @@ void use_jerasure(int pg_size, int pg_minsize, bool use)
}
}
reed_sol_matrix_t* get_jerasure_matrix(int pg_size, int pg_minsize)
static reed_sol_matrix_t* get_ec_matrix(int pg_size, int pg_minsize)
{
uint64_t key = (uint64_t)pg_size | ((uint64_t)pg_minsize) << 32;
auto rs_it = matrices.find(key);
@@ -199,7 +219,7 @@ reed_sol_matrix_t* get_jerasure_matrix(int pg_size, int pg_minsize)
// we don't need it. also it makes an extra allocation of int *erased on every call and doesn't cache
// the decoding matrix.
// all these flaws are fixed in this function:
int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize)
static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize)
{
int edd = 0;
int erased[pg_size];
@@ -210,16 +230,57 @@ int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg
edd++;
if (edd == 0)
return NULL;
reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize);
reed_sol_matrix_t *matrix = get_ec_matrix(pg_size, pg_minsize);
auto dec_it = matrix->decodings.find((reed_sol_erased_t){ .data = erased, .size = pg_size });
if (dec_it == matrix->decodings.end())
{
#ifdef WITH_ISAL
int smrow = 0;
uint8_t *submatrix = (uint8_t*)malloc_or_die(pg_minsize*pg_minsize*2);
for (int i = 0; i < pg_size; i++)
{
if (!erased[i])
{
if (i < pg_minsize)
{
for (int j = 0; j < pg_minsize; j++)
submatrix[smrow*pg_minsize + j] = j == i;
}
else
{
for (int j = 0; j < pg_minsize; j++)
submatrix[smrow*pg_minsize + j] = (uint8_t)matrix->je_data[(i-pg_minsize)*pg_minsize + j];
}
smrow++;
}
}
if (smrow < pg_minsize)
{
free(submatrix);
throw std::runtime_error("failed to make an invertible submatrix");
}
gf_invert_matrix(submatrix, submatrix + pg_minsize*pg_minsize, pg_minsize);
smrow = 0;
for (int i = 0; i < pg_minsize; i++)
{
if (erased[i])
{
memcpy(submatrix + pg_minsize*smrow, submatrix + (pg_minsize+i)*pg_minsize, pg_minsize);
smrow++;
}
}
uint8_t *rectable = (uint8_t*)malloc_or_die(32*smrow*pg_minsize + pg_size*sizeof(int));
ec_init_tables(pg_minsize, smrow, submatrix, rectable);
free(submatrix);
int *erased_copy = (int*)(rectable + 32*smrow*pg_minsize);
memcpy(erased_copy, erased, pg_size*sizeof(int));
matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, rectable);
return rectable;
#else
int *dm_ids = (int*)malloc_or_die(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size));
int *decoding_matrix = dm_ids + pg_minsize;
if (!dm_ids)
throw std::bad_alloc();
// we always use row_k_ones=1 and w=8 (OSD_JERASURE_W)
if (jerasure_make_decoding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data, erased, decoding_matrix, dm_ids) < 0)
if (jerasure_make_decoding_matrix(pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->je_data, erased, decoding_matrix, dm_ids) < 0)
{
free(dm_ids);
throw std::runtime_error("jerasure_make_decoding_matrix() failed");
@@ -228,13 +289,64 @@ int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg
memcpy(erased_copy, erased, pg_size*sizeof(int));
matrix->decodings.emplace((reed_sol_erased_t){ .data = erased_copy, .size = pg_size }, dm_ids);
return dm_ids;
#endif
}
return dec_it->second;
}
void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
#ifdef WITH_ISAL
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
{
int *dm_ids = get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
uint8_t *dectable = (uint8_t*)get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
if (!dectable)
{
return;
}
uint8_t *data_ptrs[pg_size];
int wanted_base = 0, wanted = 0;
uint64_t read_start = 0, read_end = 0;
auto recover_seq = [&]()
{
int orig = 0;
for (int other = 0; other < pg_size; other++)
{
if (stripes[other].read_end != 0 && !stripes[other].missing)
{
assert(stripes[other].read_start <= read_start);
assert(stripes[other].read_end >= read_end);
data_ptrs[orig++] = (uint8_t*)stripes[other].read_buf + (read_start - stripes[other].read_start);
}
}
ec_encode_data(
read_end-read_start, pg_minsize, wanted, dectable + wanted_base*32*pg_minsize,
data_ptrs, data_ptrs + pg_minsize
);
wanted_base += wanted;
wanted = 0;
};
for (int role = 0; role < pg_minsize; role++)
{
if (stripes[role].read_end != 0 && stripes[role].missing)
{
if (read_end && (stripes[role].read_start != read_start ||
stripes[role].read_end != read_end))
{
recover_seq();
}
read_start = stripes[role].read_start;
read_end = stripes[role].read_end;
data_ptrs[pg_minsize + (wanted++)] = (uint8_t*)stripes[role].read_buf;
}
}
if (wanted > 0)
{
recover_seq();
}
}
#else
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size)
{
int *dm_ids = (int*)get_jerasure_decoding_matrix(stripes, pg_size, pg_minsize);
if (!dm_ids)
{
return;
@@ -242,7 +354,9 @@ void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg
int *decoding_matrix = dm_ids + pg_minsize;
char *data_ptrs[pg_size];
for (int role = 0; role < pg_size; role++)
{
data_ptrs[role] = NULL;
}
for (int role = 0; role < pg_minsize; role++)
{
if (stripes[role].read_end != 0 && stripes[role].missing)
@@ -279,6 +393,7 @@ void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg
}
}
}
#endif
int extend_missing_stripes(osd_rmw_stripe_t *stripes, osd_num_t *osd_set, int pg_minsize, int pg_size)
{
@@ -679,12 +794,12 @@ void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_
calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
}
void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size)
{
uint32_t bitmap_granularity = bitmap_size > 0 ? chunk_size / bitmap_size / 8 : 0;
reed_sol_matrix_t *matrix = get_jerasure_matrix(pg_size, pg_minsize);
reconstruct_stripes_jerasure(stripes, pg_size, pg_minsize, bitmap_size);
reed_sol_matrix_t *matrix = get_ec_matrix(pg_size, pg_minsize);
reconstruct_stripes_ec(stripes, pg_size, pg_minsize, bitmap_size);
uint32_t start = 0, end = 0;
calc_rmw_parity_copy_mod(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, bitmap_granularity, start, end);
if (end != 0)
@@ -741,20 +856,34 @@ void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_min
curbuf[i]++;
}
}
#ifdef WITH_ISAL
ec_encode_data(
next_end-pos, pg_minsize, pg_size-pg_minsize, matrix->isal_data,
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
);
#else
jerasure_matrix_encode(
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data,
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->je_data,
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, next_end-pos
);
#endif
pos = next_end;
}
for (int i = 0; i < pg_size; i++)
{
data_ptrs[i] = stripes[i].bmp_buf;
}
#ifdef WITH_ISAL
ec_encode_data(
bitmap_size, pg_minsize, pg_size-pg_minsize, matrix->isal_data,
(uint8_t**)data_ptrs, (uint8_t**)data_ptrs+pg_minsize
);
#else
jerasure_matrix_encode(
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->data,
pg_minsize, pg_size-pg_minsize, OSD_JERASURE_W, matrix->je_data,
(char**)data_ptrs, (char**)data_ptrs+pg_minsize, bitmap_size
);
#endif
}
}
calc_rmw_parity_copy_parity(stripes, pg_size, pg_minsize, read_osd_set, write_osd_set, chunk_size, start, end);
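The ISA-L path added above builds its encode tables from the jerasure Vandermonde matrix and, for recovery, inverts a submatrix made of the surviving rows. For readers new to the library, here is a self-contained sketch of the same ISA-L calls (ec_init_tables, ec_encode_data, gf_invert_matrix), using ISA-L's own gf_gen_rs_matrix() instead of the jerasure matrix used above; k=2, p=1 and the chunk contents are purely illustrative:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <isa-l/erasure_code.h>

int main()
{
    const int k = 2, p = 1, len = 4096;
    uint8_t *chunks[k+p];
    for (int i = 0; i < k+p; i++)
        chunks[i] = (uint8_t*)malloc(len);
    memset(chunks[0], 0xAA, len);
    memset(chunks[1], 0x55, len);
    // Systematic (k+p) x k Reed-Solomon matrix: the first k rows are the identity,
    // the last p rows generate parity
    uint8_t matrix[(k+p)*k], tables[32*k*p];
    gf_gen_rs_matrix(matrix, k+p, k);
    // Expand only the parity rows into multiplication tables (32 bytes per coefficient)
    ec_init_tables(k, p, matrix + k*k, tables);
    // Data pointers first, then parity pointers
    ec_encode_data(len, k, p, tables, chunks, chunks + k);
    // Now pretend chunks[0] is lost: take the matrix rows of the k survivors
    // (row 1 = second data row, row 2 = parity row), invert that k x k submatrix,
    // and row 0 of the inverse recovers chunk 0 from the survivors
    uint8_t sub[k*k], inv[k*k], dectab[32*k*1];
    memcpy(sub, matrix + 1*k, k);
    memcpy(sub + k, matrix + 2*k, k);
    if (gf_invert_matrix(sub, inv, k) != 0)
        return 1; // singular submatrix
    ec_init_tables(k, 1, inv, dectab);
    uint8_t *survivors[] = { chunks[1], chunks[2] };
    uint8_t *recovered[] = { (uint8_t*)malloc(len) };
    ec_encode_data(len, k, 1, dectab, survivors, recovered);
    return memcmp(recovered[0], chunks[0], len) == 0 ? 0 : 1;
}

With 2+1 the single parity row is all ones, so the parity is a plain XOR of the data, which is the same fact the test below relies on ("jerasure 2+1 is still just XOR"). In the real code the decoding tables are additionally cached per erased-OSD set in reed_sol_matrix_t::decodings, as the hunk shows.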

View File

@@ -7,6 +7,7 @@
#include "object_id.h"
#include "osd_id.h"
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif
@@ -44,9 +45,9 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
void calc_rmw_parity_xor(osd_rmw_stripe_t *stripes, int pg_size, uint64_t *read_osd_set, uint64_t *write_osd_set,
uint32_t chunk_size, uint32_t bitmap_size);
void use_jerasure(int pg_size, int pg_minsize, bool use);
void use_ec(int pg_size, int pg_minsize, bool use);
void reconstruct_stripes_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size);
void reconstruct_stripes_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint32_t bitmap_size);
void calc_rmw_parity_jerasure(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
void calc_rmw_parity_ec(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize,
uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t bitmap_size);

View File

@@ -587,14 +587,14 @@ void test12()
input buffer: [ write0, write1 ],
rmw buffer: [ write2, write3, read0, read1 ],
}
then, after calc_rmw_parity_jerasure(): all the same
then, after calc_rmw_parity_ec(): all the same
then simulate read with read_osd_set=[0,0,3,4] and check read0,read1 buffers
***/
void test13()
{
use_jerasure(4, 2, true);
use_ec(4, 2, true);
osd_num_t osd_set[4] = { 1, 2, 0, 0 };
osd_num_t write_osd_set[4] = { 1, 2, 3, 4 };
osd_rmw_stripe_t stripes[4] = {};
@@ -628,7 +628,7 @@ void test13()
set_pattern(write_buf, 8192, PATTERN3);
set_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
set_pattern(stripes[1].read_buf, 128*1024-4096, PATTERN2);
calc_rmw_parity_jerasure(stripes, 4, 2, osd_set, write_osd_set, 128*1024, 0);
calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 128*1024, 0);
assert(stripes[0].write_start == 128*1024-4096 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 4096);
assert(stripes[2].write_start == 0 && stripes[2].write_end == 128*1024);
@@ -663,7 +663,7 @@ void test13()
assert(stripes[3].read_buf == (uint8_t*)read_buf+3*128*1024);
memcpy((uint8_t*)read_buf+2*128*1024, rmw_buf, 128*1024);
memcpy((uint8_t*)read_buf+3*128*1024, (uint8_t*)rmw_buf+128*1024, 128*1024);
reconstruct_stripes_jerasure(stripes, 4, 2, 0);
reconstruct_stripes_ec(stripes, 4, 2, 0);
check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
check_pattern(stripes[1].read_buf, 4096, PATTERN3);
@@ -694,14 +694,14 @@ void test13()
assert(stripes[3].read_buf == (uint8_t*)read_buf+2*128*1024);
memcpy((uint8_t*)read_buf+128*1024, rmw_buf, 128*1024);
memcpy((uint8_t*)read_buf+2*128*1024, (uint8_t*)rmw_buf+128*1024, 128*1024);
reconstruct_stripes_jerasure(stripes, 4, 2, 0);
reconstruct_stripes_ec(stripes, 4, 2, 0);
check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
free(read_buf);
// Huh done
free(rmw_buf);
free(write_buf);
use_jerasure(4, 2, false);
use_ec(4, 2, false);
}
/***
@@ -714,7 +714,7 @@ void test13()
input buffer: [ write0, write1 ],
rmw buffer: [ write2, read0, read1 ],
}
then, after calc_rmw_parity_jerasure(): all the same
then, after calc_rmw_parity_ec(): all the same
then simulate read with read_osd_set=[0,2,3] and check read0 buffer
***/
@@ -722,7 +722,7 @@ void test13()
void test14()
{
const int bmp = 4;
use_jerasure(3, 2, true);
use_ec(3, 2, true);
osd_num_t osd_set[3] = { 1, 2, 0 };
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = {};
@@ -757,7 +757,7 @@ void test14()
memset(stripes[0].bmp_buf, 0, bmp);
memset(stripes[1].bmp_buf, 0, bmp);
memset(stripes[2].bmp_buf, 0, bmp);
calc_rmw_parity_jerasure(stripes, 3, 2, osd_set, write_osd_set, 128*1024, bmp);
calc_rmw_parity_ec(stripes, 3, 2, osd_set, write_osd_set, 128*1024, bmp);
assert(*(uint32_t*)stripes[0].bmp_buf == 0x80000000);
assert(*(uint32_t*)stripes[1].bmp_buf == 0x00000001);
assert(*(uint32_t*)stripes[2].bmp_buf == 0x80000001); // jerasure 2+1 is still just XOR
@@ -793,12 +793,12 @@ void test14()
set_pattern(stripes[1].read_buf, 4096, PATTERN3);
set_pattern(stripes[1].read_buf+4096, 128*1024-4096, PATTERN2);
memcpy(stripes[2].read_buf, rmw_buf, 128*1024);
reconstruct_stripes_jerasure(stripes, 3, 2, bmp);
reconstruct_stripes_ec(stripes, 3, 2, bmp);
check_pattern(stripes[0].read_buf, 128*1024-4096, PATTERN1);
check_pattern(stripes[0].read_buf+128*1024-4096, 4096, PATTERN3);
free(read_buf);
// Huh done
free(rmw_buf);
free(write_buf);
use_jerasure(3, 2, false);
use_ec(3, 2, false);
}

View File

@@ -34,7 +34,6 @@
#define OBJ_DEGRADED 0x02
#define OBJ_INCOMPLETE 0x04
#define OBJ_MISPLACED 0x08
#define OBJ_DELETED 0x10
#define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 0.6.17
Version: 0.7.1
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -1,18 +1,39 @@
#!/bin/bash -ex
#!/bin/bash
. `dirname $0`/common.sh
OSD_SIZE=${OSD_SIZE:-1024}
PG_COUNT=${PG_COUNT:-1}
PG_SIZE=${PG_SIZE:-3}
PG_MINSIZE=${PG_MINSIZE:-2}
OSD_COUNT=${OSD_COUNT:-3}
SCHEME=${SCHEME:-ec}
# OSD_COUNT
SCHEME=${SCHEME:-replicated}
# OSD_ARGS
# PG_SIZE
# PG_MINSIZE
if [ "$SCHEME" = "ec" ]; then
OSD_COUNT=${OSD_COUNT:-5}
else
OSD_COUNT=${OSD_COUNT:-3}
fi
if [ "$IMMEDIATE_COMMIT" != "" ]; then
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
else
NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
fi
start_osd()
{
local i=$1
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
}
for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
start_osd $i
done
cd mon
@@ -21,43 +42,62 @@ cd ..
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
MON_PID=$!
if [ -n "$GLOBAL_CONF" ]; then
$ETCDCTL put /vitastor/config/global "$GLOBAL_CONF"
fi
if [ "$SCHEME" = "replicated" ]; then
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"pg_count":'$PG_COUNT',"failure_domain":"osd"}}'
if [ "$SCHEME" = "ec" ]; then
PG_SIZE=${PG_SIZE:-5}
PG_MINSIZE=${PG_MINSIZE:-3}
PG_DATA_SIZE=$PG_MINSIZE
POOLCFG='"scheme":"ec","parity_chunks":'$((PG_SIZE-PG_MINSIZE))
elif [ "$SCHEME" = "xor" ]; then
PG_SIZE=${PG_SIZE:-3}
PG_MINSIZE=${PG_MINSIZE:-2}
PG_DATA_SIZE=$PG_MINSIZE
POOLCFG='"scheme":"xor","parity_chunks":'$((PG_SIZE-PG_MINSIZE))
else
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"xor","pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"parity_chunks":1,"pg_count":'$PG_COUNT',"failure_domain":"osd"}}'
PG_SIZE=${PG_SIZE:-2}
PG_MINSIZE=${PG_MINSIZE:-2}
PG_DATA_SIZE=1
POOLCFG='"scheme":"replicated"'
fi
POOLCFG='"name":"testpool","failure_domain":"osd",'$POOLCFG
$ETCDCTL put /vitastor/config/pools '{"1":{'$POOLCFG',"pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"pg_count":'$PG_COUNT'}}'
sleep 2
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == '$PG_SIZE') | length) == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PGS NOT CONFIGURED"
fi
wait_up()
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PGS NOT UP"
fi
try_reweight()
{
local sec=$1
local i=0
local configured=0
osd=$1
w=$2
$ETCDCTL put /vitastor/config/osd/$osd '{"reweight":'$w'}'
sleep 3
}
wait_finish_rebalance()
{
sec=$1
i=0
while [[ $i -lt $sec ]]; do
if $ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["1"][] |
select(((.osd_set | select(. != 0) | sort | unique) | length) == '$PG_SIZE') ] | length) == '$PG_COUNT; then
configured=1
if $ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$PG_COUNT; then
break
fi
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32') && \
break
if [ $i -eq 60 ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
if [[ $configured -ne 0 ]]; then
format_error "FAILED: $PG_COUNT PG(s) NOT CONFIGURED"
fi
format_error "FAILED: $PG_COUNT PG(s) NOT UP"
fi
done
}
wait_up 60
if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then
sudo rm -f /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
sudo ln -s "$(realpath .)/build/src/block-vitastor.so" /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
fi
check_qemu()
{
if ! cmp build/src/block-vitastor.so /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so; then
sudo rm -f /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
sudo ln -s "$(realpath .)/build/src/block-vitastor.so" /usr/lib/x86_64-linux-gnu/qemu/block-vitastor.so
fi
}

View File

@@ -1,74 +0,0 @@
#!/bin/bash
. `dirname $0`/common.sh
if [ "$IMMEDIATE_COMMIT" != "" ]; then
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
else
NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
fi
start_osd()
{
local i=$1
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
}
OSD_SIZE=1024
OSD_COUNT=7
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
start_osd $i
done
cd mon
npm install
cd ..
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
MON_PID=$!
if [ "$EC" != "" ]; then
PG_SIZE=3
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
else
PG_SIZE=${PG_SIZE:-2}
POOLCFG='"scheme":"replicated","pg_size":'$PG_SIZE',"pg_minsize":2'
fi
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":32,"failure_domain":"osd"}}'
sleep 2
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == '$PG_SIZE') | length) == 32'); then
format_error "FAILED: 32 PGS NOT CONFIGURED"
fi
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32'); then
format_error "FAILED: 32 PGS NOT UP"
fi
try_reweight()
{
osd=$1
w=$2
$ETCDCTL put /vitastor/config/osd/$osd '{"reweight":'$w'}'
sleep 3
}
wait_finish_rebalance()
{
sec=$1
i=0
while [[ $i -lt $sec ]]; do
($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 32') && \
break
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
done
}

View File

@@ -8,11 +8,11 @@ cd $(dirname $0)
./test_cas.sh
./test_change_pg_count.sh
EC=1 ./test_change_pg_count.sh
SCHEME=ec ./test_change_pg_count.sh
./test_change_pg_size.sh
./test_degraded_delete.sh
./test_create_nomaxid.sh
./test_etcd_fail.sh
@@ -20,8 +20,8 @@ EC=1 ./test_change_pg_count.sh
./test_interrupted_rebalance.sh
IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh
EC=1 ./test_interrupted_rebalance.sh
EC=1 IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh
SCHEME=ec ./test_interrupted_rebalance.sh
SCHEME=ec IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh
./test_minsize_1.sh
@@ -29,17 +29,17 @@ EC=1 IMMEDIATE_COMMIT=1 ./test_interrupted_rebalance.sh
./test_rebalance_verify.sh
IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
EC=1 ./test_rebalance_verify.sh
EC=1 IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
SCHEME=ec ./test_rebalance_verify.sh
SCHEME=ec IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
./test_rm.sh
./test_snapshot.sh
SCHEME=replicated ./test_snapshot.sh
SCHEME=ec ./test_snapshot.sh
./test_splitbrain.sh
./test_write.sh
SCHEME=replicated ./test_write.sh
SCHEME=xor ./test_write.sh
./test_write_no_same.sh

View File

@@ -1,41 +1,11 @@
#!/bin/bash -ex
. `dirname $0`/common.sh
OSD_COUNT=${OSD_COUNT:-6}
PG_COUNT=16
if [ "$EC" != "" ]; then
POOLCFG='"scheme":"xor","pg_size":3,"pg_minsize":2,"parity_chunks":1'
NOBJ=512
else
POOLCFG='"scheme":"replicated","pg_size":2,"pg_minsize":2'
NOBJ=1024
fi
. `dirname $0`/run_3osds.sh
OSD_SIZE=1024
OSD_COUNT=6
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done
cd mon
npm install
cd ..
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 &>./testdata/mon.log &
MON_PID=$!
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":16,"failure_domain":"osd"}}'
sleep 2
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == 16'); then
format_error "FAILED: 16 PGS NOT CONFIGURED"
fi
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 16'); then
format_error "FAILED: 16 PGS NOT UP"
fi
NOBJ=$(((128*8+PG_DATA_SIZE-1)/PG_DATA_SIZE))
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -fsync=1 -rw=write \
@@ -49,7 +19,7 @@ try_change()
echo --- Change PG count to $n --- >>testdata/osd$i.log
done
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool",'$POOLCFG',"pg_count":'$n',"failure_domain":"osd"}}'
$ETCDCTL put /vitastor/config/pools '{"1":{'$POOLCFG',"pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"pg_count":'$n'}}'
for i in {1..10}; do
($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | select(. > 0)) | length == 2) | length) == '$n) && \

View File

@@ -1,40 +1,16 @@
#!/bin/bash -ex
. `dirname $0`/common.sh
PG_COUNT=16
SCHEME=${SCHEME:-replicated}
OSD_SIZE=1024
OSD_COUNT=3
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done
cd mon
npm install
cd ..
node mon/mon-main.js --etcd_url $ETCD_URL --etcd_prefix "/vitastor" &>./testdata/mon.log &
MON_PID=$!
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":3,"pg_minsize":2,"pg_count":16,"failure_domain":"osd"}}'
sleep 2
if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only | jq -s -e '(.[0].items["1"] | map((.osd_set | sort) == ["1","2","3"]) | length) == 16'); then
format_error "FAILED: 16 PGS NOT CONFIGURED"
fi
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == 16'); then
format_error "FAILED: 16 PGS NOT UP"
fi
. `dirname $0`/run_3osds.sh
try_change()
{
n=$1
s=$2
$ETCDCTL put /vitastor/config/pools '{"1":{"name":"testpool","scheme":"replicated","pg_size":'$s',"pg_minsize":2,"pg_count":'$n',"failure_domain":"osd"}}'
$ETCDCTL put /vitastor/config/pools '{"1":{'$POOLCFG',"pg_size":'$s',"pg_minsize":'$PG_MINSIZE',"pg_count":'$n'}}'
for i in {1..10}; do
($ETCDCTL get /vitastor/config/pgs --print-value-only |\

tests/test_create_nomaxid.sh Executable file
View File

@@ -0,0 +1,21 @@
#!/bin/bash -ex
# Test vitastor-cli create when /index/maxid is out of sync
. `dirname $0`/run_3osds.sh
$ETCDCTL put /vitastor/config/inode/1/120 '{"name":"testimg","size":'$((1024*1024*1024))'}'
build/src/vitastor-cli create --etcd_address $ETCD_URL -s 1G testimg2
t=$($ETCDCTL get --print-value-only /vitastor/config/inode/1/121 | jq -r .name)
if [[ "$t" != "testimg2" ]]; then
format_error "testimg2 should've been created as inode 121"
fi
t=$($ETCDCTL get --print-value-only /vitastor/index/maxid/1)
if [[ "$t" != 121 ]]; then
format_error "/index/maxid should've been set to 121"
fi
format_green OK

View File

@@ -1,131 +0,0 @@
#!/bin/bash -ex
# Run 3 OSDs
. `dirname $0`/run_3osds.sh
# Write inodes 1 and 2
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
-rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
-rw=write -etcd=$ETCD_URL -pool=1 -inode=2 -size=128M -runtime=10
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 \
-rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10 &>/dev/null &
sleep 5
# Stop OSD 1
kill -INT $OSD1_PID
sleep 2
# Remove inode 2
build/src/vitastor-cli rm-data --etcd_address $ETCD_URL --pool 1 --inode 2
# Run 3 more OSDs and move PG to 4,5,6
for i in $(seq 4 6); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done
$ETCDCTL put /vitastor/config/osd/1 '{"reweight":0}'
$ETCDCTL put /vitastor/config/osd/2 '{"reweight":0}'
$ETCDCTL put /vitastor/config/osd/3 '{"reweight":0}'
# Wait for rebalance to finish
wait_finish_rebalance()
{
local sec=$1
local st=$2
local i=0
while [[ $i -lt $sec ]]; do
if $ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e \
'([ .[] | select(.state == ['$st'] and (.peers | contains([1]) | not) and (.peers | contains([2,3]) | not)) ] | length) == '$PG_COUNT; then
break
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
done
}
wait_finish_rebalance 60 '"active","left_on_dead"'
# Stop OSD 2,3
kill -INT $OSD2_PID
kill -INT $OSD3_PID
sleep 2
# Verify that PGs are still active
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active","left_on_dead"]) ] | length == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PG(s) NOT UP"
fi
# Start OSD 1
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log &
OSD1_PID=$!
# Verify that inode 2 is removed and inode 1 is in place
wait_repeer_1()
{
local sec=$1
local i=0
while [[ $i -lt $sec ]]; do
if grep -q 'Repeer because of OSD 1' testdata/osd4.log testdata/osd5.log testdata/osd6.log; then
break
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
format_error "OSD 4/5/6 do not peer with older OSD 1"
fi
done
}
wait_repeer_1 15
wait_finish_rebalance 15 '"active"'
if [ "$SCHEME" = "replicated" ]; then
NOBJ=1024
else
NOBJ=$((1024/(PG_SIZE-1)))
fi
if ! ($ETCDCTL get /vitastor/pg/stats/1/1 --print-value-only | jq -s -e '.[0].object_count == '$NOBJ); then
format_error "FAILED: PG SHOULD CONTAIN EXACTLY 128 MB OF DATA, BUT IT DOESN'T"
fi
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=4096" \
-O raw ./testdata/inode1.bin
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size="$((128*1024*1024)) \
-O raw ./testdata/inode2.bin
if (dd if=/dev/zero bs=4096 count=1 | diff - ./testdata/inode1.bin); then
format_error "FAILED: INODE 1 SEEMS LOST"
fi
if ! (dd if=/dev/zero bs=1M count=128 | diff - ./testdata/inode2.bin); then
format_error "FAILED: INODE 2 SEEMS RESTORED"
fi
format_green OK

View File

@@ -3,8 +3,10 @@
# Kill OSDs while writing
PG_SIZE=3
. `dirname $0`/run_7osds.sh
OSD_COUNT=7
PG_COUNT=32
. `dirname $0`/run_3osds.sh
check_qemu
IMG_SIZE=960

View File

@@ -1,6 +1,8 @@
#!/bin/bash -ex
. `dirname $0`/run_7osds.sh
OSD_COUNT=7
PG_COUNT=32
. `dirname $0`/run_3osds.sh
IMG_SIZE=960
@@ -32,13 +34,14 @@ try_reweight 5 1
wait_finish_rebalance 60
# Check that PGs never had degraded objects !
if grep has_degraded ./testdata/mon.log; then
format_error "Some copies of objects were lost during interrupted rebalancings"
fi
# FIXME: In fact, the test doesn't guarantee it because PGs aren't always peered only with full prior OSD sets :-(
#if grep has_degraded ./testdata/mon.log; then
# format_error "Some copies of objects were lost during interrupted rebalancings"
#fi
# Check that no objects are lost !
nobj=`$ETCDCTL get --prefix '/vitastor/pg/stats' --print-value-only | jq -s '[ .[].object_count ] | reduce .[] as $num (0; .+$num)'`
if [ "$nobj" -ne $((IMG_SIZE*8)) ]; then
if [ "$nobj" -ne $((IMG_SIZE*8/PG_DATA_SIZE)) ]; then
format_error "Data lost after multiple interrupted rebalancings"
fi

View File

@@ -1,6 +1,8 @@
#!/bin/bash -ex
. `dirname $0`/run_7osds.sh
OSD_COUNT=7
PG_COUNT=32
. `dirname $0`/run_3osds.sh
IMG_SIZE=256

View File

@@ -1,6 +1,7 @@
#!/bin/bash -ex
. `dirname $0`/run_3osds.sh
check_qemu
# Test basic write and snapshot

View File

@@ -3,6 +3,7 @@
OSD_SIZE=2048
. `dirname $0`/run_3osds.sh
check_qemu
$ETCDCTL put /vitastor/config/inode/1/1 '{"name":"debian9","size":'$((2048*1024*1024))'}'

View File

@@ -1,6 +1,7 @@
#!/bin/bash -ex
. `dirname $0`/run_3osds.sh
check_qemu
#LD_PRELOAD=libasan.so.5 \
# fio -thread -name=test -ioengine=build/src/libfio_vitastor_sec.so -bs=4k -fsync=128 `$ETCDCTL get /vitastor/osd/state/1 --print-value-only | jq -r '"-host="+.addresses[0]+" -port="+(.port|tostring)'` -rw=write -size=32M