Compare commits

..

31 Commits

Author SHA1 Message Date
ab615849d6 Release 0.8.8
- Fix vitastor-cli rm/rm-data broken in 0.8.6 (missing messenger initialization)
- Prepare OSD read handler for upcoming version with scrub - allow "secondary reads" to return errors
- Fix OSDs re-peering PGs infinitely with a big number of PGs (reproduced in test_add_osd)
- Fix another variant of flusher sync-waiting stall (reproduced in test_write)
- Fix other tests in tests/ (will add them to Gitea CI soon)
- Add patches for QEMU 6.2-8.0
- Fix QEMU driver compatibility with QEMU 8.0
- Build packages for RHEL 9 clones (based on AlmaLinux 9)
2023-04-28 11:22:00 +03:00
38be9a49c0 Add AlmaLinux 9 build to documentation 2023-04-28 02:00:52 +03:00
7d6bf84a3e Add scripts/meson-buildoptions.sh to QEMU patches 2023-04-28 01:43:22 +03:00
41a40a4123 Add QEMU spec patch for Alma/Rocky/RH 9 2023-04-28 01:32:06 +03:00
b94587ef0e Fix some build warnings 2023-04-28 00:44:27 +03:00
2a2f4f6738 Add Almalinux 9 build 2023-04-28 00:40:50 +03:00
c768a9015f Fix QEMU driver compatibility with QEMU 8.0 2023-04-25 11:20:21 +03:00
0d9e10cf96 Add patches for QEMU 6.2-8.0 2023-04-25 11:20:21 +03:00
b74ccb613c Fix another variant of flusher sync-waiting stall 2023-04-24 00:44:41 +03:00
5052174918 Fix test_write_no_same (too large image) 2023-04-24 00:44:41 +03:00
eec9cf5575 Fix test_snapshot.sh - qemu-img requires explicit backing_fmt 2023-04-24 00:44:41 +03:00
a04dab0840 Initialize messenger in cluster_client listings 2023-04-24 00:44:41 +03:00
160863f707 Print op pointer values in slow log 2023-04-23 17:54:00 +03:00
2f16c32eb4 Fix test_minsize_1 (left_on_dead) 2023-04-23 17:54:00 +03:00
2877cd0adb Allow OP_SEC_READ to return errors (do not hang the connection) 2023-04-23 17:54:00 +03:00
480509f5b9 Fix pg_data_size > 1 for replicas (harmless bug) 2023-04-23 01:50:42 +03:00
46462da45e Preload own PG history updates to fix PG state loop possibly applying the old metadata version 2023-04-23 01:50:30 +03:00
024c8658f6 Fix missing } in quick start documentation 2023-04-12 18:26:38 +03:00
7e958afeda Release 0.8.7
This release includes a bunch of important bugfixes for erasure-coded setups
with disabled immediate_commit. After these fixes, "test_heal" OSD killing test
now passes fine with EC:

- Fix cluster write stalls with "Error while doing flush on OSD xx: -16 (Device or resource busy)"
  in OSD logs possible in EC setups with disabled immediate_commit by selectively
  syncing nonsynced objects on STABILIZE/ROLLBACK (https://github.com/vitalif/vitastor/issues/51)
- Fix other EC + disabled immediate_commit problems:
  - Fix "opcode=5 retval=-2" errors happening on SYNC retries
  - Fix non-working "pagination" during PG dirty object flushing
  - Fix write operations not continued correctly after dirty object flushing
- Fix incorrect parity read-modify-write calculation when writing into a lost chunk
- Fix OSDs losing left_on_dead PG state of non-clean PGs and thus not removing junk data in the cluster
- Fix a small memory leak caused by bad indexing of EC recovery matrices
- Fix a rare use-after-free in cluster_client caused by a reenterability issue
- Fix vitastor-cli create command syntax in the CSI driver
- Allow to start OSDs without local store for tests
- Fix memory allocation error in disk_tool_meta for non-standard metadata block sizes
- Fix delete operations received before loading pool metadata crashing OSDs with "null pointer exception"
- Improve "theoretical performance" Russian documentation

New features:

- Implement online configuration update for some parameters. Documentation is coming soon :)
2023-04-11 02:11:57 +03:00
2f5e769a29 Fix a small memory leak caused by bad indexing of EC recovery matrices 2023-04-11 00:30:36 +03:00
28d5e53c6c Add test_heal to run_tests 2023-04-09 02:10:42 +03:00
d9f55f11d8 More logs (log_level 10), append to log instead of overwriting on restart in tests 2023-04-09 02:06:10 +03:00
3237014608 Fix incorrect parity read-modify-write calculation when writing into a lost chunk 2023-04-09 02:06:10 +03:00
baaf8f6f44 Fix write operations not continued correctly after flush 2023-04-09 02:06:10 +03:00
1d83fdcd17 Add debug logs to osd_flush 2023-04-09 02:06:10 +03:00
0ddd787c38 Fix non-working "pagination" during PG dirty object flushing 2023-04-08 02:44:02 +03:00
6eff3a60a5 Do not lose left_on_dead PG state of non-clean PGs 2023-04-08 02:44:02 +03:00
888a6975ab Fix a rare use-after-free in cluster_client caused by a reenterability issue 2023-04-08 02:44:02 +03:00
cd1e890bd4 Fix "opcode=5 retval=-2" errors sometimes possible with EC 2023-04-08 02:44:02 +03:00
0fbf4c6a08 Selectively sync nonsynced objects on STABILIZE/ROLLBACK (fix for github issue #51) 2023-04-08 02:44:02 +03:00
d06ed2b0e7 Implement online config update 2023-03-26 19:21:50 +03:00
76 changed files with 2254 additions and 315 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "0.8.6")
set(VERSION "0.8.8")
add_subdirectory(src)

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.8.6
VERSION ?= v0.8.8
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.8.6
image: vitalif/vitastor-csi:v0.8.8
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.8.6
image: vitalif/vitastor-csi:v0.8.8
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.8.6"
vitastorCSIDriverVersion = "0.8.8"
)
// Config struct fills the parameters of request or user input

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (0.8.6-1) unstable; urgency=medium
vitastor (0.8.8-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.8.6-1) unstable; urgency=medium
vitastor (0.8.8-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

View File

@@ -34,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.8.6; \
cd vitastor-0.8.6; \
cp -r /root/vitastor vitastor-0.8.8; \
cd vitastor-0.8.8; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.6.orig.tar.xz vitastor-0.8.6; \
cd vitastor-0.8.6; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.8.orig.tar.xz vitastor-0.8.8; \
cd vitastor-0.8.8; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -22,14 +22,17 @@
- Add Vitastor package repository:
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release.rpm`
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release.rpm`
- AlmaLinux 9 and other RHEL 9 clones (Rocky, Oracle...): `dnf install https://vitastor.io/rpms/centos/9/vitastor-release.rpm`
- Enable EPEL: `yum/dnf install epel-release`
- Enable additional CentOS repositories:
- CentOS 7: `yum install centos-release-scl`
- CentOS 8: `dnf install centos-release-advanced-virtualization`
- RHEL 9 clones: not required
- Enable elrepo-kernel:
- CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm`
- CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm`
- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm`
- RHEL 9 clones: optional, not required: `dnf install https://www.elrepo.org/elrepo-release-9.el9.elrepo.noarch.rpm`
- Install packages: `yum/dnf install vitastor lpsolve etcd qemu-kvm` and optionally `kernel-ml` if you use elrepo-kernel
## Installation requirements

View File

@@ -70,7 +70,7 @@ For EC pools the configuration should look like the following:
```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
```
After you do this, one of the monitors will configure PGs and OSDs will start them.

View File

@@ -71,7 +71,7 @@ etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}'
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
```
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.

View File

@@ -51,8 +51,9 @@ const etcd_tree = {
// THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
// etcd connection
config_path: "/etc/vitastor/vitastor.conf",
etcd_address: "10.0.115.10:2379/v3",
etcd_prefix: "/vitastor",
// etcd connection - configurable online
etcd_address: "10.0.115.10:2379/v3",
// mon
etcd_mon_ttl: 30, // min: 10
etcd_mon_timeout: 1000, // ms. min: 0
@@ -73,11 +74,12 @@ const etcd_tree = {
rdma_max_send: 8,
rdma_max_recv: 16,
rdma_max_msg: 132096,
log_level: 0,
block_size: 131072,
disk_alignment: 4096,
bitmap_granularity: 4096,
immediate_commit: false, // 'all' or 'small'
// client and osd - configurable online
log_level: 0,
client_dirty_limit: 33554432,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
@@ -95,18 +97,19 @@ const etcd_tree = {
osd_network: null, // "192.168.7.0/24" or an array of masks
bind_address: "0.0.0.0",
bind_port: 0,
readonly: false,
osd_memlock: false,
// osd - configurable online
autosync_interval: 5,
autosync_writes: 128,
client_queue_depth: 128, // unused
recovery_queue_depth: 4,
recovery_sync_batch: 16,
readonly: false,
no_recovery: false,
no_rebalance: false,
print_stats_interval: 3,
slow_log_interval: 10,
inode_vanish_time: 60,
osd_memlock: false,
// blockstore - fixed in superblock
block_size,
disk_alignment,
@@ -125,14 +128,15 @@ const etcd_tree = {
meta_offset,
disable_meta_fsync,
disable_device_lock,
// blockstore - configurable
max_write_iodepth,
min_flusher_count: 1,
max_flusher_count: 256,
// blockstore - configurable offline
inmemory_metadata,
inmemory_journal,
journal_sector_buffer_count,
journal_no_same_sector_overwrites,
// blockstore - configurable online
max_write_iodepth,
min_flusher_count: 1,
max_flusher_count: 256,
throttle_small_writes: false,
throttle_target_iops: 100,
throttle_target_mbs: 100,

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.8.6'
VERSION = '0.8.8'
LOG = logging.getLogger(__name__)

View File

@@ -0,0 +1,169 @@
Index: pve-qemu-kvm-7.2.0/block/meson.build
===================================================================
--- pve-qemu-kvm-7.2.0.orig/block/meson.build
+++ pve-qemu-kvm-7.2.0/block/meson.build
@@ -113,6 +113,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: pve-qemu-kvm-7.2.0/meson.build
===================================================================
--- pve-qemu-kvm-7.2.0.orig/meson.build
+++ pve-qemu-kvm-7.2.0/meson.build
@@ -1026,6 +1026,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1865,6 +1885,7 @@ config_host_data.set('CONFIG_NUMA', numa
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
@@ -3957,6 +3978,7 @@ if spice_protocol.found()
summary_info += {' spice server support': spice}
endif
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
Index: pve-qemu-kvm-7.2.0/meson_options.txt
===================================================================
--- pve-qemu-kvm-7.2.0.orig/meson_options.txt
+++ pve-qemu-kvm-7.2.0/meson_options.txt
@@ -169,6 +169,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
Index: pve-qemu-kvm-7.2.0/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-7.2.0.orig/qapi/block-core.json
+++ pve-qemu-kvm-7.2.0/qapi/block-core.json
@@ -3213,7 +3213,7 @@
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4223,6 +4223,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4671,6 +4693,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5072,6 +5095,17 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5269,6 +5303,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: pve-qemu-kvm-7.2.0/scripts/ci/org.centos/stream/8/x86_64/configure
===================================================================
--- pve-qemu-kvm-7.2.0.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ pve-qemu-kvm-7.2.0/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -31,7 +31,7 @@
--with-git=meson \
--with-git-submodules=update \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -179,6 +179,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \

View File

@@ -0,0 +1,169 @@
diff --git a/block/meson.build b/block/meson.build
index deb73ca389..e269f599a1 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -78,6 +78,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index 96de1a6ef9..2e3994777d 100644
--- a/meson.build
+++ b/meson.build
@@ -838,6 +838,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1455,6 +1475,7 @@ config_host_data.set('CONFIG_LINUX_AIO', libaio.found())
config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
@@ -3412,6 +3433,7 @@ if spice_protocol.found()
summary_info += {' spice server support': spice}
endif
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
diff --git a/meson_options.txt b/meson_options.txt
index e392323732..5b56007475 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -121,6 +121,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('gtk', type : 'feature', value : 'auto',
description: 'GTK+ user interface')
option('sdl', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 1d3dd9cb48..88453405e5 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2930,7 +2930,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
##
# @BlockdevOptionsFile:
@@ -3864,6 +3864,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4259,6 +4281,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT'
@@ -4647,6 +4670,17 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -4846,6 +4880,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 7a17ff4218..cdddbf32aa 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -69,6 +69,7 @@ meson_options_help() {
printf "%s\n" ' oss OSS sound support'
printf "%s\n" ' pa PulseAudio sound support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' sdl SDL user interface'
printf "%s\n" ' sdl-image SDL Image support for icons'
printf "%s\n" ' seccomp seccomp support'
@@ -210,6 +211,8 @@ _meson_option_parse() {
--disable-pa) printf "%s" -Dpa=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-sdl) printf "%s" -Dsdl=enabled ;;
--disable-sdl) printf "%s" -Dsdl=disabled ;;
--enable-sdl-image) printf "%s" -Dsdl_image=enabled ;;

View File

@@ -0,0 +1,190 @@
diff --git a/block/meson.build b/block/meson.build
index 0b2a60c99b..d923713804 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -98,6 +98,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index 861de93c4f..272f72af11 100644
--- a/meson.build
+++ b/meson.build
@@ -884,6 +884,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1546,6 +1566,7 @@ config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
config_host_data.set('CONFIG_NUMA', numa.found())
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
@@ -3709,6 +3730,7 @@ if spice_protocol.found()
summary_info += {' spice server support': spice}
endif
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 52b11cead4..d8d0868174 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -149,6 +149,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('gtk', type : 'feature', value : 'auto',
description: 'GTK+ user interface')
option('sdl', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index beeb91952a..1c98dc0e12 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2929,7 +2929,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
##
# @BlockdevOptionsFile:
@@ -3863,6 +3863,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4277,6 +4299,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT'
@@ -4665,6 +4688,17 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -4864,6 +4898,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
index 9850dd4444..72b1287520 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -31,7 +31,7 @@
--with-git=meson \
--with-git-submodules=update \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -181,6 +181,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 1e26f4571e..370898d48c 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -98,6 +98,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' replication replication support'
printf "%s\n" ' sdl SDL user interface'
printf "%s\n" ' sdl-image SDL Image support for icons'
@@ -289,6 +290,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-replication) printf "%s" -Dreplication=enabled ;;
--disable-replication) printf "%s" -Dreplication=disabled ;;
--enable-rng-none) printf "%s" -Drng_none=true ;;

View File

@@ -0,0 +1,190 @@
diff --git a/block/meson.build b/block/meson.build
index 60bc305597..89a042216f 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -98,6 +98,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index 20fddbd707..600db4e2fb 100644
--- a/meson.build
+++ b/meson.build
@@ -967,6 +967,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1799,6 +1819,7 @@ config_host_data.set('CONFIG_NUMA', numa.found())
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
@@ -3954,6 +3975,7 @@ if spice_protocol.found()
summary_info += {' spice server support': spice}
endif
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index e58e158396..9747b38fd0 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -167,6 +167,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 2173e7734a..5a4900b322 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2955,7 +2955,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
##
# @BlockdevOptionsFile:
@@ -3883,6 +3883,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4327,6 +4349,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT'
@@ -4717,6 +4740,17 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -4915,6 +4949,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
index a7f92aff90..53dc55be2e 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -31,7 +31,7 @@
--with-git=meson \
--with-git-submodules=update \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -179,6 +179,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 359b04e0e6..f5b85ba78c 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -135,6 +135,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' sdl SDL user interface'
@@ -370,6 +371,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-replication) printf "%s" -Dreplication=enabled ;;

View File

@@ -0,0 +1,190 @@
diff --git a/block/meson.build b/block/meson.build
index b7c68b83a3..95d8a6f15d 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -100,6 +100,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index 5c6b5a1c75..f31f73612e 100644
--- a/meson.build
+++ b/meson.build
@@ -1026,6 +1026,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1861,6 +1881,7 @@ config_host_data.set('CONFIG_NUMA', numa.found())
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
@@ -3945,6 +3966,7 @@ if spice_protocol.found()
summary_info += {' spice server support': spice}
endif
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 4b749ca549..6b37bd6b77 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -169,6 +169,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 95ac4fa634..7a240827e4 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2959,7 +2959,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -3957,6 +3957,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4405,6 +4427,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -4804,6 +4827,17 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5002,6 +5036,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
index a7f92aff90..53dc55be2e 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -31,7 +31,7 @@
--with-git=meson \
--with-git-submodules=update \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -179,6 +179,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index aa6e30ea91..c45d21c40f 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -135,6 +135,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' sdl SDL user interface'
@@ -376,6 +377,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-replication) printf "%s" -Dreplication=enabled ;;

View File

@@ -0,0 +1,190 @@
diff --git a/block/meson.build b/block/meson.build
index 382bec0e7d..af6207dbce 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -101,6 +101,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index c44d05a13f..ebedb42843 100644
--- a/meson.build
+++ b/meson.build
@@ -1028,6 +1028,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1878,6 +1898,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
@@ -4002,6 +4023,7 @@ if spice_protocol.found()
summary_info += {' spice server support': spice}
endif
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index fc9447d267..c4ac55c283 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -173,6 +173,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index c05ad0c07e..f5eb701604 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3054,7 +3054,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4073,6 +4073,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4521,6 +4543,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -4920,6 +4943,17 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5118,6 +5152,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
index 6e8983f39c..1b0b9fcf3e 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -32,7 +32,7 @@
--with-git=meson \
--with-git-submodules=update \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -179,6 +179,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 009fab1515..95914e6ebc 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -142,6 +142,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' sdl SDL user interface'
@@ -388,6 +389,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-replication) printf "%s" -Dreplication=enabled ;;

View File

@@ -7,13 +7,12 @@ set -e
VITASTOR=$(dirname $0)
VITASTOR=$(realpath "$VITASTOR/..")
if [ -d /opt/rh/gcc-toolset-9 ]; then
EL=$(rpm --eval '%dist')
if [ "$EL" = ".el8" ]; then
# CentOS 8
EL=8
. /opt/rh/gcc-toolset-9/enable
else
elif [ "$EL" = ".el7" ]; then
# CentOS 7
EL=7
. /opt/rh/devtoolset-9/enable
fi
cd ~/rpmbuild/SPECS
@@ -25,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.8.6/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.6$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.8.8/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.8$(rpm --eval '%dist').tar.gz *

View File

@@ -0,0 +1,93 @@
--- qemu-kvm.spec.orig 2023-02-28 08:04:06.000000000 +0000
+++ qemu-kvm.spec 2023-04-27 22:29:18.094878829 +0000
@@ -100,8 +100,6 @@
%endif
%global target_list %{kvm_target}-softmmu
-%global block_drivers_rw_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,compress
-%global block_drivers_ro_list vdi,vmdk,vhdx,vpc,https
%define qemudocdir %{_docdir}/%{name}
%global firmwaredirs "%{_datadir}/qemu-firmware:%{_datadir}/ipxe/qemu:%{_datadir}/seavgabios:%{_datadir}/seabios"
@@ -129,6 +127,7 @@ Requires: %{name}-device-usb-host = %{ep
Requires: %{name}-device-usb-redirect = %{epoch}:%{version}-%{release} \
%endif \
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
Requires: %{name}-audio-pa = %{epoch}:%{version}-%{release}
# Since SPICE is removed from RHEL-9, the following Obsoletes:
@@ -151,7 +150,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
Version: 7.0.0
-Release: 13%{?rcrel}%{?dist}%{?cc_suffix}.2
+Release: 13.vitastor%{?rcrel}%{?dist}%{?cc_suffix}
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
# Epoch 15 used for RHEL 8
# Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5)
@@ -174,6 +173,7 @@ Source28: 95-kvm-memlock.conf
Source30: kvm-s390x.conf
Source31: kvm-x86.conf
Source36: README.tests
+Source37: qemu-vitastor.c
Patch0004: 0004-Initial-redhat-build.patch
@@ -498,6 +498,7 @@ Patch171: kvm-i386-do-kvm_put_msr_featur
Patch172: kvm-target-i386-kvm-fix-kvmclock_current_nsec-Assertion-.patch
# For bz#2168221 - while live-migrating many instances concurrently, libvirt sometimes return internal error: migration was active, but no RAM info was set [rhel-9.1.0.z]
Patch173: kvm-migration-Read-state-once.patch
+Patch174: qemu-7.0-vitastor.patch
# Source-git patches
@@ -531,6 +532,7 @@ BuildRequires: libcurl-devel
%if %{have_block_rbd}
BuildRequires: librbd-devel
%endif
+BuildRequires: vitastor-client-devel
# We need both because the 'stap' binary is probed for by configure
BuildRequires: systemtap
BuildRequires: systemtap-sdt-devel
@@ -718,6 +720,14 @@ using the rbd protocol.
%endif
+%package block-vitastor
+Summary: QEMU Vitastor block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-vitastor
+This package provides the additional Vitastor block driver for QEMU.
+
+
%package audio-pa
Summary: QEMU PulseAudio audio driver
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
@@ -811,6 +821,7 @@ This package provides usbredir support.
%prep
%setup -q -n qemu-%{version}%{?rcstr}
%autopatch -p1
+cp %{SOURCE37} ./block/vitastor.c
%global qemu_kvm_build qemu_kvm_build
mkdir -p %{qemu_kvm_build}
@@ -1032,6 +1043,7 @@ run_configure \
%if %{have_block_rbd}
--enable-rbd \
%endif
+ --enable-vitastor \
%if %{have_librdma}
--enable-rdma \
%endif
@@ -1511,6 +1523,9 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
%files block-rbd
%{_libdir}/%{name}/block-rbd.so
%endif
+%files block-vitastor
+%{_libdir}/%{name}/block-vitastor.so
+
%files audio-pa
%{_libdir}/%{name}/audio-pa.so

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.6.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.8.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.6
Version: 0.8.8
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.6.el7.tar.gz
Source0: vitastor-0.8.8.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.6.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.8.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.6
Version: 0.8.8
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.6.el8.tar.gz
Source0: vitastor-0.8.8.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -0,0 +1,28 @@
# Build packages for AlmaLinux 9 inside a container
# cd ..; podman build -t vitastor-el9 -v `pwd`/packages:/root/packages -f rpm/vitastor-el9.Dockerfile .
FROM almalinux:9
WORKDIR /root
RUN sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/*.repo
RUN dnf -y install epel-release dnf-plugins-core
RUN dnf -y install https://vitastor.io/rpms/centos/9/vitastor-release-1.0-1.el9.noarch.rpm
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive liburing-devel cmake
RUN dnf download --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --spec fio.spec
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.8.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
mkdir -p /root/packages/vitastor-el9; \
rm -rf /root/packages/vitastor-el9/*; \
cp ~/rpmbuild/RPMS/*/vitastor* /root/packages/vitastor-el9/; \
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el9/

158
rpm/vitastor-el9.spec Normal file
View File

@@ -0,0 +1,158 @@
Name: vitastor
Version: 0.8.8
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.8.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: gcc-c++
BuildRequires: nodejs >= 10
BuildRequires: jerasure-devel
BuildRequires: libisa-l-devel
BuildRequires: gf-complete-devel
BuildRequires: rdma-core-devel
BuildRequires: cmake
Requires: vitastor-osd = %{version}-%{release}
Requires: vitastor-mon = %{version}-%{release}
Requires: vitastor-client = %{version}-%{release}
Requires: vitastor-client-devel = %{version}-%{release}
Requires: vitastor-fio = %{version}-%{release}
%description
Vitastor is a small, simple and fast clustered block storage (storage for VM drives),
architecturally similar to Ceph which means strong consistency, primary-replication,
symmetric clustering and automatic data distribution over any number of drives of any
size with configurable redundancy (replication or erasure codes/XOR).
%package -n vitastor-osd
Summary: Vitastor - OSD
Requires: vitastor-client = %{version}-%{release}
Requires: util-linux
Requires: parted
%description -n vitastor-osd
Vitastor object storage daemon, i.e. server program that stores data.
%package -n vitastor-mon
Summary: Vitastor - monitor
Requires: nodejs >= 10
Requires: lpsolve
%description -n vitastor-mon
Vitastor monitor, i.e. server program responsible for watching cluster state and
scheduling cluster-level operations.
%package -n vitastor-client
Summary: Vitastor - client
%description -n vitastor-client
Vitastor client library and command-line interface.
%package -n vitastor-client-devel
Summary: Vitastor - development files
Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release}
%description -n vitastor-client-devel
Vitastor library headers for development.
%package -n vitastor-fio
Summary: Vitastor - fio drivers
Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release}
Requires: fio = 3.27-7.el9
%description -n vitastor-fio
Vitastor fio drivers for benchmarking.
%prep
%setup -q
%build
%cmake
%cmake_build
%install
rm -rf $RPM_BUILD_ROOT
%cmake_install
cd mon
npm install
cd ..
mkdir -p %buildroot/usr/lib/vitastor
cp -r mon %buildroot/usr/lib/vitastor
mkdir -p %buildroot/lib/systemd/system
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
mkdir -p %buildroot/lib/udev/rules.d
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
%files
%doc GPL-2.0.txt VNPL-1.1.txt README.md README-ru.md
%files -n vitastor-osd
%_bindir/vitastor-osd
%_bindir/vitastor-disk
%_bindir/vitastor-dump-journal
/lib/systemd/system/vitastor-osd@.service
/lib/systemd/system/vitastor.target
/lib/udev/rules.d/90-vitastor.rules
%pre -n vitastor-osd
groupadd -r -f vitastor 2>/dev/null ||:
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
install -o vitastor -g vitastor -d /var/log/vitastor
mkdir -p /etc/vitastor
%files -n vitastor-mon
/usr/lib/vitastor/mon
/lib/systemd/system/vitastor-mon.service
%pre -n vitastor-mon
groupadd -r -f vitastor 2>/dev/null ||:
useradd -r -g vitastor -s /sbin/nologin -c "Vitastor daemons" -M -d /nonexistent vitastor 2>/dev/null ||:
mkdir -p /etc/vitastor
%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm
%_bindir/vita
%_libdir/libvitastor_blk.so*
%_libdir/libvitastor_client.so*
%files -n vitastor-client-devel
%_includedir/vitastor_c.h
%_libdir/pkgconfig
%files -n vitastor-fio
%_libdir/libfio_vitastor.so
%_libdir/libfio_vitastor_blk.so
%_libdir/libfio_vitastor_sec.so
%changelog

View File

@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.8.6")
add_definitions(-DVERSION="0.8.8")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)

View File

@@ -13,6 +13,11 @@ blockstore_t::~blockstore_t()
delete impl;
}
void blockstore_t::parse_config(blockstore_config_t & config)
{
impl->parse_config(config, false);
}
void blockstore_t::loop()
{
impl->loop();

View File

@@ -107,7 +107,7 @@ Input:
- buf = pre-allocated obj_ver_id array <len> units long
Output:
- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)
- retval = 0 or negative error number (-ENOENT if no such version for stabilize)
## BS_OP_SYNC_STAB_ALL
@@ -165,6 +165,9 @@ public:
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
~blockstore_t();
// Update configuration
void parse_config(blockstore_config_t & config);
// Event loop
void loop();

View File

@@ -932,7 +932,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
resume_1:
if (!cur_sync->state)
{
if (flusher->syncing_flushers >= flusher->cur_flusher_count || !flusher->flush_queue.size())
if (flusher->syncing_flushers >= flusher->active_flushers || !flusher->flush_queue.size())
{
// Sync batch is ready. Do it.
await_sqe(0);

View File

@@ -11,7 +11,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer);
initialized = 0;
parse_config(config);
parse_config(config, true);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
try
{
@@ -171,7 +171,7 @@ void blockstore_impl_t::loop()
// Can't submit SYNC before previous writes
continue;
}
wr_st = continue_sync(op, false);
wr_st = continue_sync(op);
if (wr_st != 2)
{
has_writes = wr_st > 0 ? 1 : 2;
@@ -371,13 +371,18 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return;
}
init_op(op);
submit_queue.push_back(op);
ringloop->wakeup();
}
void blockstore_impl_t::init_op(blockstore_op_t *op)
{
// Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->wait_for = 0;
PRIV(op)->op_state = 0;
PRIV(op)->pending_ops = 0;
submit_queue.push_back(op);
ringloop->wakeup();
}
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)

View File

@@ -216,6 +216,11 @@ struct pool_shard_settings_t
uint32_t pg_stripe_size;
};
#define STAB_SPLIT_DONE 1
#define STAB_SPLIT_WAIT 2
#define STAB_SPLIT_SYNC 3
#define STAB_SPLIT_TODO 4
class blockstore_impl_t
{
blockstore_disk_t dsk;
@@ -277,7 +282,6 @@ class blockstore_impl_t
friend class journal_flusher_t;
friend class journal_flusher_co;
void parse_config(blockstore_config_t & config);
void calc_lengths();
void open_data();
void open_meta();
@@ -299,6 +303,7 @@ class blockstore_impl_t
blockstore_init_journal* journal_init_reader;
void check_wait(blockstore_op_t *op);
void init_op(blockstore_op_t *op);
// Read
int dequeue_read(blockstore_op_t *read_op);
@@ -318,7 +323,7 @@ class blockstore_impl_t
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
// Sync
int continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync);
int continue_sync(blockstore_op_t *op);
void ack_sync(blockstore_op_t *op);
// Stabilize
@@ -326,6 +331,8 @@ class blockstore_impl_t
int continue_stable(blockstore_op_t *op);
void mark_stable(const obj_ver_id & ov, bool forget_dirty = false);
void stabilize_object(object_id oid, uint64_t max_ver);
blockstore_op_t* selective_sync(blockstore_op_t *op);
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
// Rollback
int dequeue_rollback(blockstore_op_t *op);
@@ -341,6 +348,8 @@ public:
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
~blockstore_impl_t();
void parse_config(blockstore_config_t & config, bool init);
// Event loop
void loop();

View File

@@ -4,8 +4,54 @@
#include <sys/file.h>
#include "blockstore_impl.h"
void blockstore_impl_t::parse_config(blockstore_config_t & config)
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
// Online-configurable options:
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
if (!max_flusher_count)
{
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
}
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
if (!max_flusher_count)
{
max_flusher_count = 256;
}
if (!min_flusher_count || journal.flush_journal)
{
min_flusher_count = 1;
}
if (!max_write_iodepth)
{
max_write_iodepth = 128;
}
if (!throttle_target_iops)
{
throttle_target_iops = 100;
}
if (!throttle_target_mbs)
{
throttle_target_mbs = 100;
}
if (!throttle_target_parallelism)
{
throttle_target_parallelism = 1;
}
if (!throttle_threshold_us)
{
throttle_threshold_us = 50;
}
if (!init)
{
return;
}
// Offline-configurable options:
// Common disk options
dsk.parse_config(config);
// Parse
@@ -44,29 +90,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
journal.inmemory = config["inmemory_journal"] != "false";
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
if (!max_flusher_count)
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
// Validate
if (!max_flusher_count)
{
max_flusher_count = 256;
}
if (!min_flusher_count || journal.flush_journal)
{
min_flusher_count = 1;
}
if (!max_write_iodepth)
{
max_write_iodepth = 128;
}
if (journal.sector_count < 2)
{
journal.sector_count = 32;
@@ -91,22 +115,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
}
if (!throttle_target_iops)
{
throttle_target_iops = 100;
}
if (!throttle_target_mbs)
{
throttle_target_mbs = 100;
}
if (!throttle_target_parallelism)
{
throttle_target_parallelism = 1;
}
if (!throttle_threshold_us)
{
throttle_threshold_us = 50;
}
// init some fields
journal.block_size = dsk.journal_block_size;
journal.next_free = dsk.journal_block_size;

View File

@@ -9,48 +9,39 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
{
return continue_rollback(op);
}
obj_ver_id *v, *nv;
int i, todo = op->len;
for (i = 0, v = (obj_ver_id*)op->buf, nv = (obj_ver_id*)op->buf; i < op->len; i++, v++, nv++)
int r = split_stab_op(op, [this](obj_ver_id ov)
{
if (nv != v)
{
*nv = *v;
}
// Check that there are some versions greater than v->version (which may be zero),
// check that they're unstable, synced, and not currently written to
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
.oid = v->oid,
.oid = ov.oid,
.version = UINT64_MAX,
});
if (dirty_it == dirty_db.begin())
{
skip_ov:
// Already rolled back, skip this object version
todo--;
nv--;
continue;
return STAB_SPLIT_DONE;
}
else
{
dirty_it--;
if (dirty_it->first.oid != v->oid || dirty_it->first.version < v->version)
if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
{
goto skip_ov;
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
}
while (dirty_it->first.oid == v->oid && dirty_it->first.version > v->version)
while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
{
if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return 0;
return STAB_SPLIT_WAIT;
}
else if (!IS_SYNCED(dirty_it->second.state) ||
IS_STABLE(dirty_it->second.state))
{
op->retval = -EBUSY;
FINISH_OP(op);
return 2;
// Sync the object
return STAB_SPLIT_SYNC;
}
if (dirty_it == dirty_db.begin())
{
@@ -58,19 +49,16 @@ skip_ov:
}
dirty_it--;
}
return STAB_SPLIT_TODO;
}
}
op->len = todo;
if (!todo)
});
if (r != 1)
{
// Already rolled back
op->retval = 0;
FINISH_OP(op);
return 2;
return r;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, todo, sizeof(journal_entry_rollback), 0))
if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
{
return 0;
}
@@ -78,7 +66,8 @@ skip_ov:
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
int s = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
{
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
journal.sector_info[journal.cur_sector].dirty)

View File

@@ -41,60 +41,309 @@
// 4) after a while it takes his synced object list and sends stabilize requests
// to peers and to its own blockstore, thus freeing the old version
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
struct ver_vector_t
{
if (PRIV(op)->op_state)
obj_ver_id *items = NULL;
uint64_t alloc = 0, size = 0;
};
static void init_versions(ver_vector_t & vec, obj_ver_id *start, obj_ver_id *end, uint64_t len)
{
if (!vec.items)
{
return continue_stable(op);
vec.alloc = len;
vec.items = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * vec.alloc);
for (auto sv = start; sv < end; sv++)
{
vec.items[vec.size++] = *sv;
}
}
}
static void append_version(ver_vector_t & vec, obj_ver_id ov)
{
if (vec.size >= vec.alloc)
{
vec.alloc = !vec.alloc ? 4 : vec.alloc*2;
vec.items = (obj_ver_id*)realloc_or_die(vec.items, sizeof(obj_ver_id) * vec.alloc);
}
vec.items[vec.size++] = ov;
}
static bool check_unsynced(std::vector<obj_ver_id> & check, obj_ver_id ov, std::vector<obj_ver_id> & to, int *count)
{
bool found = false;
int j = 0, k = 0;
while (j < check.size())
{
if (check[j] == ov)
found = true;
if (check[j].oid == ov.oid && check[j].version <= ov.version)
{
to.push_back(check[j++]);
if (count)
(*count)--;
}
else
check[k++] = check[j++];
}
check.resize(k);
return found;
}
blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
{
unsynced_big_write_count -= unsynced_big_writes.size();
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
unsynced_big_write_count += unsynced_big_writes.size();
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
// Create a sync operation, insert into the end of the queue
// And move ourselves into the end too!
// Rather hacky but that's what we need...
blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC;
sync_op->buf = NULL;
sync_op->callback = [this](blockstore_op_t *sync_op)
{
delete sync_op;
};
init_op(sync_op);
int sync_res = continue_sync(sync_op);
if (sync_res != 2)
{
// Put SYNC into the queue if it's not finished yet
submit_queue.push_back(sync_op);
}
// Restore unsynced_writes
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
unsynced_big_write_count -= unsynced_big_writes.size();
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
unsynced_big_write_count += unsynced_big_writes.size();
if (sync_res == 2)
{
// Sync is immediately completed
return NULL;
}
return sync_op;
}
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not dequeue, 1 = proceed with op itself
int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider)
{
bool add_sync = false;
ver_vector_t good_vers, bad_vers;
obj_ver_id* v;
int i, todo = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
auto dirty_it = dirty_db.find(*v);
if (dirty_it == dirty_db.end())
int action = decider(*v);
if (action < 0)
{
auto & clean_db = clean_db_shard(v->oid);
auto clean_it = clean_db.find(v->oid);
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
// Rollback changes
for (auto & ov: PRIV(op)->sync_big_writes)
{
// No such object version
op->retval = -ENOENT;
FINISH_OP(op);
return 2;
unsynced_big_writes.push_back(ov);
unsynced_big_write_count++;
}
else
for (auto & ov: PRIV(op)->sync_small_writes)
{
// Already stable
unsynced_small_writes.push_back(ov);
}
}
else if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return 0;
}
else if (!IS_SYNCED(dirty_it->second.state))
{
// Object not synced yet. Caller must sync it first
op->retval = -EBUSY;
free(good_vers.items);
good_vers.items = NULL;
free(bad_vers.items);
bad_vers.items = NULL;
// Error
op->retval = action;
FINISH_OP(op);
return 2;
}
else if (!IS_STABLE(dirty_it->second.state))
else if (action == STAB_SPLIT_DONE)
{
// Already done
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
}
else if (action == STAB_SPLIT_WAIT)
{
// Already in progress, we just have to wait until it finishes
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
append_version(bad_vers, *v);
}
else if (action == STAB_SPLIT_SYNC)
{
// Needs a SYNC, we have to send a SYNC if not already in progress
//
// If the object is not present in unsynced_(big|small)_writes then
// it's currently being synced. If it's present then we can initiate
// its sync ourselves.
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
append_version(bad_vers, *v);
if (!add_sync)
{
PRIV(op)->sync_big_writes.clear();
PRIV(op)->sync_small_writes.clear();
add_sync = true;
}
check_unsynced(unsynced_small_writes, *v, PRIV(op)->sync_small_writes, NULL);
check_unsynced(unsynced_big_writes, *v, PRIV(op)->sync_big_writes, &unsynced_big_write_count);
}
else /* if (action == STAB_SPLIT_TODO) */
{
if (good_vers.items)
{
// If we're selecting versions then append it
// Main idea is that 99% of the time all versions passed to BS_OP_STABLE are synced
// And we don't want to select/allocate anything in that optimistic case
append_version(good_vers, *v);
}
todo++;
}
}
if (!todo)
// In a pessimistic scenario, an operation may be split into 3:
// - Stabilize synced entries
// - Sync unsynced entries
// - Continue for unsynced entries after sync
add_sync = add_sync && (PRIV(op)->sync_big_writes.size() || PRIV(op)->sync_small_writes.size());
if (!todo && !bad_vers.size)
{
// Already stable
op->retval = 0;
FINISH_OP(op);
return 2;
}
op->retval = 0;
if (!todo && !add_sync)
{
// Only wait for inflight writes or current in-progress syncs
return 0;
}
blockstore_op_t *sync_op = NULL, *split_stab_op = NULL;
if (add_sync)
{
// Initiate a selective sync for PRIV(op)->sync_(big|small)_writes
sync_op = selective_sync(op);
}
if (bad_vers.size)
{
// Split part of the request into a separate operation
split_stab_op = new blockstore_op_t;
split_stab_op->opcode = op->opcode;
split_stab_op->buf = bad_vers.items;
split_stab_op->len = bad_vers.size;
init_op(split_stab_op);
submit_queue.push_back(split_stab_op);
}
if (sync_op || split_stab_op || good_vers.items)
{
void *orig_buf = op->buf;
if (good_vers.items)
{
op->buf = good_vers.items;
op->len = good_vers.size;
}
// Make a wrapped callback
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
auto cb = [this, op, good_items = good_vers.items,
bad_items = bad_vers.items, split_op_counter,
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
{
if (split_op->retval != 0)
op->retval = split_op->retval;
(*split_op_counter)--;
assert((*split_op_counter) >= 0);
if (op != split_op)
delete split_op;
if (!*split_op_counter)
{
free(good_items);
free(bad_items);
free(split_op_counter);
op->buf = orig_buf;
real_cb(op);
}
};
if (sync_op)
{
sync_op->callback = cb;
}
if (split_stab_op)
{
split_stab_op->callback = cb;
}
op->callback = cb;
}
if (!todo)
{
// All work is postponed
op->callback = NULL;
return 2;
}
return 1;
}
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_stable(op);
}
int r = split_stab_op(op, [this](obj_ver_id ov)
{
auto dirty_it = dirty_db.find(ov);
if (dirty_it == dirty_db.end())
{
auto & clean_db = clean_db_shard(ov.oid);
auto clean_it = clean_db.find(ov.oid);
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
{
// No such object version
printf("Error: %lx:%lx v%lu not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
return -ENOENT;
}
else
{
// Already stable
return STAB_SPLIT_DONE;
}
}
else if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return STAB_SPLIT_WAIT;
}
else if (!IS_SYNCED(dirty_it->second.state))
{
// Object not synced yet - sync it
// In previous versions we returned EBUSY here and required
// the caller (OSD) to issue a global sync first. But a global sync
// waits for all writes in the queue including inflight writes. And
// inflight writes may themselves be blocked by unstable writes being
// still present in the journal and not flushed away from it.
// So we must sync specific objects here.
//
// Even more, we have to process "stabilize" request in parts. That is,
// we must stabilize all objects which are already synced. Otherwise
// they may block objects which are NOT synced yet.
return STAB_SPLIT_SYNC;
}
else if (IS_STABLE(dirty_it->second.state))
{
// Already stable
return STAB_SPLIT_DONE;
}
else
{
return STAB_SPLIT_TODO;
}
});
if (r != 1)
{
return r;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, todo, sizeof(journal_entry_stable), 0))
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
{
return 0;
}
@@ -102,9 +351,9 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
int s = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
{
// FIXME: Only stabilize versions that aren't stable yet
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
journal.sector_info[journal.cur_sector].dirty)
{

View File

@@ -12,7 +12,7 @@
#define SYNC_JOURNAL_SYNC_SENT 7
#define SYNC_DONE 8
int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_progress_sync)
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
{
if (immediate_commit == IMMEDIATE_ALL)
{
@@ -145,7 +145,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
PRIV(op)->op_state = SYNC_DONE;
}
}
if (PRIV(op)->op_state == SYNC_DONE && !queue_has_in_progress_sync)
if (PRIV(op)->op_state == SYNC_DONE)
{
ack_sync(op);
return 2;

View File

@@ -278,7 +278,7 @@ struct rm_osd_t
if (rsp["response_delete_range"]["deleted"].uint64_value() > 0)
{
// Wait for mon_change_timeout before updating PG history, or the monitor's change will likely interfere with ours
retry_wait = parent->cli->merged_config["mon_change_timeout"].uint64_value();
retry_wait = parent->cli->config["mon_change_timeout"].uint64_value();
if (!retry_wait)
retry_wait = 1000;
retry_wait += etcd_tx_retry_ms;

View File

@@ -198,9 +198,9 @@ resume_2:
}
pgs_by_state_str += std::to_string(kv.second)+" "+kv.first;
}
bool readonly = json_is_true(parent->cli->merged_config["readonly"]);
bool no_recovery = json_is_true(parent->cli->merged_config["no_recovery"]);
bool no_rebalance = json_is_true(parent->cli->merged_config["no_rebalance"]);
bool readonly = json_is_true(parent->cli->config["readonly"]);
bool no_recovery = json_is_true(parent->cli->config["no_recovery"]);
bool no_rebalance = json_is_true(parent->cli->config["no_rebalance"]);
if (parent->json_output)
{
// JSON output

View File

@@ -18,11 +18,12 @@
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
{
config = osd_messenger_t::read_config(config);
cli_config = config.object_items();
file_config = osd_messenger_t::read_config(config);
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
this->ringloop = ringloop;
this->tfd = tfd;
this->config = config;
msgr.osd_num = 0;
msgr.tfd = tfd;
@@ -58,7 +59,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
msgr.stop_client(op->peer_fd);
delete op;
};
msgr.parse_config(this->config);
msgr.parse_config(config);
st_cli.tfd = tfd;
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
@@ -276,13 +277,10 @@ restart:
continuing_ops = 0;
}
void cluster_client_t::on_load_config_hook(json11::Json::object & config)
void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_config)
{
this->merged_config = config;
for (auto & kv: this->config.object_items())
{
this->merged_config[kv.first] = kv.second;
}
this->etcd_global_config = etcd_global_config;
config = osd_messenger_t::merge_configs(cli_config, file_config, etcd_global_config, {});
if (config.find("client_max_dirty_bytes") != config.end())
{
client_max_dirty_bytes = config["client_max_dirty_bytes"].uint64_value();
@@ -292,14 +290,13 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
// Old name
client_max_dirty_bytes = config["client_dirty_limit"].uint64_value();
}
if (config.find("client_max_dirty_ops") != config.end())
{
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
}
else
client_max_dirty_bytes = 0;
if (!client_max_dirty_bytes)
{
client_max_dirty_bytes = DEFAULT_CLIENT_MAX_DIRTY_BYTES;
}
client_max_dirty_ops = config["client_max_dirty_ops"].uint64_value();
if (!client_max_dirty_ops)
{
client_max_dirty_ops = DEFAULT_CLIENT_MAX_DIRTY_OPS;
@@ -314,7 +311,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
up_wait_retry_interval = 50;
}
msgr.parse_config(config);
msgr.parse_config(this->config);
st_cli.parse_config(config);
st_cli.load_pgs();
}
@@ -1121,6 +1118,24 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
if (part->op.reply.hdr.retval != expected)
{
// Operation failed, retry
part->flags |= PART_ERROR;
if (!op->retval || op->retval == -EPIPE)
{
// Don't overwrite other errors with -EPIPE
op->retval = part->op.reply.hdr.retval;
}
int stop_fd = -1;
if (op->retval != -EINTR && op->retval != -EIO)
{
stop_fd = part->op.peer_fd;
fprintf(
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
}
// All next things like timer, continue_sync/rw and stop_client may affect the operation again
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
// FIXME postpone such things to set_immediate here to avoid bugs
if (part->op.reply.hdr.retval == -EPIPE)
{
// Mark op->up_wait = true before stopping the client
@@ -1134,20 +1149,17 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
});
}
}
if (!op->retval || op->retval == -EPIPE)
if (op->inflight_count == 0)
{
// Don't overwrite other errors with -EPIPE
op->retval = part->op.reply.hdr.retval;
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);
else
continue_rw(op);
}
if (op->retval != -EINTR && op->retval != -EIO)
if (stop_fd >= 0)
{
fprintf(
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d), dropping connection\n",
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
msgr.stop_client(part->op.peer_fd);
msgr.stop_client(stop_fd);
}
part->flags |= PART_ERROR;
}
else
{
@@ -1161,13 +1173,13 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
copy_part_bitmap(op, part);
op->version = op->parts.size() == 1 ? part->op.reply.rw.version : 0;
}
}
if (op->inflight_count == 0)
{
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);
else
continue_rw(op);
if (op->inflight_count == 0)
{
if (op->opcode == OSD_OP_SYNC)
continue_sync(op);
else
continue_rw(op);
}
}
}

View File

@@ -112,8 +112,8 @@ public:
osd_messenger_t msgr;
void init_msgr();
json11::Json config;
json11::Json::object merged_config;
json11::Json::object cli_config, file_config, etcd_global_config;
json11::Json::object config;
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
~cluster_client_t();

View File

@@ -43,6 +43,7 @@ struct inode_list_t
inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
std::function<void(inode_list_t* lst, std::set<object_id>&& objects, pg_num_t pg_num, osd_num_t primary_osd, int status)> callback)
{
init_msgr();
int skipped_pgs = 0;
pool_id_t pool_id = INODE_POOL(inode);
if (!pool_id || st_cli.pool_config.find(pool_id) == st_cli.pool_config.end())

View File

@@ -18,12 +18,8 @@ etcd_state_client_t::~etcd_state_client_t()
}
watches.clear();
etcd_watches_initialised = -1;
if (ws_keepalive_timer >= 0)
{
tfd->clear_timer(ws_keepalive_timer);
ws_keepalive_timer = -1;
}
#ifndef __MOCK__
stop_ws_keepalive();
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
@@ -245,6 +241,7 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
if (this->etcd_keepalive_timeout < 30)
this->etcd_keepalive_timeout = 30;
}
auto old_etcd_ws_keepalive_interval = this->etcd_ws_keepalive_interval;
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
if (this->etcd_ws_keepalive_interval <= 0)
{
@@ -265,6 +262,13 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
{
this->etcd_quick_timeout = 1000;
}
if (this->etcd_ws_keepalive_interval != old_etcd_ws_keepalive_interval && ws_keepalive_timer >= 0)
{
#ifndef __MOCK__
stop_ws_keepalive();
start_ws_keepalive();
#endif
}
}
void etcd_state_client_t::pick_next_etcd()
@@ -478,6 +482,20 @@ void etcd_state_client_t::start_etcd_watcher()
{
on_start_watcher_hook(etcd_watch_ws);
}
start_ws_keepalive();
}
void etcd_state_client_t::stop_ws_keepalive()
{
if (ws_keepalive_timer >= 0)
{
tfd->clear_timer(ws_keepalive_timer);
ws_keepalive_timer = -1;
}
}
void etcd_state_client_t::start_ws_keepalive()
{
if (ws_keepalive_timer < 0)
{
ws_keepalive_timer = tfd->set_timer(etcd_ws_keepalive_interval*1000, true, [this](int)

View File

@@ -132,6 +132,8 @@ public:
void etcd_txn(json11::Json txn, int timeout, int retries, int interval, std::function<void(std::string, json11::Json)> callback);
void etcd_txn_slow(json11::Json txn, std::function<void(std::string, json11::Json)> callback);
void start_etcd_watcher();
void stop_ws_keepalive();
void start_ws_keepalive();
void load_global_config();
void load_pgs();
void parse_state(const etcd_kv_t & kv);

View File

@@ -534,8 +534,9 @@ bool osd_messenger_t::is_rdma_enabled()
}
#endif
json11::Json osd_messenger_t::read_config(const json11::Json & config)
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
{
json11::Json::object file_config;
const char *config_path = config["config_path"].string_value() != ""
? config["config_path"].string_value().c_str() : VITASTOR_CONFIG_PATH;
int fd = open(config_path, O_RDONLY);
@@ -543,14 +544,14 @@ json11::Json osd_messenger_t::read_config(const json11::Json & config)
{
if (errno != ENOENT)
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
return config;
return file_config;
}
struct stat st;
if (fstat(fd, &st) != 0)
{
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
close(fd);
return config;
return file_config;
}
std::string buf;
buf.resize(st.st_size);
@@ -562,23 +563,125 @@ json11::Json osd_messenger_t::read_config(const json11::Json & config)
{
fprintf(stderr, "Error reading %s: %s\n", config_path, strerror(errno));
close(fd);
return config;
return file_config;
}
done += r;
}
close(fd);
std::string json_err;
json11::Json::object file_config = json11::Json::parse(buf, json_err).object_items();
file_config = json11::Json::parse(buf, json_err).object_items();
if (json_err != "")
{
fprintf(stderr, "Invalid JSON in %s: %s\n", config_path, json_err.c_str());
return config;
}
file_config.erase("config_path");
file_config.erase("osd_num");
for (auto kv: config.object_items())
{
file_config[kv.first] = kv.second;
}
return file_config;
}
static const char* cli_only_params[] = {
// The list has to be sorted
"bitmap_granularity",
"block_size",
"data_device",
"data_offset",
"data_size",
"disable_data_fsync",
"disable_device_lock",
"disable_journal_fsync",
"disable_meta_fsync",
"disk_alignment",
"flush_journal",
"immediate_commit",
"inmemory_journal",
"inmemory_metadata",
"journal_block_size",
"journal_device",
"journal_no_same_sector_overwrites",
"journal_offset",
"journal_sector_buffer_count",
"journal_size",
"meta_block_size",
"meta_buf_size",
"meta_device",
"meta_offset",
"osd_num",
"readonly",
};
static const char **cli_only_end = cli_only_params + (sizeof(cli_only_params)/sizeof(cli_only_params[0]));
static const char* local_only_params[] = {
// The list has to be sorted
"config_path",
"rdma_device",
"rdma_gid_index",
"rdma_max_msg",
"rdma_max_recv",
"rdma_max_send",
"rdma_max_sge",
"rdma_mtu",
"rdma_port_num",
"tcp_header_buffer_size",
"use_rdma",
"use_sync_send_recv",
};
static const char **local_only_end = local_only_params + (sizeof(local_only_params)/sizeof(local_only_params[0]));
// Basically could be replaced by std::lower_bound()...
static int find_str_array(const char **start, const char **end, const std::string & s)
{
int min = 0, max = end-start;
while (max-min >= 2)
{
int mid = (min+max)/2;
int r = strcmp(s.c_str(), start[mid]);
if (r < 0)
max = mid;
else if (r > 0)
min = mid;
else
return mid;
}
if (min < end-start && !strcmp(s.c_str(), start[min]))
return min;
return -1;
}
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
const json11::Json::object & file_config,
const json11::Json::object & etcd_global_config,
const json11::Json::object & etcd_osd_config)
{
// Priority: most important -> less important:
// etcd_osd_config -> cli_config -> etcd_global_config -> file_config
json11::Json::object res = file_config;
for (auto & kv: file_config)
{
int cli_only = find_str_array(cli_only_params, cli_only_end, kv.first);
if (cli_only < 0)
{
res[kv.first] = kv.second;
}
}
for (auto & kv: etcd_global_config)
{
int local_only = find_str_array(local_only_params, local_only_end, kv.first);
if (local_only < 0)
{
res[kv.first] = kv.second;
}
}
for (auto & kv: cli_config)
{
res[kv.first] = kv.second;
}
for (auto & kv: etcd_osd_config)
{
int local_only = find_str_array(local_only_params, local_only_end, kv.first);
if (local_only < 0)
{
res[kv.first] = kv.second;
}
}
return res;
}

View File

@@ -166,7 +166,11 @@ public:
void accept_connections(int listen_fd);
~osd_messenger_t();
static json11::Json read_config(const json11::Json & config);
static json11::Json::object read_config(const json11::Json & config);
static json11::Json::object merge_configs(const json11::Json::object & cli_config,
const json11::Json::object & file_config,
const json11::Json::object & etcd_global_config,
const json11::Json::object & etcd_osd_config);
#ifdef WITH_RDMA
bool is_rdma_enabled();

View File

@@ -43,7 +43,15 @@ void osd_messenger_t::send_replies()
{
}
json11::Json osd_messenger_t::read_config(const json11::Json & config)
json11::Json::object osd_messenger_t::read_config(const json11::Json & config)
{
return config;
return json11::Json::object();
}
json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object & cli_config,
const json11::Json::object & file_config,
const json11::Json::object & etcd_global_config,
const json11::Json::object & etcd_osd_config)
{
return cli_config;
}

View File

@@ -353,10 +353,8 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
.wr_id = (uint64_t)(cl->peer_fd*2+1),
.sg_list = sge,
.num_sge = op_sge,
.opcode = cl->rdma_conn->avail_recv > 0 ? IBV_WR_SEND_WITH_IMM : IBV_WR_SEND,
.opcode = IBV_WR_SEND,
.send_flags = IBV_SEND_SIGNALED,
// Notify peer about our available incoming buffers
.imm_data = (uint32_t)cl->rdma_conn->avail_recv,
};
int err = ibv_post_send(cl->rdma_conn->qp, &wr, &bad_wr);
if (err || bad_wr)
@@ -365,27 +363,15 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
exit(1);
}
cl->rdma_conn->cur_send++;
cl->rdma_conn->avail_send--;
cl->rdma_conn->avail_recv = 0;
}
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (rc->cur_send >= rc->max_send)
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
{
return true;
}
if (!cl->send_list.size() || rc->use_flow_control && rc->avail_send <= 0)
{
if (rc->avail_recv)
{
// Only notify about available buffers so 2 peers don't lock each other
rc->send_sizes.push_back(0);
try_send_rdma_wr(cl, NULL, 0);
}
return true;
}
uint64_t op_size = 0, op_sge = 0;
ibv_sge sge[rc->max_sge];
while (rc->send_pos < cl->send_list.size())
@@ -445,7 +431,6 @@ static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
exit(1);
}
cl->rdma_conn->cur_recv++;
cl->rdma_conn->avail_recv++;
}
bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
@@ -507,11 +492,6 @@ void osd_messenger_t::handle_rdma_events()
if (!is_send)
{
rc->cur_recv--;
if ((wc[i].wc_flags & IBV_WC_WITH_IMM) && wc[i].imm_data > 0)
{
rc->avail_send += wc[i].imm_data;
rc->use_flow_control = true;
}
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
{
// handle_read_buffer may stop the client
@@ -519,7 +499,6 @@ void osd_messenger_t::handle_rdma_events()
}
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
rc->next_recv_buf = (rc->next_recv_buf+1) % rc->recv_buffers.size();
try_send_rdma(cl);
}
else
{

View File

@@ -45,8 +45,7 @@ struct msgr_rdma_connection_t
ibv_qp *qp = NULL;
msgr_rdma_address_t addr;
int max_send = 0, max_recv = 0, max_sge = 0;
int cur_send = 0, cur_recv = 0, avail_recv = 0, avail_send = 0;
bool use_flow_control = false;
int cur_send = 0, cur_recv = 0;
uint64_t max_msg = 0;
int send_pos = 0, send_buf_pos = 0;

View File

@@ -313,17 +313,18 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
stop_client(cl->peer_fd);
return false;
}
if (op->reply.hdr.retval >= 0 && bmp_len > 0)
if (bmp_len > 0)
{
assert(op->bitmap);
cl->recv_list.push_back(op->bitmap, bmp_len);
cl->read_remaining += bmp_len;
}
if (op->reply.hdr.retval > 0)
{
assert(op->iov.count > 0);
cl->recv_list.append(op->iov);
cl->read_remaining += op->reply.hdr.retval;
}
cl->read_remaining = op->reply.hdr.retval + bmp_len;
if (cl->read_remaining == 0)
{
goto reuse;

View File

@@ -39,6 +39,11 @@ struct __attribute__((__packed__)) obj_ver_id
uint64_t version;
};
inline bool operator == (const obj_ver_id & a, const obj_ver_id & b)
{
return a.oid == b.oid && a.version == b.version;
}
inline bool operator < (const obj_ver_id & a, const obj_ver_id & b)
{
return a.oid < b.oid || a.oid == b.oid && a.version < b.version;

View File

@@ -35,10 +35,9 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
this->ringloop = ringloop;
this->config = msgr.read_config(config).object_items();
if (this->config.find("log_level") == this->config.end())
this->config["log_level"] = 1;
parse_config(this->config, true);
this->cli_config = config.object_items();
this->file_config = msgr.read_config(this->cli_config);
parse_config(true);
epmgr = new epoll_manager_t(ringloop);
// FIXME: Use timerfd_interval based directly on io_uring
@@ -68,11 +67,11 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
}
}
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{
print_stats();
});
this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
{
print_slow();
});
@@ -92,6 +91,21 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
osd_t::~osd_t()
{
if (slow_log_timer_id >= 0)
{
tfd->clear_timer(slow_log_timer_id);
slow_log_timer_id = -1;
}
if (print_stats_timer_id >= 0)
{
tfd->clear_timer(print_stats_timer_id);
print_stats_timer_id = -1;
}
if (autosync_timer_id >= 0)
{
tfd->clear_timer(autosync_timer_id);
autosync_timer_id = -1;
}
ringloop->unregister_consumer(&consumer);
delete epmgr;
if (bs)
@@ -100,11 +114,19 @@ osd_t::~osd_t()
free(zero_buffer);
}
void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
void osd_t::parse_config(bool init)
{
config = msgr.merge_configs(cli_config, file_config, etcd_global_config, etcd_osd_config);
if (config.find("log_level") == this->config.end())
config["log_level"] = 1;
if (bs)
{
auto bs_cfg = json_to_bs(config);
bs->parse_config(bs_cfg);
}
st_cli.parse_config(config);
msgr.parse_config(config);
if (allow_disk_params)
if (init)
{
// OSD number
osd_num = config["osd_num"].uint64_value();
@@ -126,24 +148,27 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
immediate_commit = IMMEDIATE_SMALL;
else
immediate_commit = IMMEDIATE_NONE;
// Bind address
bind_address = config["bind_address"].string_value();
if (bind_address == "")
bind_address = "0.0.0.0";
bind_port = config["bind_port"].uint64_value();
if (bind_port <= 0 || bind_port > 65535)
bind_port = 0;
// OSD configuration
etcd_report_interval = config["etcd_report_interval"].uint64_value();
if (etcd_report_interval <= 0)
etcd_report_interval = 5;
readonly = json_is_true(config["readonly"]);
run_primary = !json_is_false(config["run_primary"]);
allow_test_ops = json_is_true(config["allow_test_ops"]);
}
// Bind address
bind_address = config["bind_address"].string_value();
if (bind_address == "")
bind_address = "0.0.0.0";
bind_port = config["bind_port"].uint64_value();
if (bind_port <= 0 || bind_port > 65535)
bind_port = 0;
// OSD configuration
log_level = config["log_level"].uint64_value();
etcd_report_interval = config["etcd_report_interval"].uint64_value();
if (etcd_report_interval <= 0)
etcd_report_interval = 5;
readonly = json_is_true(config["readonly"]);
run_primary = !json_is_false(config["run_primary"]);
auto old_no_rebalance = no_rebalance;
no_rebalance = json_is_true(config["no_rebalance"]);
auto old_no_recovery = no_recovery;
no_recovery = json_is_true(config["no_recovery"]);
allow_test_ops = json_is_true(config["allow_test_ops"]);
auto old_autosync_interval = autosync_interval;
if (!config["autosync_interval"].is_null())
{
// Allow to set it to 0
@@ -171,15 +196,46 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
auto old_print_stats_interval = print_stats_interval;
print_stats_interval = config["print_stats_interval"].uint64_value();
if (!print_stats_interval)
print_stats_interval = 3;
auto old_slow_log_interval = slow_log_interval;
slow_log_interval = config["slow_log_interval"].uint64_value();
if (!slow_log_interval)
slow_log_interval = 10;
inode_vanish_time = config["inode_vanish_time"].uint64_value();
if (!inode_vanish_time)
inode_vanish_time = 60;
if ((old_no_rebalance && !no_rebalance || old_no_recovery && !no_recovery) &&
!(peering_state & (OSD_RECOVERING | OSD_FLUSHING_PGS)))
{
peering_state = peering_state | OSD_RECOVERING;
}
if (old_autosync_interval != autosync_interval && autosync_timer_id >= 0)
{
this->tfd->clear_timer(autosync_timer_id);
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
{
autosync();
});
}
if (old_print_stats_interval != print_stats_interval && print_stats_timer_id >= 0)
{
tfd->clear_timer(print_stats_timer_id);
print_stats_timer_id = this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{
print_stats();
});
}
if (old_slow_log_interval != slow_log_interval && slow_log_timer_id >= 0)
{
tfd->clear_timer(slow_log_timer_id);
slow_log_timer_id = this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
{
print_slow();
});
}
}
void osd_t::bind_socket()
@@ -404,7 +460,7 @@ void osd_t::print_slow()
int l = sizeof(alloc), n;
char *buf = alloc;
#define bufprintf(s, ...) { n = snprintf(buf, l, s, __VA_ARGS__); n = n < 0 ? 0 : n; buf += n; l -= n; }
bufprintf("[OSD %lu] Slow op", osd_num);
bufprintf("[OSD %lu] Slow op %lx", osd_num, (unsigned long)op);
if (kv.second->osd_num)
{
bufprintf(" from peer OSD %lu (client %d)", kv.second->osd_num, kv.second->peer_fd);

View File

@@ -90,7 +90,7 @@ class osd_t
{
// config
json11::Json::object config;
json11::Json::object cli_config, file_config, etcd_global_config, etcd_osd_config, config;
int etcd_report_interval = 5;
bool readonly = false;
@@ -126,6 +126,7 @@ class osd_t
bool pg_config_applied = false;
bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false;
int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1;
// peers and PGs
@@ -173,7 +174,7 @@ class osd_t
uint64_t recovery_stat_bytes[2][2] = {};
// cluster connection
void parse_config(const json11::Json & config, bool allow_disk_params);
void parse_config(bool init);
void init_cluster();
void on_change_osd_state_hook(osd_num_t peer_osd);
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);

View File

@@ -75,7 +75,7 @@ void osd_t::init_cluster()
}
if (run_primary && autosync_interval > 0)
{
this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
autosync_timer_id = this->tfd->set_timer(autosync_interval*1000, true, [this](int timer_id)
{
autosync();
});
@@ -375,7 +375,11 @@ void osd_t::on_change_osd_state_hook(osd_num_t peer_osd)
void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes)
{
// FIXME apply config changes in runtime (maybe, some)
if (changes.find(st_cli.etcd_prefix+"/config/global") != changes.end())
{
etcd_global_config = changes[st_cli.etcd_prefix+"/config/global"].value.object_items();
parse_config(false);
}
if (run_primary)
{
apply_pg_count();
@@ -385,11 +389,8 @@ void osd_t::on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes
void osd_t::on_load_config_hook(json11::Json::object & global_config)
{
json11::Json::object osd_config = this->config;
for (auto & kv: global_config)
if (osd_config.find(kv.first) == osd_config.end())
osd_config[kv.first] = kv.second;
parse_config(osd_config, false);
etcd_global_config = global_config;
parse_config(true);
bind_socket();
acquire_lease();
}
@@ -735,7 +736,7 @@ void osd_t::apply_pg_config()
.pg_cursize = 0,
.pg_size = pool_item.second.pg_size,
.pg_minsize = pool_item.second.pg_minsize,
.pg_data_size = pg.scheme == POOL_SCHEME_REPLICATED
.pg_data_size = pool_item.second.scheme == POOL_SCHEME_REPLICATED
? 1 : pool_item.second.pg_size - pool_item.second.parity_chunks,
.pool_id = pool_id,
.pg_num = pg_num,

View File

@@ -64,6 +64,11 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval)
{
if (log_level > 2)
{
printf("[PG %u/%u] flush batch %lx completed on OSD %lu with result %d\n",
pool_id, pg_num, (uint64_t)fb, peer_osd, retval);
}
pool_pg_num_t pg_id = { .pool_id = pool_id, .pg_num = pg_num };
if (pgs.find(pg_id) == pgs.end() || pgs[pg_id].flush_batch != fb)
{
@@ -99,10 +104,9 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
std::vector<osd_op_t*> continue_ops;
auto & pg = pgs.at(pg_id);
auto it = pg.flush_actions.begin(), prev_it = it;
auto erase_start = it;
while (1)
{
if (it == pg.flush_actions.end() ||
if (it == pg.flush_actions.end() || !it->second.submitted ||
it->first.oid.inode != prev_it->first.oid.inode ||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
{
@@ -116,29 +120,23 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
});
if (wr_it != pg.write_queue.end())
{
if (log_level > 2)
{
printf("[PG %u/%u] continuing write %lx to object %lx:%lx after flush\n",
pool_id, pg_num, (uint64_t)wr_it->second, wr_it->first.inode, wr_it->first.stripe);
}
continue_ops.push_back(wr_it->second);
pg.write_queue.erase(wr_it);
}
}
if ((it == pg.flush_actions.end() || !it->second.submitted) &&
erase_start != it)
{
pg.flush_actions.erase(erase_start, it);
}
if (it == pg.flush_actions.end())
if (it == pg.flush_actions.end() || !it->second.submitted)
{
if (it != pg.flush_actions.begin())
{
pg.flush_actions.erase(pg.flush_actions.begin(), it);
}
break;
}
prev_it = it;
if (!it->second.submitted)
{
it++;
erase_start = it;
}
else
{
it++;
}
prev_it = it++;
}
delete fb;
pg.flush_batch = NULL;
@@ -168,6 +166,18 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
// Copy buffer so it gets freed along with the operation
op->buf = malloc_or_die(sizeof(obj_ver_id) * count);
memcpy(op->buf, data, sizeof(obj_ver_id) * count);
if (log_level > 2)
{
printf(
"[PG %u/%u] flush batch %lx on OSD %lu: %s objects: ",
pool_id, pg_num, (uint64_t)fb, peer_osd, rollback ? "rollback" : "stabilize"
);
for (int i = 0; i < count; i++)
{
printf(i > 0 ? ", %lx:%lx v%lu" : "%lx:%lx v%lu", data[i].oid.inode, data[i].oid.stripe, data[i].version);
}
printf("\n");
}
if (peer_osd == this->osd_num)
{
// local
@@ -304,9 +314,10 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{
// PG is stopped or one of the OSDs is gone, error is harmless
printf(
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
"[PG %u/%u] Recovery operation failed with object %lx:%lx\n",
INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size),
op->oid.inode, op->oid.stripe
);
}
else

View File

@@ -76,7 +76,7 @@ void osd_t::handle_peers()
peering_state = peering_state & ~OSD_FLUSHING_PGS | OSD_RECOVERING;
}
}
if ((peering_state & OSD_RECOVERING) && !readonly)
if (!(peering_state & OSD_FLUSHING_PGS) && (peering_state & OSD_RECOVERING) && !readonly)
{
if (!continue_recovery())
{
@@ -483,6 +483,10 @@ void osd_t::report_pg_state(pg_t & pg)
pg.all_peers = pg.target_set;
std::sort(pg.all_peers.begin(), pg.all_peers.end());
pg.cur_peers = pg.target_set;
// Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
pg_cfg.target_history = pg.target_history;
pg_cfg.all_peers = pg.all_peers;
}
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
{
@@ -522,6 +526,9 @@ void osd_t::report_pg_state(pg_t & pg)
pg.cur_peers.push_back(pg_osd);
}
}
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
pg_cfg.target_history = pg.target_history;
pg_cfg.all_peers = pg.all_peers;
}
if (pg.state == PG_OFFLINE && !this->pg_config_applied)
{

View File

@@ -91,7 +91,7 @@ void pg_obj_state_check_t::walk()
pg->state |= PG_DEGRADED;
}
pg->state |= PG_ACTIVE;
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
if (pg->cur_peers.size() < pg->all_peers.size())
{
pg->state |= PG_LEFT_ON_DEAD;
}

View File

@@ -219,7 +219,7 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg)
op_data->n_subops++;
}
}
if (op_data->n_subops)
if (op_data->n_subops > 0)
{
op_data->fact_ver = 0;
op_data->done = op_data->errors = 0;

View File

@@ -472,7 +472,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
osd_primary_op_data_t *op_data = cur_op->op_data;
op_data->n_subops = chunks_to_delete_count;
op_data->done = op_data->errors = op_data->errcode = 0;
if (!op_data->n_subops)
if (op_data->n_subops <= 0)
{
return;
}

View File

@@ -166,7 +166,7 @@ resume_6:
for (int i = 0; i < unstable_osd.len; i++)
{
// Except those from peered PGs
auto & w = op_data->unstable_writes[i];
auto & w = op_data->unstable_writes[unstable_osd.start + i];
pool_pg_num_t wpg = {
.pool_id = INODE_POOL(w.oid.inode),
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),

View File

@@ -12,6 +12,7 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
.oid = op_data->oid,
.osd_num = 0,
});
op_data->st = 1;
if (act_it != pg.flush_actions.end() &&
act_it->first.oid.inode == op_data->oid.inode &&
(act_it->first.oid.stripe & ~STRIPE_MASK) == op_data->oid.stripe)
@@ -23,7 +24,6 @@ bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
auto vo_it = pg.write_queue.find(op_data->oid);
if (vo_it != pg.write_queue.end())
{
op_data->st = 1;
pg.write_queue.emplace(op_data->oid, cur_op);
return false;
}

View File

@@ -142,11 +142,11 @@ inline bool operator < (const reed_sol_erased_t &a, const reed_sol_erased_t &b)
for (int i = 0; i < a.size && i < b.size; i++)
{
if (a.data[i] < b.data[i])
return -1;
return true;
else if (a.data[i] > b.data[i])
return 1;
return false;
}
return 0;
return false;
}
struct reed_sol_matrix_t
@@ -677,11 +677,11 @@ void* calc_rmw(void *request_buf, osd_rmw_stripe_t *stripes, uint64_t *read_osd_
static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, uint32_t wr_end, buf_len_t *bufs, int & nbufs)
{
uint32_t ns = 0, ne = 0, os = 0, oe = 0;
if (stripe.req_end > wr_start &&
stripe.req_start < wr_end)
if (stripe.write_end > wr_start &&
stripe.write_start < wr_end)
{
ns = std::max(stripe.req_start, wr_start);
ne = std::min(stripe.req_end, wr_end);
ns = std::max(stripe.write_start, wr_start);
ne = std::min(stripe.write_end, wr_end);
}
if (stripe.read_end > wr_start &&
stripe.read_start < wr_end)
@@ -692,7 +692,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
if (ne && (!oe || ns <= os))
{
// NEW or NEW->OLD
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
if (os < ne)
os = ne;
if (oe > os)
@@ -708,7 +708,7 @@ static void get_old_new_buffers(osd_rmw_stripe_t & stripe, uint32_t wr_start, ui
{
// OLD->NEW or OLD->NEW->OLD
bufs[nbufs++] = { .buf = (uint8_t*)stripe.read_buf + os - stripe.read_start, .len = ns-os };
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.req_start, .len = ne-ns };
bufs[nbufs++] = { .buf = (uint8_t*)stripe.write_buf + ns - stripe.write_start, .len = ne-ns };
if (oe > ne)
{
// OLD->NEW->OLD

View File

@@ -17,6 +17,7 @@ void test4();
void test5();
void test6();
void test7();
void test_rmw_4k_degraded_into_lost_to_normal(bool ec);
void test8();
void test9();
void test10();
@@ -39,6 +40,8 @@ int main(int narg, char *args[])
test6();
// Test 7
test7();
test_rmw_4k_degraded_into_lost_to_normal(false);
test_rmw_4k_degraded_into_lost_to_normal(true);
// Test 8
test8();
// Test 9
@@ -316,6 +319,69 @@ void test7()
/***
7/2. calc_rmw(offset=48K, len=4K, osd_set=[0,2,3], write_set=[1,2,3])
= {
read: [ [ 0, 128K ], [ 0, 128K ], [ 0, 128K ] ],
write: [ [ 48K, 52K ], [ 0, 0 ], [ 48K, 52K ] ],
input buffer: [ write0 ],
rmw buffer: [ write2, read0, read1, read2 ],
}
then, after calc_rmw_parity_xor/ec(): {
write: [ [ 0, 128K ], [ 0, 0 ], [ 48K, 52K ] ],
write0==read0,
}
+ check write0, write2 buffers
***/
void test_rmw_4k_degraded_into_lost_to_normal(bool ec)
{
osd_num_t osd_set[3] = { 0, 2, 3 };
osd_num_t write_osd_set[3] = { 1, 2, 3 };
osd_rmw_stripe_t stripes[3] = {};
// Subtest 1
split_stripes(2, 128*1024, 48*1024, 4096, stripes);
void *write_buf = malloc(4096);
void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 3, 2, 3, write_osd_set, 128*1024, 0);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
assert(stripes[0].write_start == 48*1024 && stripes[0].write_end == 52*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024);
assert(stripes[1].read_buf == (uint8_t*)rmw_buf+4*1024+128*1024);
assert(stripes[2].read_buf == (uint8_t*)rmw_buf+4*1024+2*128*1024);
assert(stripes[0].write_buf == write_buf);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == rmw_buf);
// Subtest 2
set_pattern(write_buf, 4096, PATTERN2);
set_pattern(stripes[1].read_buf, 128*1024, PATTERN1);
set_pattern(stripes[2].read_buf, 128*1024, PATTERN0^PATTERN1);
if (!ec)
calc_rmw_parity_xor(stripes, 3, osd_set, write_osd_set, 128*1024, 0);
else
{
use_ec(3, 2, true);
calc_rmw_parity_ec(stripes, 3, 2, osd_set, write_osd_set, 128*1024, 0);
use_ec(3, 2, false);
}
assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024);
assert(stripes[1].write_start == 0 && stripes[1].write_end == 0);
assert(stripes[2].write_start == 48*1024 && stripes[2].write_end == 52*1024);
assert(stripes[0].write_buf == stripes[0].read_buf);
assert(stripes[1].write_buf == NULL);
assert(stripes[2].write_buf == rmw_buf);
check_pattern(stripes[0].write_buf, 4096, PATTERN0);
check_pattern(stripes[0].write_buf+48*1024, 4096, PATTERN2);
check_pattern(stripes[2].write_buf, 4096, PATTERN2^PATTERN1); // new parity
free(rmw_buf);
free(write_buf);
}
/***
8. calc_rmw(offset=0, len=128K+4K, osd_set=[0,2,3], write_set=[1,2,3])
= {
read: [ [ 0, 0 ], [ 4K, 128K ], [ 0, 0 ] ],

View File

@@ -9,6 +9,9 @@
#endif
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#if QEMU_VERSION_MAJOR >= 8
#include "block/block-io.h"
#endif
#include "block/block_int.h"
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
@@ -268,7 +271,13 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
}
else
{
#if QEMU_VERSION_MAJOR >= 8
aio_co_enter(bdrv_get_aio_context(bs), qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
#elif QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
#else
qemu_coroutine_enter(qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
#endif
BDRV_POLL_WHILE(bs, !task.complete);
}
client->image = image;
@@ -732,8 +741,13 @@ static BlockDriver bdrv_vitastor = {
.bdrv_parse_filename = vitastor_parse_filename,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
#if QEMU_VERSION_MAJOR >= 8
.bdrv_co_get_info = vitastor_get_info,
.bdrv_co_getlength = vitastor_getlength,
#else
.bdrv_get_info = vitastor_get_info,
.bdrv_getlength = vitastor_getlength,
#endif
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
.bdrv_probe_blocksizes = vitastor_probe_blocksizes,
#endif

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 0.8.6
Version: 0.8.8
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -64,4 +64,4 @@ echo leak:librbd >> testdata/lsan-suppress.txt
echo leak:_M_mutate >> testdata/lsan-suppress.txt
echo leak:_M_assign >> testdata/lsan-suppress.txt
export LSAN_OPTIONS=report_objects=true:suppressions=`pwd`/testdata/lsan-suppress.txt
export ASAN_OPTIONS=verify_asan_link_order=false
export ASAN_OPTIONS=verify_asan_link_order=false:abort_on_error=1

View File

@@ -17,17 +17,17 @@ else
fi
if [ "$IMMEDIATE_COMMIT" != "" ]; then
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 1"
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all"}'
else
NO_SAME="--journal_sector_buffer_count 1024 --log_level 1"
NO_SAME="--journal_sector_buffer_count 1024 --log_level 10"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1}'
fi
start_osd()
{
local i=$1
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$!
}

View File

@@ -43,3 +43,6 @@ SCHEME=ec ./test_snapshot.sh
SCHEME=xor ./test_write.sh
./test_write_no_same.sh
./test_heal.sh
SCHEME=ec PG_MINSIZE=2 ./test_heal.sh

View File

@@ -28,9 +28,7 @@ if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only |\
format_error "FAILED: OSD NOT ADDED INTO DISTRIBUTION"
fi
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"]) ] | length) == '$PG_COUNT''); then
format_error "FAILED: $PG_COUNT PGS NOT ACTIVE"
fi
wait_finish_rebalance 10
sleep 1
kill -9 $OSD4_PID
@@ -52,8 +50,6 @@ if ! ($ETCDCTL get /vitastor/config/pgs --print-value-only |\
format_error "FAILED: OSD NOT REMOVED FROM DISTRIBUTION"
fi
if ! ($ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e '([ .[] | select(.state == ["active"] or .state == ["active", "left_on_dead"]) ] | length) == '$PG_COUNT''); then
format_error "FAILED: $PG_COUNT PGS NOT ACTIVE"
fi
wait_finish_rebalance 10
format_green OK

View File

@@ -43,7 +43,7 @@ kill_osds &
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 -fsync=256 -rw=randwrite \
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120 2>/dev/null
-mirror_file=./testdata/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \

View File

@@ -10,7 +10,7 @@ kill -INT $OSD2_PID
sleep 5
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active", "degraded"]) ] | length == '$PG_COUNT); then
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active", "degraded", "left_on_dead"]) ] | length == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PG(s) NOT ACTIVE+DEGRADED"
fi

View File

@@ -7,7 +7,7 @@ OSD_COUNT=5
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$!
done

View File

@@ -53,7 +53,7 @@ for i in $(seq 1 $OSD_COUNT); do
--data_device ./testdata/test_osd$i.bin \
--meta_offset 0 \
--journal_offset $((1024*1024)) \
--data_offset $((128*1024*1024)) &>./testdata/osd$i.log &
--data_offset $((128*1024*1024)) >>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$!
done

View File

@@ -30,7 +30,7 @@ qemu-img create -f qcow2 ./testdata/empty.qcow2 32M
qemu-img convert -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024)):skip-parents=1" \
-O qcow2 -o 'cluster_size=4k' -B empty.qcow2 ./testdata/layer1.qcow2
-O qcow2 -o 'cluster_size=4k,backing_fmt=qcow2' -B empty.qcow2 ./testdata/layer1.qcow2
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=3:size=$((32*1024*1024))" \
@@ -64,7 +64,7 @@ cmp ./testdata/merged.bin ./testdata/merged-by-tool.bin
# Test merge by qemu-img
qemu-img rebase -u -b layer0.qcow2 ./testdata/layer1.qcow2
qemu-img rebase -u -b layer0.qcow2 -F qcow2 ./testdata/layer1.qcow2
qemu-img convert -S 4096 -f qcow2 ./testdata/layer1.qcow2 -O raw ./testdata/rebased.bin

View File

@@ -21,7 +21,8 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \
# Kill OSD 2, start OSD 1
kill $OSD2_PID
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL \
$(build/src/vitastor-disk simple-offsets --format options --device ./testdata/test_osd2.bin 2>/dev/null) >>./testdata/osd2.log 2>&1 &
sleep 2
# Check PG state - it should NOT become active

View File

@@ -10,7 +10,7 @@ etcdctl --endpoints=http://127.0.0.1:12379/v3 del --prefix /vitastor/osd/state
OSD_COUNT=3
OSD_ARGS=
for i in $(seq 1 $OSD_COUNT); do
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$!
done

View File

@@ -12,6 +12,6 @@ GLOBAL_CONF='{"immediate_commit":"all"}'
# Test basic write
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=1G -cluster_log_level=10
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4M -direct=1 -iodepth=1 -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -cluster_log_level=10
format_green OK