Compare commits

...

20 Commits

Author SHA1 Message Date
5ef8bed75f Release 0.8.2
- Fix QEMU driver compatibility with QEMU 7.0 and < 2.9
- Add patches for pve-qemu-kvm 7.1 (PVE 7.3) and pve-qemu-kvm 6.2 (PVE 7.2)
- Fix Proxmox driver location in the pve-storage-vitastor package
- Disable HDD autodetection in non-hybrid mode
- Explicitly warn about buggy kernels on -EAGAIN in io_uring
- Final fix for the lack of zeroing out of old metadata entries
  (do not crash with "big_write journal_entry was allocated over another object"
  in some cases after an unclean OSD shutdown)
- Wait for data writes before fsyncing data if data fsync is enabled
- Never try to wait for free space inside the blockstore, as that could stall OSDs
- Fix a rare crash in osd_peering due to callback ordering
- Fix a rare duplication of ping & op message IDs
- Fix a rare use-after-free during pings
- Add --force to vitastor-disk read-sb
- Make vitastor-disk dump metadata object IDs in hex, add forgotten commas
- Fix vitastor-disk SCSI disk cache check
2022-12-17 17:54:13 +03:00
8669998e5e Fix discard_list_subop() for local ops 2022-12-17 17:54:13 +03:00
b457327e77 Oops. Fix metadata read after fixes :-) 2022-12-17 17:31:57 +03:00
f7fa9d5e34 Fix SCSI device cache type check 2022-12-17 17:31:57 +03:00
49b88b01f9 Fix clang build 2022-12-17 16:25:26 +03:00
71688bcb59 Disable HDD autodetection in non-hybrid mode 2022-12-17 16:12:15 +03:00
552e207d2b Explicitly print errors about -EAGAIN in io_uring 2022-12-17 15:49:49 +03:00
5464821fa5 Final fix for the lack of zeroing out of old metadata entries
If a crash occurs while a redirect-write is being flushed, the disk may end up
containing both the old and the new metadata entry. This is OK by itself, but
prior to 0.8.0 OSDs in this situation started without problems and then crashed
after some more overwrites with a "tried to overwrite non-zero metadata entry"
error. 0.8.0 introduced a change intended to fix this situation, but instead of
fixing it, the change prevented such OSDs from starting at all, now with a
"big_write journal_entry was allocated over another object" error... :-)

This change finally fixes the original issue.

Followup to 54ef2c389f
2022-12-17 14:50:31 +03:00
6917a32ca8 Add --force to vitastor-disk read-sb 2022-12-17 02:47:15 +03:00
f8722a8bd5 Dump meta in hex 2022-12-17 01:50:38 +03:00
9c2f69c9fa Add forgotten commas to vitastor-disk dump-journal 2022-12-17 01:22:58 +03:00
1a93e3f33a Wait for data writes before fsyncing data if data fsync is enabled 2022-12-16 20:46:55 +03:00
3f35744052 Fix compatibility with QEMU aio_set_fd_handler signatures in 7.0 and < 2.9 2022-12-15 19:17:17 +03:00
66f14ac019 Update notes about Proxmox 7.1-7.3 2022-12-15 18:57:28 +03:00
1364009931 Add patches for pve-qemu-kvm 7.1 (PVE 7.3) and pve-qemu-kvm 6.2 (PVE 7.2) 2022-12-14 19:01:36 +03:00
d7e30b8353 Fix pve-storage-vitastor filename 2022-12-14 16:41:35 +03:00
cb437913d3 Never try to wait for free space inside blockstore 2022-12-12 00:27:05 +03:00
472bce58ab Fix rare crash in osd_peering due to callback ordering 2022-12-12 00:27:05 +03:00
7a71e7ef01 Fix possible duplication of ping & op message IDs 2022-12-04 00:16:47 +03:00
c71e5e7bbd Fix possible use-after-free during pings 2022-12-04 00:16:47 +03:00
44 changed files with 566 additions and 128 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8)
project(vitastor)
set(VERSION "0.8.1")
set(VERSION "0.8.2")
add_subdirectory(src)

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.8.1
VERSION ?= v0.8.2
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.8.1
image: vitalif/vitastor-csi:v0.8.2
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -116,7 +116,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.8.1
image: vitalif/vitastor-csi:v0.8.2
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.8.1"
vitastorCSIDriverVersion = "0.8.2"
)
// Config struct fills the parameters of request or user input

debian/changelog
View File

@@ -1,10 +1,10 @@
vitastor (0.8.1-1) unstable; urgency=medium
vitastor (0.8.2-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.8.1-1) unstable; urgency=medium
vitastor (0.8.2-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

View File

@@ -1 +1 @@
patches/PVE_VitastorPlugin.pm usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm
patches/VitastorPlugin.pm usr/share/perl5/PVE/Storage/Custom/

View File

@@ -34,8 +34,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.8.1; \
cd vitastor-0.8.1; \
cp -r /root/vitastor vitastor-0.8.2; \
cd vitastor-0.8.2; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -48,8 +48,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.1.orig.tar.xz vitastor-0.8.1; \
cd vitastor-0.8.1; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.8.2.orig.tar.xz vitastor-0.8.2; \
cd vitastor-0.8.2; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -6,10 +6,10 @@
# Proxmox VE
To enable Vitastor support in Proxmox Virtual Environment (6.4 and 7.1 are supported):
To enable Vitastor support in Proxmox Virtual Environment (6.4-7.3 are supported):
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts
(buster for 6.4, bullseye for 7.1)
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
buster for 6.4, bullseye for 7.3, pve7.1 for 7.1, pve7.2 for 7.2
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
- Define storage in `/etc/pve/storage.cfg` (see below)
- Block network access from VMs to Vitastor network (to OSDs and etcd),
@@ -35,5 +35,5 @@ vitastor: vitastor
vitastor_nbd 0
```
\* Note: you can also manually copy [patches/PVE_VitastorPlugin.pm](patches/PVE_VitastorPlugin.pm) to Proxmox hosts
\* Note: you can also manually copy [patches/VitastorPlugin.pm](patches/VitastorPlugin.pm) to Proxmox hosts
as `/usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm` instead of installing pve-storage-vitastor.
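The steps above boil down to a short shell session. The sketch below is illustrative only: the repository URL and the sources.list.d file name are assumptions, while the codename mapping and package names are taken from the list above.

```
# Illustrative example for Proxmox VE 7.3 (bullseye); use pve7.1 / pve7.2
# for PVE 7.1 / 7.2 as listed above. The repository URL is an assumption.
echo 'deb https://vitastor.io/debian bullseye main' > /etc/apt/sources.list.d/vitastor.list
apt update
apt install vitastor-client pve-qemu-kvm pve-storage-vitastor
```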

View File

@@ -6,10 +6,10 @@
# Proxmox
To connect Vitastor to Proxmox Virtual Environment (versions 6.4 and 7.1 are supported):
To connect Vitastor to Proxmox Virtual Environment (versions 6.4-7.3 are supported):
- Add the corresponding Vitastor Debian repository into sources.list on the Proxmox hosts
(buster for 6.4, bullseye for 7.1)
- Add the corresponding Vitastor Debian repository into sources.list on the Proxmox hosts:
buster for 6.4, bullseye for 7.3, pve7.1 for 7.1, pve7.2 for 7.2
- Install the vitastor-client, pve-qemu-kvm and pve-storage-vitastor packages (* or see the note) from the Vitastor repository
- Define the storage type in `/etc/pve/storage.cfg` (see below)
- Be sure to block access from virtual machines to the Vitastor network (OSDs and etcd), because Vitastor does not (yet) support authentication
@@ -35,5 +35,5 @@ vitastor: vitastor
```
\* Note: instead of installing the pve-storage-vitastor package you can manually copy the file
[patches/PVE_VitastorPlugin.pm](patches/PVE_VitastorPlugin.pm) to the Proxmox hosts as
[patches/VitastorPlugin.pm](patches/VitastorPlugin.pm) to the Proxmox hosts as
`/usr/share/perl5/PVE/Storage/Custom/VitastorPlugin.pm`.

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.8.1'
VERSION = '0.8.2'
LOG = logging.getLogger(__name__)

View File

@@ -0,0 +1,169 @@
Index: qemu/block/meson.build
===================================================================
--- qemu.orig/block/meson.build
+++ qemu/block/meson.build
@@ -91,6 +91,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: qemu/meson.build
===================================================================
--- qemu.orig/meson.build
+++ qemu/meson.build
@@ -838,6 +838,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1459,6 +1479,7 @@ config_host_data.set('CONFIG_LINUX_AIO',
config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
config_host_data.set('CONFIG_SECCOMP', seccomp.found())
@@ -3424,6 +3445,7 @@ if spice_protocol.found()
summary_info += {' spice server support': spice}
endif
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
Index: qemu/meson_options.txt
===================================================================
--- qemu.orig/meson_options.txt
+++ qemu/meson_options.txt
@@ -121,6 +121,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('gtk', type : 'feature', value : 'auto',
description: 'GTK+ user interface')
option('sdl', type : 'feature', value : 'auto',
Index: qemu/qapi/block-core.json
===================================================================
--- qemu.orig/qapi/block-core.json
+++ qemu/qapi/block-core.json
@@ -3179,7 +3179,7 @@
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
##
# @BlockdevOptionsFile:
@@ -4125,6 +4125,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4520,6 +4542,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT'
@@ -4910,6 +4933,17 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5108,6 +5142,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: qemu/scripts/ci/org.centos/stream/8/x86_64/configure
===================================================================
--- qemu.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ qemu/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -31,7 +31,7 @@
--with-git=meson \
--with-git-submodules=update \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -183,6 +183,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \

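Once this patch is applied to a QEMU source tree, the driver is toggled like any other optional block driver. The sketch below is only an assumption of a typical build invocation; `--enable-vitastor` itself is the switch the patch adds to the CentOS CI configure call, the other flags are standard QEMU build options.

```
# Build QEMU with the Vitastor block driver after applying the patch above
# (target list and parallelism are illustrative):
./configure --target-list=x86_64-softmmu --enable-vitastor
make -j"$(nproc)"
```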
View File

@@ -0,0 +1,169 @@
Index: qemu/block/meson.build
===================================================================
--- qemu.orig/block/meson.build
+++ qemu/block/meson.build
@@ -111,6 +111,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: qemu/meson.build
===================================================================
--- qemu.orig/meson.build
+++ qemu/meson.build
@@ -967,6 +967,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'), kwargs: static_kwargs)
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -1802,6 +1822,7 @@ config_host_data.set('CONFIG_NUMA', numa
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PROFILER', get_option('profiler'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_SDL', sdl.found())
config_host_data.set('CONFIG_SDL_IMAGE', sdl_image.found())
@@ -3965,6 +3986,7 @@ if spice_protocol.found()
summary_info += {' spice server support': spice}
endif
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
Index: qemu/meson_options.txt
===================================================================
--- qemu.orig/meson_options.txt
+++ qemu/meson_options.txt
@@ -167,6 +167,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
Index: qemu/qapi/block-core.json
===================================================================
--- qemu.orig/qapi/block-core.json
+++ qemu/qapi/block-core.json
@@ -3209,7 +3209,7 @@
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
##
# @BlockdevOptionsFile:
@@ -4149,6 +4149,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4593,6 +4615,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT'
@@ -4985,6 +5008,17 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5182,6 +5216,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: qemu/scripts/ci/org.centos/stream/8/x86_64/configure
===================================================================
--- qemu.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ qemu/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -31,7 +31,7 @@
--with-git=meson \
--with-git-submodules=update \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -179,6 +179,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \

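With a patched QEMU, the driver is addressed through the `BlockdevOptionsVitastor` options declared above. A hypothetical invocation follows; the node name, image name and etcd address are placeholders, only the driver name and option keys come from the patch.

```
# Attach a Vitastor image via -blockdev using the options from
# BlockdevOptionsVitastor ("testimg" and the etcd address are placeholders):
qemu-system-x86_64 -m 1024 \
    -blockdev '{"node-name":"disk0","driver":"vitastor","image":"testimg","etcd-host":"192.168.7.2:2379"}' \
    -device virtio-blk-pci,drive=disk0
```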
View File

@@ -9,7 +9,7 @@ for i in "$DIR"/qemu-*-vitastor.patch "$DIR"/pve-qemu-*-vitastor.patch; do
echo '===================================================================' >> $i
echo '--- /dev/null' >> $i
echo '+++ a/block/vitastor.c' >> $i
echo '@@ -0,0 +1,'$(wc -l "$DIR"/../src/qemu_driver.c)' @@' >> $i
echo '@@ -0,0 +1,'$(wc -l "$DIR"/../src/qemu_driver.c | cut -d ' ' -f 1)' @@' >> $i
cat "$DIR"/../src/qemu_driver.c | sed 's/^/+/' >> $i
fi
done

View File

@@ -25,4 +25,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.8.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.1$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-0.8.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.8.2$(rpm --eval '%dist').tar.gz *

View File

@@ -58,7 +58,7 @@
+BuildRequires: gperftools-devel
+BuildRequires: libusbx-devel >= 1.0.21
%if %{have_usbredir}
BuildRequires: usbredir-devel >= 0.8.1
BuildRequires: usbredir-devel >= 0.8.2
%endif
@@ -856,12 +861,13 @@ BuildRequires: virglrenderer-devel
# For smartcard NSS support

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.1.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.2.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.1
Version: 0.8.2
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.1.el7.tar.gz
Source0: vitastor-0.8.2.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.8.1.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-0.8.2.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.8.1
Version: 0.8.2
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.8.1.el8.tar.gz
Source0: vitastor-0.8.2.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -15,7 +15,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.8.1")
add_definitions(-DVERSION="0.8.2")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)

View File

@@ -35,24 +35,14 @@ journal_flusher_co::journal_flusher_co()
{
bs->live = true;
if (data->res != data->iov.iov_len)
{
throw std::runtime_error(
"data read operation failed during flush ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). can't continue, sorry :-("
);
}
bs->disk_error_abort("read operation during flush", data->res, data->iov.iov_len);
wait_count--;
};
simple_callback_w = [this](ring_data_t* data)
{
bs->live = true;
if (data->res != data->iov.iov_len)
{
throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). state "+std::to_string(wait_state)+". in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
);
}
bs->disk_error_abort("write operation during flush", data->res, data->iov.iov_len);
wait_count--;
};
}
@@ -306,6 +296,8 @@ bool journal_flusher_co::loop()
goto resume_20;
else if (wait_state == 21)
goto resume_21;
else if (wait_state == 22)
goto resume_22;
resume_0:
if (flusher->flush_queue.size() < flusher->min_flusher_count && !flusher->trim_wanted ||
!flusher->flush_queue.size() || !flusher->dequeuing)
@@ -511,6 +503,13 @@ resume_1:
);
wait_count++;
}
// Wait for data writes before fsyncing it
resume_22:
if (wait_count > 0)
{
wait_state = 22;
return false;
}
// Sync data before writing metadata
resume_16:
resume_17:
@@ -521,7 +520,7 @@ resume_1:
return false;
}
resume_5:
// And metadata writes, but only after data writes complete
// Submit metadata writes, but only when data is written and fsynced
if (!bs->inmemory_meta && meta_new.it->second.state == 0 || wait_count > 0)
{
// metadata sector is still being read or data is still being written, wait for it
@@ -616,7 +615,8 @@ resume_1:
for (it = v.begin(); it != v.end(); it++)
{
// Free it if it's not taken from the journal
if (it->buf && (!bs->journal.inmemory || it->buf < bs->journal.buffer || it->buf >= bs->journal.buffer + bs->journal.len))
if (it->buf && (!bs->journal.inmemory || it->buf < bs->journal.buffer ||
it->buf >= (uint8_t*)bs->journal.buffer + bs->journal.len))
{
free(it->buf);
}

View File

@@ -306,17 +306,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for a journal buffer\n");
#endif
return;
}
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_FREE)
{
if (!data_alloc->get_free_count() && flusher->is_active())
{
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for free space on the data device\n");
#endif
return;
}
@@ -687,3 +676,16 @@ void blockstore_impl_t::dump_diagnostics()
journal.dump_diagnostics();
flusher->dump_diagnostics();
}
void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expected)
{
if (retval == -EAGAIN)
{
fprintf(stderr, "EAGAIN error received from a disk %s during flush."
" It must never happen with io_uring and indicates a kernel bug."
" Please upgrade your kernel. Aborting.\n", op);
exit(1);
}
fprintf(stderr, "Disk %s failed: result is %d, expected %d. Can't continue, sorry :-(\n", op, retval, expected);
exit(1);
}

View File

@@ -160,8 +160,6 @@ struct __attribute__((__packed__)) dirty_entry
#define WAIT_JOURNAL 3
// Suspend operation until the next journal sector buffer is free
#define WAIT_JOURNAL_BUFFER 4
// Suspend operation until there is some free space on the data device
#define WAIT_FREE 5
struct fulfill_read_t
{
@@ -294,6 +292,7 @@ class blockstore_impl_t
// Journaling
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
void disk_error_abort(const char *op, int retval, int expected);
// Asynchronous init
int initialized;

View File

@@ -48,14 +48,12 @@ void blockstore_init_meta::handle_event(ring_data_t *data, int buf_num)
int blockstore_init_meta::loop()
{
if (wait_state == 1)
goto resume_1;
else if (wait_state == 2)
goto resume_2;
else if (wait_state == 3)
goto resume_3;
else if (wait_state == 4)
goto resume_4;
if (wait_state == 1) goto resume_1;
else if (wait_state == 2) goto resume_2;
else if (wait_state == 3) goto resume_3;
else if (wait_state == 4) goto resume_4;
else if (wait_state == 5) goto resume_5;
else if (wait_state == 6) goto resume_6;
printf("Reading blockstore metadata\n");
if (bs->inmemory_meta)
metadata_buffer = bs->metadata_buffer;
@@ -140,6 +138,7 @@ resume_1:
// Skip superblock
md_offset = bs->dsk.meta_block_size;
next_offset = md_offset;
entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
// Read the rest of the metadata
resume_2:
if (next_offset < bs->dsk.meta_len && submitted == 0)
@@ -179,17 +178,15 @@ resume_2:
if (bufs[i].state == INIT_META_READ_DONE)
{
// Handle result
unsigned entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
bool changed = false;
for (uint64_t sector = 0; sector < bufs[i].size; sector += bs->dsk.meta_block_size)
{
// handle <count> entries
changed = changed || handle_entries(
bufs[i].buf + sector, entries_per_block,
((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block
);
if (handle_meta_block(bufs[i].buf + sector, entries_per_block,
((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block))
changed = true;
}
if (changed && !bs->inmemory_meta)
if (changed && !bs->inmemory_meta && !bs->readonly)
{
// write the modified buffer back
GET_SQE();
@@ -211,6 +208,43 @@ resume_2:
wait_state = 2;
return 1;
}
if (entries_to_zero.size() && !bs->inmemory_meta && !bs->readonly)
{
// we have to zero out additional entries
for (i = 0; i < entries_to_zero.size(); )
{
next_offset = entries_to_zero[i]/entries_per_block;
for (j = i; j < entries_to_zero.size() && entries_to_zero[j]/entries_per_block == next_offset; j++) {}
GET_SQE();
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
submitted++;
resume_5:
if (submitted > 0)
{
wait_state = 5;
return 1;
}
for (; i < j; i++)
{
uint64_t pos = (entries_to_zero[i] % entries_per_block);
memset((uint8_t*)metadata_buffer + pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
}
GET_SQE();
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
submitted++;
resume_6:
if (submitted > 0)
{
wait_state = 6;
return 1;
}
}
entries_to_zero.clear();
}
// metadata read finished
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
if (!bs->inmemory_meta)
@@ -236,10 +270,13 @@ resume_2:
return 0;
}
bool blockstore_init_meta::handle_entries(uint8_t *buf, uint64_t count, uint64_t done_cnt)
bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_block, uint64_t done_cnt)
{
bool updated = false;
for (uint64_t i = 0; i < count; i++)
uint64_t max_i = entries_per_block;
if (max_i > bs->dsk.block_count-done_cnt)
max_i = bs->dsk.block_count-done_cnt;
for (uint64_t i = 0; i < max_i; i++)
{
clean_disk_entry *entry = (clean_disk_entry*)(buf + i*bs->dsk.clean_entry_size);
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
@@ -255,17 +292,35 @@ bool blockstore_init_meta::handle_entries(uint8_t *buf, uint64_t count, uint64_t
if (clean_it != clean_db.end())
{
// free the previous block
// here we have to zero out the entry because otherwise we'll hit
// here we have to zero out the previous entry because otherwise we'll hit
// "tried to overwrite non-zero metadata entry" later
updated = true;
memset(entry, 0, bs->dsk.clean_entry_size);
uint64_t old_clean_loc = clean_it->second.location >> bs->dsk.block_order;
if (bs->inmemory_meta)
{
uint64_t sector = (old_clean_loc / entries_per_block) * bs->dsk.meta_block_size;
uint64_t pos = (old_clean_loc % entries_per_block);
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)bs->metadata_buffer + sector + pos*bs->dsk.clean_entry_size);
memset(old_entry, 0, bs->dsk.clean_entry_size);
}
else if (old_clean_loc >= done_cnt)
{
updated = true;
uint64_t sector = ((old_clean_loc - done_cnt) / entries_per_block) * bs->dsk.meta_block_size;
uint64_t pos = (old_clean_loc % entries_per_block);
clean_disk_entry *old_entry = (clean_disk_entry*)(buf + sector + pos*bs->dsk.clean_entry_size);
memset(old_entry, 0, bs->dsk.clean_entry_size);
}
else
{
entries_to_zero.push_back(clean_it->second.location >> bs->dsk.block_order);
}
#ifdef BLOCKSTORE_DEBUG
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
clean_it->second.location >> bs->dsk.block_order,
old_clean_loc,
clean_it->first.inode, clean_it->first.stripe, clean_it->second.version,
done_cnt+i);
#endif
bs->data_alloc->set(clean_it->second.location >> bs->dsk.block_order, false);
bs->data_alloc->set(old_clean_loc, false);
}
else
{

View File

@@ -24,7 +24,10 @@ class blockstore_init_meta
uint64_t md_offset = 0;
uint64_t next_offset = 0;
uint64_t entries_loaded = 0;
bool handle_entries(uint8_t *buf, uint64_t count, uint64_t done_cnt);
unsigned entries_per_block = 0;
int i = 0, j = 0;
std::vector<uint64_t> entries_to_zero;
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
void handle_event(ring_data_t *data, int buf_num);
public:
blockstore_init_meta(blockstore_impl_t *bs);

View File

@@ -198,10 +198,7 @@ void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_i
if (data->res != data->iov.iov_len)
{
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
throw std::runtime_error(
"journal write failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
);
disk_error_abort("journal write", data->res, data->iov.iov_len);
}
auto fl_it = journal.flushing_ops.upper_bound((pending_journaling_t){ .flush_id = flush_id });
if (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)

View File

@@ -261,12 +261,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
if (loc == UINT64_MAX)
{
// no space
if (flusher->is_active())
{
// hope that some space will be available after flush
PRIV(op)->wait_for = WAIT_FREE;
return 0;
}
cancel_all_writes(op, dirty_it, -ENOSPC);
return 2;
}
@@ -592,10 +586,7 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
if (data->res != data->iov.iov_len)
{
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
);
disk_error_abort("data write", data->res, data->iov.iov_len);
}
PRIV(op)->pending_ops--;
assert(PRIV(op)->pending_ops >= 0);

View File

@@ -941,7 +941,7 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
.req = { .rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.id = next_op_id(),
.opcode = op->opcode == OSD_OP_READ_BITMAP ? OSD_OP_READ : op->opcode,
},
.inode = op->cur_inode,
@@ -1069,7 +1069,7 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
.req = {
.hdr = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.id = next_op_id(),
.opcode = OSD_OP_SYNC,
},
},
@@ -1181,5 +1181,5 @@ void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *par
uint64_t cluster_client_t::next_op_id()
{
return op_id++;
return msgr.next_subop_id++;
}

View File

@@ -85,7 +85,6 @@ class cluster_client_t
int up_wait_retry_interval = 500; // ms
int retry_timeout_id = 0;
uint64_t op_id = 1;
std::vector<cluster_op_t*> offline_ops;
cluster_op_t *op_queue_head = NULL, *op_queue_tail = NULL;
std::map<object_id, cluster_buffer_t> dirty_buffers;

View File

@@ -196,7 +196,7 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list)
.sec_list = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = op_id++,
.id = next_op_id(),
.opcode = OSD_OP_SEC_LIST,
},
.list_pg = cur_list->pg->pg_num,

View File

@@ -52,11 +52,12 @@ static const char *help_text =
" --disable_data_fsync 0 Disable data device cache and fsync (default off)\n"
" --disable_meta_fsync 0 Disable metadata device cache and fsync (default off)\n"
" --disable_journal_fsync 0 Disable journal device cache and fsync (default off)\n"
" --hdd Enable HDD defaults (1M block, 1G journal, throttling)\n"
" --force Bypass partition safety checks (for emptiness and so on)\n"
" \n"
" Options (both modes):\n"
" --journal_size 1G/32M Set journal size (area or partition size)\n"
" --block_size 1M/128k Set blockstore object size\n"
" --journal_size 32M/1G Set journal size (area or partition size)\n"
" --block_size 128k/1M Set blockstore object size\n"
" --bitmap_granularity 4k Set bitmap granularity\n"
" --data_device_block 4k Override data device block size\n"
" --meta_device_block 4k Override metadata device block size\n"
@@ -109,8 +110,9 @@ static const char *help_text =
" Commands are passed to systemctl with vitastor-osd@<num> units as arguments.\n"
" When --now is added to enable/disable, OSDs are also immediately started/stopped.\n"
"\n"
"vitastor-disk read-sb <device>\n"
"vitastor-disk read-sb [--force] <device>\n"
" Try to read Vitastor OSD superblock from <device> and print it in JSON format.\n"
" --force allows to bypass \"does not refer to the device itself\" errors.\n"
"\n"
"vitastor-disk write-sb <device>\n"
" Read JSON from STDIN and write it into Vitastor OSD superblock on <device>.\n"
@@ -195,6 +197,10 @@ int main(int argc, char *argv[])
{
self.options["hybrid"] = "1";
}
else if (!strcmp(argv[i], "--hdd"))
{
self.options["hdd"] = "1";
}
else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h"))
{
cmd.insert(cmd.begin(), (char*)"help");

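Illustrative invocations of the two options added above (device paths are placeholders):

```
# Dump the superblock even if it does not refer to the device itself,
# e.g. after cloning a partition to another disk:
vitastor-disk read-sb --force /dev/sdb1

# Prepare an OSD with HDD defaults in non-hybrid mode, where autodetection
# is now disabled:
vitastor-disk prepare --hdd --data_device /dev/sdc
```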
View File

@@ -111,7 +111,7 @@ struct disk_tool_t
int systemd_start_stop_osds(std::vector<std::string> cmd, std::vector<std::string> devices);
int pre_exec_osd(std::string device);
json11::Json read_osd_superblock(std::string device, bool expect_exist = true);
json11::Json read_osd_superblock(std::string device, bool expect_exist = true, bool ignore_nonref = false);
uint32_t write_osd_superblock(std::string device, json11::Json params);
int prepare_one(std::map<std::string, std::string> options, int is_hdd = -1);

View File

@@ -83,10 +83,15 @@ int disk_tool_t::dump_journal()
auto pos = journal_pos;
int r = process_journal_block(data, [this, pos](int num, journal_entry *je)
{
if (json && first2)
if (json)
{
if (dump_with_blocks)
printf("%s{\"offset\":\"0x%lx\",\"entries\":[\n", first ? "" : ",\n", pos);
{
if (first2)
printf("%s{\"offset\":\"0x%lx\",\"entries\":[\n", first ? "" : ",\n", pos);
}
else if (!first)
printf("%s", ",\n");
first = false;
}
dump_journal_entry(num, je, json);

View File

@@ -130,7 +130,7 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v1_t *hdr)
void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
printf(
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu"
#define ENTRY_FMT "{\"block\":%lu,\"pool\":%u,\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"version\":%lu"
(first ? ENTRY_FMT : (",\n" ENTRY_FMT)),
#undef ENTRY_FMT
block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),

View File

@@ -539,7 +539,7 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
fprintf(stderr, "Device list (positional arguments) and --hybrid are incompatible with --data_device\n");
return 1;
}
return prepare_one(options);
return prepare_one(options, options.find("hdd") != options.end() ? 1 : 0);
}
if (!devices.size())
{
@@ -549,12 +549,12 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
options.erase("data_device");
options.erase("meta_device");
options.erase("journal_device");
bool hybrid = options.find("hybrid") != options.end();
auto devinfo = collect_devices(devices);
if (!devinfo.size())
{
return 1;
}
bool hybrid = options.find("hybrid") != options.end();
uint64_t osd_per_disk = stoull_full(options["osd_per_disk"]);
if (!osd_per_disk)
osd_per_disk = 1;
@@ -612,7 +612,8 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
return 1;
}
}
prepare_one(options, dev.is_hdd ? 1 : 0);
// Treat all disks as SSDs if not in the hybrid mode
prepare_one(options, hybrid && dev.is_hdd ? 1 : 0);
}
}
}

View File

@@ -54,7 +54,7 @@ int disk_tool_t::udev_import(std::string device)
int disk_tool_t::read_sb(std::string device)
{
json11::Json sb = read_osd_superblock(device);
json11::Json sb = read_osd_superblock(device, true, options.find("force") != options.end());
if (sb.is_null())
{
return 1;
@@ -123,7 +123,7 @@ uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json para
return sb_size;
}
json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_exist)
json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_exist, bool ignore_nonref)
{
vitastor_disk_superblock_t *sb = NULL;
uint8_t *buf = NULL;
@@ -226,7 +226,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
{
device_type = "journal";
}
else
else if (!ignore_nonref)
{
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: does not refer to the device itself\n", device.c_str());

View File

@@ -145,10 +145,10 @@ int disable_cache(std::string dev)
closedir(dir);
// Check cache_type
scsi_disk += "/cache_type";
std::string cache_type = read_file(scsi_disk);
std::string cache_type = trim(read_file(scsi_disk));
if (cache_type == "")
return 1;
if (cache_type == "write back")
if (cache_type != "write through")
{
int fd = open(scsi_disk.c_str(), O_WRONLY);
if (fd < 0 || write_blocking(fd, (void*)"write through", strlen("write through")) != strlen("write through"))

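For context, the value read from sysfs typically ends with a trailing newline, which is likely why the check now trims it and rewrites anything that is not already "write through". A quick manual check looks roughly like this (the device path and the exact sysfs layout are assumptions):

```
# Show the current SCSI write cache mode; typical values are
# "write back" or "write through":
cat /sys/block/sdb/device/scsi_disk/*/cache_type
```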
View File

@@ -80,12 +80,20 @@ void osd_messenger_t::init()
};
op->callback = [this, cl](osd_op_t *op)
{
auto cl_it = clients.find(op->peer_fd);
if (cl_it == clients.end() || cl_it->second != cl)
{
// client is already dropped
delete op;
return;
}
int fail_fd = (op->reply.hdr.retval != 0 ? op->peer_fd : -1);
auto fail_osd_num = cl->osd_num;
cl->ping_time_remaining = 0;
delete op;
if (fail_fd >= 0)
{
fprintf(stderr, "Ping failed for OSD %lu (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
fprintf(stderr, "Ping failed for OSD %lu (client %d), disconnecting peer\n", fail_osd_num, fail_fd);
stop_client(fail_fd, true);
}
};

View File

@@ -9,6 +9,8 @@
#include "str_util.h"
#include "osd.h"
#define SELF_FD -1
// Peering loop
void osd_t::handle_peers()
{
@@ -317,7 +319,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
// Self
osd_op_t *op = new osd_op_t();
op->op_type = 0;
op->peer_fd = -1;
op->peer_fd = SELF_FD;
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_SYNC;
@@ -336,8 +338,8 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
ps->list_ops.erase(role_osd);
submit_list_subop(role_osd, ps);
};
bs->enqueue_op(op->bs_op);
ps->list_ops[role_osd] = op;
bs->enqueue_op(op->bs_op);
}
else
{
@@ -371,8 +373,8 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
ps->list_ops.erase(role_osd);
submit_list_subop(role_osd, ps);
};
msgr.outbox_push(op);
ps->list_ops[role_osd] = op;
msgr.outbox_push(op);
}
}
@@ -383,7 +385,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
// Self
osd_op_t *op = new osd_op_t();
op->op_type = 0;
op->peer_fd = -1;
op->peer_fd = SELF_FD;
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_LIST;
@@ -415,8 +417,8 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
op->bs_op = NULL;
delete op;
};
bs->enqueue_op(op->bs_op);
ps->list_ops[role_osd] = op;
bs->enqueue_op(op->bs_op);
}
else
{
@@ -463,14 +465,14 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
ps->list_ops.erase(role_osd);
delete op;
};
msgr.outbox_push(op);
ps->list_ops[role_osd] = op;
msgr.outbox_push(op);
}
}
void osd_t::discard_list_subop(osd_op_t *list_op)
{
if (list_op->peer_fd == 0)
if (list_op->peer_fd == SELF_FD)
{
// Self
list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)

View File

@@ -206,6 +206,25 @@ static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
}
}
static void vitastor_aio_set_fd_handler(void *ctx, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque)
{
aio_set_fd_handler(ctx, fd,
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5 || QEMU_VERSION_MAJOR >= 3
0 /*is_external*/,
#endif
fd_read, fd_write,
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6 || QEMU_VERSION_MAJOR < 1
NULL /*io_flush*/,
#endif
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
NULL /*io_poll*/,
#endif
#if QEMU_VERSION_MAJOR >= 7
NULL /*io_poll_ready*/,
#endif
opaque);
}
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
{
VitastorClient *client = bs->opaque;
@@ -221,7 +240,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
client->rdma_gid_index = qdict_get_try_int(options, "rdma-gid-index", 0);
client->rdma_mtu = qdict_get_try_int(options, "rdma-mtu", 0);
client->proxy = vitastor_c_create_qemu(
(QEMUSetFDHandler*)aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
vitastor_aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
);
client->image = g_strdup(qdict_get_try_str(options, "image"));
@@ -238,9 +257,9 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
}
else
{
qemu_coroutine_enter(qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
BDRV_POLL_WHILE(bs, !task.complete);
}
BDRV_POLL_WHILE(bs, !task.complete);
client->watch = (void*)task.ret;
client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
client->size = vitastor_c_inode_get_size(client->watch);
@@ -428,7 +447,13 @@ static void vitastor_co_read_cb(void *opaque, long retval, uint64_t version)
vitastor_co_generic_bh_cb(opaque, retval);
}
static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags)
static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs,
#if QEMU_VERSION_MAJOR >= 7 || QEMU_VERSION_MAJOR == 6 && QEMU_VERSION_MINOR >= 2
int64_t offset, int64_t bytes, QEMUIOVector *iov, BdrvRequestFlags flags
#else
uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags
#endif
)
{
VitastorClient *client = bs->opaque;
VitastorRPC task;
@@ -448,7 +473,13 @@ static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs, uint64_t offset
return task.ret;
}
static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags)
static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
#if QEMU_VERSION_MAJOR >= 7 || QEMU_VERSION_MAJOR == 6 && QEMU_VERSION_MINOR >= 2
int64_t offset, int64_t bytes, QEMUIOVector *iov, BdrvRequestFlags flags
#else
uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags
#endif
)
{
VitastorClient *client = bs->opaque;
VitastorRPC task;

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 0.8.1
Version: 0.8.2
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -24,6 +24,7 @@ typedef void VitastorIOHandler(void *opaque, long retval);
// QEMU
typedef void IOHandler(void *opaque);
// is_external and poll_fn are not required, but are here for compatibility
typedef void QEMUSetFDHandler(void *ctx, int fd, int is_external, IOHandler *fd_read, IOHandler *fd_write, void *poll_fn, void *opaque);
vitastor_c *vitastor_c_create_qemu(QEMUSetFDHandler *aio_set_fd_handler, void *aio_context,