Compare commits

...

4 Commits

Author SHA1 Message Date
Vitaliy Filippov 1c78df4fba Add bindiff for tests
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write (push) Successful in 32s Details
Test / test_write_no_same (push) Successful in 9s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m45s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m18s Details
Test / test_etcd_fail (push) Failing after 10m5s Details
Test / test_heal_csum_32k_dmj (push) Failing after 2m29s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m19s Details
Test / test_heal_csum_32k (push) Successful in 2m16s Details
Test / test_resize (push) Successful in 12s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 13s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 13s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_heal_csum_4k (push) Successful in 2m16s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 17s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 13s Details
2024-11-16 19:35:43 +03:00
Vitaliy Filippov 7de38250ad Auto-select RDMA device based on osd_network
Test / test_rebalance_verify_ec (push) Successful in 1m49s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m50s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 31s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m12s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m19s Details
Test / test_heal_csum_32k (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_resize (push) Successful in 15s Details
Test / test_osd_tags (push) Successful in 9s Details
Test / test_snapshot_pool2 (push) Successful in 17s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m16s Details
2024-11-16 18:38:57 +03:00
Vitaliy Filippov 9c59d30e83 Report slow ops in OSD stats in etcd and show them in vitastor-cli status
Test / test_rebalance_verify_ec (push) Successful in 1m42s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m43s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_write (push) Successful in 30s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m21s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m18s Details
Test / test_heal_csum_32k (push) Successful in 2m19s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m20s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_resize (push) Successful in 14s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 12s Details
Test / test_scrub (push) Successful in 16s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m9s Details
2024-11-16 15:11:16 +03:00
Vitaliy Filippov 5db02cdf6e Add pve-qemu 9.0 patch 2024-11-16 11:20:47 +03:00
18 changed files with 704 additions and 68 deletions

View File

@ -0,0 +1,193 @@
Index: pve-qemu-kvm-9.0.0/block/meson.build
===================================================================
--- pve-qemu-kvm-9.0.0.orig/block/meson.build
+++ pve-qemu-kvm-9.0.0/block/meson.build
@@ -126,6 +126,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: pve-qemu-kvm-9.0.0/meson.build
===================================================================
--- pve-qemu-kvm-9.0.0.orig/meson.build
+++ pve-qemu-kvm-9.0.0/meson.build
@@ -1452,6 +1452,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2254,6 +2274,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4454,6 +4475,7 @@ summary_info += {'fdt support': fd
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
Index: pve-qemu-kvm-9.0.0/meson_options.txt
===================================================================
--- pve-qemu-kvm-9.0.0.orig/meson_options.txt
+++ pve-qemu-kvm-9.0.0/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
Index: pve-qemu-kvm-9.0.0/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-9.0.0.orig/qapi/block-core.json
+++ pve-qemu-kvm-9.0.0/qapi/block-core.json
@@ -3481,7 +3481,7 @@
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4591,6 +4591,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -5053,6 +5075,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5498,6 +5521,20 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5719,6 +5753,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: pve-qemu-kvm-9.0.0/scripts/ci/org.centos/stream/8/x86_64/configure
===================================================================
--- pve-qemu-kvm-9.0.0.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ pve-qemu-kvm-9.0.0/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -30,7 +30,7 @@
--with-suffix="qemu-kvm" \
--firmwarepath=/usr/share/qemu-firmware \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -176,6 +176,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
Index: pve-qemu-kvm-9.0.0/scripts/meson-buildoptions.sh
===================================================================
--- pve-qemu-kvm-9.0.0.orig/scripts/meson-buildoptions.sh
+++ pve-qemu-kvm-9.0.0/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -445,6 +446,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@ -176,7 +176,7 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
exit(1);
}
if (!local_ips.size())
local_ips = getifaddr_list();
local_ips = getifaddr_list(std::vector<std::string>(), true);
std::string check_addr;
int pos = addr.find('/');
int pos2 = addr.find(':');

View File

@ -121,7 +121,7 @@ void osd_messenger_t::init()
if (use_rdma)
{
rdma_context = msgr_rdma_context_t::create(
rdma_device != "" ? rdma_device.c_str() : NULL,
osd_networks, rdma_device != "" ? rdma_device.c_str() : NULL,
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
);
if (!rdma_context)
@ -266,7 +266,8 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
if (!this->rdma_port_num)
this->rdma_port_num = 1;
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
if (!config["rdma_gid_index"].is_null())
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
this->rdma_max_sge = config["rdma_max_sge"].uint64_value();
if (!this->rdma_max_sge)
@ -281,6 +282,15 @@ void osd_messenger_t::parse_config(const json11::Json & config)
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024;
this->rdma_odp = config["rdma_odp"].bool_value();
std::vector<std::string> mask;
if (config["bind_address"].is_string())
mask.push_back(config["bind_address"].string_value());
else if (config["osd_network"].is_string())
mask.push_back(config["osd_network"].string_value());
else
for (auto v: config["osd_network"].array_items())
mask.push_back(v.string_value());
this->osd_networks = mask;
#endif
if (!osd_num)
this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value();

View File

@ -165,8 +165,9 @@ protected:
#ifdef WITH_RDMA
bool use_rdma = true;
std::vector<std::string> osd_networks;
std::string rdma_device;
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
uint64_t rdma_port_num = 1, rdma_gid_index = -1, rdma_mtu = 0;
msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0;

View File

@ -3,6 +3,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "addr_util.h"
#include "msgr_rdma.h"
#include "messenger.h"
@ -69,7 +70,126 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
send_out_size = 0;
}
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
static bool is_ipv4_gid(ibv_gid_entry *gidx)
{
return (((uint64_t*)gidx->gid.raw)[0] == 0 &&
((uint32_t*)gidx->gid.raw)[2] == 0xffff0000);
}
static bool match_gid(ibv_gid_entry *gidx, addr_mask_t *networks, int nnet)
{
if (gidx->gid_type != IBV_GID_TYPE_ROCE_V1 &&
gidx->gid_type != IBV_GID_TYPE_ROCE_V2 ||
((uint64_t*)gidx->gid.raw)[0] == 0 &&
((uint64_t*)gidx->gid.raw)[1] == 0)
{
return false;
}
if (is_ipv4_gid(gidx))
{
for (int i = 0; i < nnet; i++)
{
if (networks[i].family == AF_INET && cidr_match(*(in_addr*)(gidx->gid.raw+12), networks[i].ipv4, networks[i].bits))
return true;
}
}
else
{
for (int i = 0; i < nnet; i++)
{
if (networks[i].family == AF_INET6 && cidr6_match(*(in6_addr*)gidx->gid.raw, networks[i].ipv6, networks[i].bits))
return true;
}
}
return false;
}
struct matched_dev
{
int dev = -1;
int port = -1;
int gid = -1;
bool rocev2 = false;
};
static void log_rdma_dev_port_gid(ibv_device *dev, int ib_port, int gid_index, ibv_gid_entry & gidx)
{
bool is4 = ((uint64_t*)gidx.gid.raw)[0] == 0 && ((uint32_t*)gidx.gid.raw)[2] == 0xffff0000;
char buf[256];
inet_ntop(is4 ? AF_INET : AF_INET6, is4 ? gidx.gid.raw+12 : gidx.gid.raw, buf, sizeof(buf));
fprintf(
stderr, "Auto-selected RDMA device %s port %d GID %d - ROCEv%d IPv%d %s\n",
ibv_get_device_name(dev), ib_port, gid_index,
gidx.gid_type == IBV_GID_TYPE_ROCE_V2 ? 2 : 1, is4 ? 4 : 6, buf
);
}
static matched_dev match_device(ibv_device **dev_list, addr_mask_t *networks, int nnet, int log_level)
{
matched_dev best;
ibv_device_attr attr;
ibv_port_attr portinfo;
ibv_gid_entry best_gidx;
int res;
for (int i = 0; dev_list[i]; ++i)
{
auto dev = dev_list[i];
ibv_context *context = ibv_open_device(dev_list[i]);
if ((res = ibv_query_device(context, &attr)) != 0)
{
fprintf(stderr, "Couldn't query RDMA device %s for its features: %s\n", ibv_get_device_name(dev_list[i]), strerror(res));
goto cleanup;
}
for (int j = 1; j <= attr.phys_port_cnt; j++)
{
// Try to find a port with matching address
if ((res = ibv_query_port(context, j, &portinfo)) != 0)
{
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(dev), j, strerror(res));
goto cleanup;
}
for (int k = 0; k < portinfo.gid_tbl_len; k++)
{
ibv_gid_entry gidx;
if ((res = ibv_query_gid_ex(context, j, k, &gidx, 0)) != 0)
{
if (res != ENODATA)
{
fprintf(stderr, "Couldn't read RDMA device %s GID index %d: %s\n", ibv_get_device_name(dev), k, strerror(res));
goto cleanup;
}
else
break;
}
if (match_gid(&gidx, networks, nnet))
{
// Prefer RoCEv2
if (!best.rocev2)
{
best.dev = i;
best.port = j;
best.gid = k;
best.rocev2 = (gidx.gid_type == IBV_GID_TYPE_ROCE_V2);
best_gidx = gidx;
}
}
}
}
cleanup:
ibv_close_device(context);
if (best.rocev2)
{
break;
}
}
if (best.dev >= 0 && log_level > 0)
{
log_rdma_dev_port_gid(dev_list[best.dev], best.port, best.gid, best_gidx);
}
return best;
}
msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
{
int res;
ibv_device **dev_list = NULL;
@ -80,28 +200,23 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
clock_gettime(CLOCK_REALTIME, &tv);
srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
dev_list = ibv_get_device_list(NULL);
if (!dev_list)
if (!dev_list || !*dev_list)
{
if (errno == -ENOSYS || errno == ENOSYS)
{
if (log_level > 0)
fprintf(stderr, "No RDMA devices found (RDMA device list returned ENOSYS)\n");
}
else if (!*dev_list)
{
if (log_level > 0)
fprintf(stderr, "No RDMA devices found\n");
}
else
fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
goto cleanup;
}
if (!ib_devname)
{
ctx->dev = *dev_list;
if (!ctx->dev)
{
if (log_level > 0)
fprintf(stderr, "No RDMA devices found\n");
goto cleanup;
}
}
else
if (ib_devname)
{
int i;
for (i = 0; dev_list[i]; ++i)
@ -114,6 +229,31 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
goto cleanup;
}
}
else if (osd_networks.size())
{
std::vector<addr_mask_t> nets;
for (auto & netstr: osd_networks)
{
nets.push_back(cidr_parse(netstr));
}
auto best = match_device(dev_list, nets.data(), nets.size(), log_level);
if (best.dev < 0)
{
if (log_level > 0)
fprintf(stderr, "RDMA device matching osd_network is not found, using first available device\n");
best.dev = 0;
}
else
{
ib_port = best.port;
gid_index = best.gid;
}
ctx->dev = dev_list[best.dev];
}
else
{
ctx->dev = *dev_list;
}
ctx->context = ibv_open_device(ctx->dev);
if (!ctx->context)
@ -123,7 +263,6 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
}
ctx->ib_port = ib_port;
ctx->gid_index = gid_index;
if ((res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo)) != 0)
{
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res));
@ -135,10 +274,47 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
goto cleanup;
}
if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
if (gid_index != -1)
{
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
goto cleanup;
ctx->gid_index = gid_index;
if (ibv_query_gid_ex(ctx->context, ib_port, gid_index, &ctx->my_gid, 0))
{
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
goto cleanup;
}
}
else
{
// Auto-guess GID
for (int k = 0; k < ctx->portinfo.gid_tbl_len; k++)
{
ibv_gid_entry gidx;
if (ibv_query_gid_ex(ctx->context, ib_port, k, &gidx, 0) != 0)
{
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), k);
goto cleanup;
}
// Skip empty GID
if (((uint64_t*)gidx.gid.raw)[0] == 0 &&
((uint64_t*)gidx.gid.raw)[1] == 0)
{
continue;
}
// Prefer IPv4 RoCEv2 GID by default
if (gid_index == -1 ||
gidx.gid_type == IBV_GID_TYPE_ROCE_V2 &&
(ctx->my_gid.gid_type != IBV_GID_TYPE_ROCE_V2 || is_ipv4_gid(&gidx)))
{
gid_index = k;
ctx->my_gid = gidx;
}
}
ctx->gid_index = gid_index = (gid_index == -1 ? 0 : gid_index);
if (log_level > 0)
{
log_rdma_dev_port_gid(ctx->dev, ctx->ib_port, ctx->gid_index, ctx->my_gid);
}
}
ctx->pd = ibv_alloc_pd(ctx->context);
@ -255,7 +431,7 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
}
conn->addr.lid = ctx->my_lid;
conn->addr.gid = ctx->my_gid;
conn->addr.gid = ctx->my_gid.gid;
conn->addr.qpn = conn->qp->qp_num;
conn->addr.psn = lrand48() & 0xffffff;

View File

@ -31,12 +31,12 @@ struct msgr_rdma_context_t
uint8_t ib_port;
uint8_t gid_index;
uint16_t my_lid;
ibv_gid my_gid;
ibv_gid_entry my_gid;
uint32_t mtu;
int max_cqe = 0;
int used_max_cqe = 0;
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
static msgr_rdma_context_t *create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
~msgr_rdma_context_t();
};

View File

@ -216,7 +216,7 @@ resume_1:
for (uint64_t osd_num: node.child_osds)
{
auto & osd = placement_tree->osds.at(osd_num);
fmt_items.push_back(json11::Json::object{
auto json_osd = json11::Json::object{
{ "type", "osd" },
{ "name", osd.num },
{ "parent", node.name },
@ -230,7 +230,16 @@ resume_1:
{ "bitmap", (uint64_t)osd.bitmap_granularity },
{ "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") },
{ "op_stats", osd_stats[osd_num]["op_stats"] },
});
};
if (osd_stats[osd_num]["slow_ops_primary"].uint64_value() > 0)
{
json_osd["slow_ops_primary"] = osd_stats[osd_num]["slow_ops_primary"];
}
if (osd_stats[osd_num]["slow_ops_secondary"].uint64_value() > 0)
{
json_osd["slow_ops_secondary"] = osd_stats[osd_num]["slow_ops_secondary"];
}
fmt_items.push_back(json_osd);
}
}
result.data = fmt_items;

View File

@ -134,6 +134,7 @@ resume_2:
}
int osd_count = 0, osd_up = 0;
uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
std::vector<uint64_t> slow_op_primary_osds, slow_op_secondary_osds;
parent->iterate_kvs_1(osd_stats, "/osd/stats/", [&](uint64_t stat_osd_num, json11::Json value)
{
osd_count++;
@ -153,6 +154,14 @@ resume_2:
if (peer_it != parent->cli->st_cli.peer_states.end())
{
osd_up++;
if (value["slow_ops_primary"].uint64_value() > 0)
{
slow_op_primary_osds.push_back(stat_osd_num);
}
if (value["slow_ops_secondary"].uint64_value() > 0)
{
slow_op_secondary_osds.push_back(stat_osd_num);
}
}
else
{
@ -216,6 +225,10 @@ resume_2:
{ "mon_master", mon_master },
{ "osd_up", osd_up },
{ "osd_count", osd_count },
{ "osds_full", osds_full },
{ "osds_nearfull", osds_nearfull },
{ "osds_primary_slow_ops", slow_op_primary_osds },
{ "osds_secondary_slow_ops", slow_op_secondary_osds },
{ "total_raw", total_raw },
{ "free_raw", free_raw },
{ "down_raw", down_raw },
@ -300,6 +313,26 @@ resume_2:
warning_str += " "+std::to_string(osds_nearfull)+
(osds_nearfull > 1 ? " osds are almost full\n" : " osd is almost full\n");
}
if (slow_op_primary_osds.size() > 0)
{
warning_str += " "+std::to_string(slow_op_primary_osds.size());
warning_str += (slow_op_primary_osds.size() > 1 ? " osds have" : " osd has");
warning_str += " slow client ops: ";
for (int i = 0; i < slow_op_primary_osds.size(); i++)
{
warning_str += (i > 0 ? ", " : "")+std::to_string(slow_op_primary_osds[i])+"\n";
}
}
if (slow_op_secondary_osds.size() > 0)
{
warning_str += " "+std::to_string(slow_op_secondary_osds.size());
warning_str += (slow_op_secondary_osds.size() > 1 ? " osds have" : " osd has");
warning_str += " slow replication ops: ";
for (int i = 0; i < slow_op_secondary_osds.size(); i++)
{
warning_str += (i > 0 ? ", " : "")+std::to_string(slow_op_secondary_osds[i])+"\n";
}
}
if (warning_str != "")
{
warning_str = "\n warning:\n"+warning_str;

View File

@ -535,10 +535,12 @@ void osd_t::print_stats()
void osd_t::print_slow()
{
bool has_slow = false;
cur_slow_op_primary = 0;
cur_slow_op_secondary = 0;
char alloc[1024];
timespec now;
clock_gettime(CLOCK_REALTIME, &now);
// FIXME: Also track slow local blockstore ops and recovery/flush/scrub ops
for (auto & kv: msgr.clients)
{
for (auto op: kv.second->received_ops)
@ -608,6 +610,7 @@ void osd_t::print_slow()
op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ||
op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
{
cur_slow_op_secondary++;
bufprintf(" state=%d", op->bs_op ? PRIV(op->bs_op)->op_state : -1);
int wait_for = op->bs_op ? PRIV(op->bs_op)->wait_for : 0;
if (wait_for)
@ -618,15 +621,19 @@ void osd_t::print_slow()
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
op->req.hdr.opcode == OSD_OP_SYNC || op->req.hdr.opcode == OSD_OP_DELETE)
{
cur_slow_op_primary++;
bufprintf(" state=%d", !op->op_data ? -1 : op->op_data->st);
}
else
{
cur_slow_op_primary++;
}
#undef bufprintf
printf("%s\n", alloc);
has_slow = true;
}
}
}
if (has_slow && bs)
if ((cur_slow_op_primary+cur_slow_op_secondary) > 0 && bs)
{
bs->dump_diagnostics();
}

View File

@ -151,6 +151,8 @@ class osd_t
bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false;
int print_stats_timer_id = -1, slow_log_timer_id = -1;
uint64_t cur_slow_op_primary = 0;
uint64_t cur_slow_op_secondary = 0;
// peers and PGs

View File

@ -201,6 +201,14 @@ json11::Json osd_t::get_statistics()
st["immediate_commit"] = immediate_commit == IMMEDIATE_ALL ? "all" : (immediate_commit == IMMEDIATE_SMALL ? "small" : "none");
st["host"] = self_state["host"];
st["version"] = VITASTOR_VERSION;
if (cur_slow_op_primary > 0)
{
st["slow_ops_primary"] = cur_slow_op_primary;
}
if (cur_slow_op_secondary > 0)
{
st["slow_ops_secondary"] = cur_slow_op_secondary;
}
json11::Json::object op_stats, subop_stats;
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
{

View File

@ -12,6 +12,11 @@ target_link_libraries(stub_bench tcmalloc_minimal)
add_executable(osd_test osd_test.cpp ../util/rw_blocking.cpp ../util/addr_util.cpp)
target_link_libraries(osd_test tcmalloc_minimal)
# bindiff
add_executable(bindiff
bindiff.c
)
# stub_uring_osd
add_executable(stub_uring_osd
stub_uring_osd.cpp

177
src/test/bindiff.c Normal file
View File

@ -0,0 +1,177 @@
// Copyright (c) Vitaliy Filippov, 2004+
// License: VNPL-1.1 (see README.md for details)
#ifndef _LARGEFILE64_SOURCE
#define _LARGEFILE64_SOURCE
#endif
#include <string.h>
#include <sys/stat.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#define BUFSIZE 0x100000
uint64_t filelength(int fd)
{
struct stat st;
if (fstat(fd, &st) < 0)
{
fprintf(stderr, "fstat failed: %s\n", strerror(errno));
return 0;
}
if (st.st_size < 0)
{
return 0;
}
return (uint64_t)st.st_size;
}
size_t read_blocking(int fd, void *read_buf, size_t remaining)
{
size_t done = 0;
while (done < remaining)
{
ssize_t r = read(fd, read_buf, remaining-done);
if (r <= 0)
{
if (!errno)
{
// EOF
return done;
}
else if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
{
perror("read");
exit(1);
}
continue;
}
done += (size_t)r;
read_buf = (uint8_t*)read_buf + r;
}
return done;
}
size_t write_blocking(int fd, void *write_buf, size_t remaining)
{
size_t done = 0;
while (done < remaining)
{
ssize_t r = write(fd, write_buf, remaining-done);
if (r < 0)
{
if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
{
perror("write");
exit(1);
}
continue;
}
done += (size_t)r;
write_buf = (uint8_t*)write_buf + r;
}
return done;
}
int main(int narg, char *args[])
{
int fd1 = -1, fd2 = -1;
uint8_t *buf1 = NULL, *buf2 = NULL;
uint64_t addr = 0, l1 = 0, l2 = 0, l = 0, diffl = 0;
size_t buf1_len = 0, buf2_len = 0, i = 0, j = 0, dl = 0;
int argoff = 0;
int nosource = 0;
fprintf(stderr, "VMX HexDiff v2.1\nLicense: GPLv3.0+, (c) 2005+, Vitaliy Filippov\n");
argoff = 1;
if (narg > argoff && strcmp(args[argoff], "-n") == 0)
{
nosource = 1;
argoff++;
}
if (narg < argoff+2)
{
fprintf(stderr, "USAGE: bindiff [-n] <file1> <file2>\n"
"This will create hex patch file1->file2 and write it to stdout.\n"
"[-n] = do not write file1 data in patch, only file2.\n");
return -1;
}
fd1 = open(args[argoff], O_RDONLY);
if (fd1 < 0)
{
fprintf(stderr, "Couldn't open %s: %s\n", args[argoff], strerror(errno));
return -1;
}
fd2 = open(args[argoff+1], O_RDONLY);
if (fd2 < 0)
{
fprintf(stderr, "Couldn't open %s: %s\n", args[argoff+1], strerror(errno));
close(fd1);
return -1;
}
l1 = filelength(fd1);
l2 = filelength(fd2);
if (l1 < l2)
l = l1;
else
l = l2;
addr = diffl = 0;
buf1 = malloc(BUFSIZE+1);
buf2 = malloc(BUFSIZE+1);
while ((buf1_len = read_blocking(fd1, buf1, BUFSIZE)) > 0 && (buf2_len = read_blocking(fd2, buf2, BUFSIZE)) > 0)
{
buf1[buf1_len] = buf2[buf2_len] = 0;
for (dl = 0, i = 0; i <= buf1_len && i <= buf2_len; i++, addr++)
{
if (buf1[i] != buf2[i])
{
dl++;
}
else if (dl)
{
printf("%08jX: ", addr-dl);
if (!nosource)
{
for (j = i-dl; j < i; j++)
printf("%02X", buf1[j]);
printf(" ");
}
for (j = i-dl; j < i; j++)
printf("%02X", buf2[j]);
printf("\n");
diffl += dl;
dl = 0;
}
}
addr--;
}
if (l1 < l2)
{
printf("%08zX: ", i);
while ((buf2_len = read_blocking(fd2, buf2, BUFSIZE)) > 0)
{
for (j = 0; j < buf2_len; j++, i++)
printf("%02X", buf2[j]);
}
printf("\n");
}
else if (l1 > l2)
{
printf("SIZE %08zX\n", l2);
}
if (diffl != 0 || l1 != l2)
{
fprintf(stderr, "Difference in %zu of %zu common bytes\n", diffl, l);
if (l1 != l2)
fprintf(stderr, "Length difference!\nFile \"%s\": %zu\nFile \"%s\": %zu\n", args [1], l1, args [2], l2);
}
else
{
fprintf(stderr, "Files are equal\n");
}
return 0;
}

View File

@ -65,7 +65,7 @@ std::string addr_to_string(const sockaddr_storage &addr)
return std::string(peer_str)+":"+std::to_string(port);
}
static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
{
if (bits == 0)
{
@ -75,7 +75,7 @@ static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
return !((addr.s_addr ^ net.s_addr) & htonl(0xFFFFFFFFu << (32 - bits)));
}
static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
{
const uint32_t *a = address.s6_addr32;
const uint32_t *n = network.s6_addr32;
@ -93,47 +93,49 @@ static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_
return true;
}
struct addr_mask_t
addr_mask_t cidr_parse(std::string mask)
{
sa_family_t family;
unsigned bits = 255;
int p = mask.find('/');
if (p != std::string::npos)
{
char null_byte = 0;
if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
throw std::runtime_error("Invalid IP address mask: " + mask);
mask = mask.substr(0, p);
}
in_addr ipv4;
in6_addr ipv6;
uint8_t bits;
};
if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
{
if (bits == 255)
bits = 32;
if (bits > 32)
throw std::runtime_error("Invalid IP address mask: " + mask);
return (addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)(bits ? bits : 32) };
}
else if (inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
{
if (bits == 255)
bits = 128;
return (addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits };
}
else
{
throw std::runtime_error("Invalid IP address mask: " + mask);
}
}
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool include_v6)
{
std::vector<addr_mask_t> masks;
for (auto mask: mask_cfg)
{
unsigned bits = 0;
int p = mask.find('/');
if (p != std::string::npos)
masks.push_back(cidr_parse(mask));
if (masks[masks.size()-1].family == AF_INET6)
{
char null_byte = 0;
if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
mask = mask.substr(0, p);
}
in_addr ipv4;
in6_addr ipv6;
if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
{
if (bits > 32)
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
masks.push_back((addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)bits });
}
else if (include_v6 && inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
{
masks.push_back((addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits });
}
else
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
// Auto-enable IPv6 addresses
include_v6 = true;
}
}
std::set<std::string> addresses;

View File

@ -1,10 +1,22 @@
#pragma once
#include <netinet/in.h>
#include <sys/socket.h>
#include <string>
#include <vector>
struct addr_mask_t
{
sa_family_t family;
in_addr ipv4;
in6_addr ipv6;
uint8_t bits;
};
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr);
std::string addr_to_string(const sockaddr_storage &addr);
addr_mask_t cidr_parse(std::string mask);
bool cidr_match(const in_addr &address, const in_addr &network, uint8_t bits);
bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits);
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port);

View File

@ -10,7 +10,7 @@
#include "rw_blocking.h"
int read_blocking(int fd, void *read_buf, size_t remaining)
size_t read_blocking(int fd, void *read_buf, size_t remaining)
{
size_t done = 0;
while (done < remaining)
@ -30,13 +30,13 @@ int read_blocking(int fd, void *read_buf, size_t remaining)
}
continue;
}
done += r;
done += (size_t)r;
read_buf = (uint8_t*)read_buf + r;
}
return done;
}
int write_blocking(int fd, void *write_buf, size_t remaining)
size_t write_blocking(int fd, void *write_buf, size_t remaining)
{
size_t done = 0;
while (done < remaining)
@ -51,7 +51,7 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
}
continue;
}
done += r;
done += (size_t)r;
write_buf = (uint8_t*)write_buf + r;
}
return done;

View File

@ -6,8 +6,8 @@
#include <unistd.h>
#include <sys/uio.h>
int read_blocking(int fd, void *read_buf, size_t remaining);
int write_blocking(int fd, void *write_buf, size_t remaining);
size_t read_blocking(int fd, void *read_buf, size_t remaining);
size_t write_blocking(int fd, void *write_buf, size_t remaining);
int readv_blocking(int fd, iovec *iov, int iovcnt);
int writev_blocking(int fd, iovec *iov, int iovcnt);
int sendv_blocking(int fd, iovec *iov, int iovcnt, int flags);

View File

@ -60,6 +60,7 @@ qemu-img convert -S 4096 -p \
-O raw ./testdata/bin/read.bin
if ! diff -q ./testdata/bin/read.bin ./testdata/bin/mirror.bin; then
build/src/test/bindiff ./testdata/bin/read.bin ./testdata/bin/mirror.bin
format_error Data lost during self-heal
fi