Compare commits
4 Commits
5a8f80159f
...
1c78df4fba
Author | SHA1 | Date |
---|---|---|
Vitaliy Filippov | 1c78df4fba | |
Vitaliy Filippov | 7de38250ad | |
Vitaliy Filippov | 9c59d30e83 | |
Vitaliy Filippov | 5db02cdf6e |
|
@ -0,0 +1,193 @@
|
|||
Index: pve-qemu-kvm-9.0.0/block/meson.build
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-9.0.0.orig/block/meson.build
|
||||
+++ pve-qemu-kvm-9.0.0/block/meson.build
|
||||
@@ -126,6 +126,7 @@ foreach m : [
|
||||
[libnfs, 'nfs', files('nfs.c')],
|
||||
[libssh, 'ssh', files('ssh.c')],
|
||||
[rbd, 'rbd', files('rbd.c')],
|
||||
+ [vitastor, 'vitastor', files('vitastor.c')],
|
||||
]
|
||||
if m[0].found()
|
||||
module_ss = ss.source_set()
|
||||
Index: pve-qemu-kvm-9.0.0/meson.build
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-9.0.0.orig/meson.build
|
||||
+++ pve-qemu-kvm-9.0.0/meson.build
|
||||
@@ -1452,6 +1452,26 @@ if not get_option('rbd').auto() or have_
|
||||
endif
|
||||
endif
|
||||
|
||||
+vitastor = not_found
|
||||
+if not get_option('vitastor').auto() or have_block
|
||||
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
|
||||
+ required: get_option('vitastor'))
|
||||
+ if libvitastor_client.found()
|
||||
+ if cc.links('''
|
||||
+ #include <vitastor_c.h>
|
||||
+ int main(void) {
|
||||
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
+ return 0;
|
||||
+ }''', dependencies: libvitastor_client)
|
||||
+ vitastor = declare_dependency(dependencies: libvitastor_client)
|
||||
+ elif get_option('vitastor').enabled()
|
||||
+ error('could not link libvitastor_client')
|
||||
+ else
|
||||
+ warning('could not link libvitastor_client, disabling')
|
||||
+ endif
|
||||
+ endif
|
||||
+endif
|
||||
+
|
||||
glusterfs = not_found
|
||||
glusterfs_ftruncate_has_stat = false
|
||||
glusterfs_iocb_has_stat = false
|
||||
@@ -2254,6 +2274,7 @@ endif
|
||||
config_host_data.set('CONFIG_OPENGL', opengl.found())
|
||||
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
|
||||
config_host_data.set('CONFIG_RBD', rbd.found())
|
||||
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
|
||||
config_host_data.set('CONFIG_RDMA', rdma.found())
|
||||
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
|
||||
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
|
||||
@@ -4454,6 +4475,7 @@ summary_info += {'fdt support': fd
|
||||
summary_info += {'libcap-ng support': libcap_ng}
|
||||
summary_info += {'bpf support': libbpf}
|
||||
summary_info += {'rbd support': rbd}
|
||||
+summary_info += {'vitastor support': vitastor}
|
||||
summary_info += {'smartcard support': cacard}
|
||||
summary_info += {'U2F support': u2f}
|
||||
summary_info += {'libusb': libusb}
|
||||
Index: pve-qemu-kvm-9.0.0/meson_options.txt
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-9.0.0.orig/meson_options.txt
|
||||
+++ pve-qemu-kvm-9.0.0/meson_options.txt
|
||||
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value :
|
||||
description: 'lzo compression support')
|
||||
option('rbd', type : 'feature', value : 'auto',
|
||||
description: 'Ceph block device driver')
|
||||
+option('vitastor', type : 'feature', value : 'auto',
|
||||
+ description: 'Vitastor block device driver')
|
||||
option('opengl', type : 'feature', value : 'auto',
|
||||
description: 'OpenGL support')
|
||||
option('rdma', type : 'feature', value : 'auto',
|
||||
Index: pve-qemu-kvm-9.0.0/qapi/block-core.json
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-9.0.0.orig/qapi/block-core.json
|
||||
+++ pve-qemu-kvm-9.0.0/qapi/block-core.json
|
||||
@@ -3481,7 +3481,7 @@
|
||||
'raw', 'rbd',
|
||||
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
|
||||
'pbs',
|
||||
- 'ssh', 'throttle', 'vdi', 'vhdx',
|
||||
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
|
||||
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
|
||||
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
|
||||
@@ -4591,6 +4591,28 @@
|
||||
'*server': ['InetSocketAddressBase'] } }
|
||||
|
||||
##
|
||||
+# @BlockdevOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific block device options for vitastor
|
||||
+#
|
||||
+# @image: Image name
|
||||
+# @inode: Inode number
|
||||
+# @pool: Pool ID
|
||||
+# @size: Desired image size in bytes
|
||||
+# @config-path: Path to Vitastor configuration
|
||||
+# @etcd-host: etcd connection address(es)
|
||||
+# @etcd-prefix: etcd key/value prefix
|
||||
+##
|
||||
+{ 'struct': 'BlockdevOptionsVitastor',
|
||||
+ 'data': { '*inode': 'uint64',
|
||||
+ '*pool': 'uint64',
|
||||
+ '*size': 'uint64',
|
||||
+ '*image': 'str',
|
||||
+ '*config-path': 'str',
|
||||
+ '*etcd-host': 'str',
|
||||
+ '*etcd-prefix': 'str' } }
|
||||
+
|
||||
+##
|
||||
# @ReplicationMode:
|
||||
#
|
||||
# An enumeration of replication modes.
|
||||
@@ -5053,6 +5075,7 @@
|
||||
'throttle': 'BlockdevOptionsThrottle',
|
||||
'vdi': 'BlockdevOptionsGenericFormat',
|
||||
'vhdx': 'BlockdevOptionsGenericFormat',
|
||||
+ 'vitastor': 'BlockdevOptionsVitastor',
|
||||
'virtio-blk-vfio-pci':
|
||||
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
|
||||
'if': 'CONFIG_BLKIO' },
|
||||
@@ -5498,6 +5521,20 @@
|
||||
'*encrypt' : 'RbdEncryptionCreateOptions' } }
|
||||
|
||||
##
|
||||
+# @BlockdevCreateOptionsVitastor:
|
||||
+#
|
||||
+# Driver specific image creation options for Vitastor.
|
||||
+#
|
||||
+# @location: Where to store the new image file. This location cannot
|
||||
+# point to a snapshot.
|
||||
+#
|
||||
+# @size: Size of the virtual disk in bytes
|
||||
+##
|
||||
+{ 'struct': 'BlockdevCreateOptionsVitastor',
|
||||
+ 'data': { 'location': 'BlockdevOptionsVitastor',
|
||||
+ 'size': 'size' } }
|
||||
+
|
||||
+##
|
||||
# @BlockdevVmdkSubformat:
|
||||
#
|
||||
# Subformat options for VMDK images
|
||||
@@ -5719,6 +5753,7 @@
|
||||
'ssh': 'BlockdevCreateOptionsSsh',
|
||||
'vdi': 'BlockdevCreateOptionsVdi',
|
||||
'vhdx': 'BlockdevCreateOptionsVhdx',
|
||||
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
|
||||
'vmdk': 'BlockdevCreateOptionsVmdk',
|
||||
'vpc': 'BlockdevCreateOptionsVpc'
|
||||
} }
|
||||
Index: pve-qemu-kvm-9.0.0/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-9.0.0.orig/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
+++ pve-qemu-kvm-9.0.0/scripts/ci/org.centos/stream/8/x86_64/configure
|
||||
@@ -30,7 +30,7 @@
|
||||
--with-suffix="qemu-kvm" \
|
||||
--firmwarepath=/usr/share/qemu-firmware \
|
||||
--target-list="x86_64-softmmu" \
|
||||
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
|
||||
--audio-drv-list="" \
|
||||
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
|
||||
--with-coroutine=ucontext \
|
||||
@@ -176,6 +176,7 @@
|
||||
--enable-opengl \
|
||||
--enable-pie \
|
||||
--enable-rbd \
|
||||
+--enable-vitastor \
|
||||
--enable-rdma \
|
||||
--enable-seccomp \
|
||||
--enable-snappy \
|
||||
Index: pve-qemu-kvm-9.0.0/scripts/meson-buildoptions.sh
|
||||
===================================================================
|
||||
--- pve-qemu-kvm-9.0.0.orig/scripts/meson-buildoptions.sh
|
||||
+++ pve-qemu-kvm-9.0.0/scripts/meson-buildoptions.sh
|
||||
@@ -168,6 +168,7 @@ meson_options_help() {
|
||||
printf "%s\n" ' qed qed image format support'
|
||||
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
|
||||
printf "%s\n" ' rbd Ceph block device driver'
|
||||
+ printf "%s\n" ' vitastor Vitastor block device driver'
|
||||
printf "%s\n" ' rdma Enable RDMA-based migration'
|
||||
printf "%s\n" ' replication replication support'
|
||||
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
|
||||
@@ -445,6 +446,8 @@ _meson_option_parse() {
|
||||
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
|
||||
--enable-rbd) printf "%s" -Drbd=enabled ;;
|
||||
--disable-rbd) printf "%s" -Drbd=disabled ;;
|
||||
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
|
||||
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
|
||||
--enable-rdma) printf "%s" -Drdma=enabled ;;
|
||||
--disable-rdma) printf "%s" -Drdma=disabled ;;
|
||||
--enable-relocatable) printf "%s" -Drelocatable=true ;;
|
|
@ -176,7 +176,7 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
|
|||
exit(1);
|
||||
}
|
||||
if (!local_ips.size())
|
||||
local_ips = getifaddr_list();
|
||||
local_ips = getifaddr_list(std::vector<std::string>(), true);
|
||||
std::string check_addr;
|
||||
int pos = addr.find('/');
|
||||
int pos2 = addr.find(':');
|
||||
|
|
|
@ -121,7 +121,7 @@ void osd_messenger_t::init()
|
|||
if (use_rdma)
|
||||
{
|
||||
rdma_context = msgr_rdma_context_t::create(
|
||||
rdma_device != "" ? rdma_device.c_str() : NULL,
|
||||
osd_networks, rdma_device != "" ? rdma_device.c_str() : NULL,
|
||||
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
|
||||
);
|
||||
if (!rdma_context)
|
||||
|
@ -266,6 +266,7 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
|||
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
|
||||
if (!this->rdma_port_num)
|
||||
this->rdma_port_num = 1;
|
||||
if (!config["rdma_gid_index"].is_null())
|
||||
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
|
||||
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
|
||||
this->rdma_max_sge = config["rdma_max_sge"].uint64_value();
|
||||
|
@ -281,6 +282,15 @@ void osd_messenger_t::parse_config(const json11::Json & config)
|
|||
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
|
||||
this->rdma_max_msg = 129*1024;
|
||||
this->rdma_odp = config["rdma_odp"].bool_value();
|
||||
std::vector<std::string> mask;
|
||||
if (config["bind_address"].is_string())
|
||||
mask.push_back(config["bind_address"].string_value());
|
||||
else if (config["osd_network"].is_string())
|
||||
mask.push_back(config["osd_network"].string_value());
|
||||
else
|
||||
for (auto v: config["osd_network"].array_items())
|
||||
mask.push_back(v.string_value());
|
||||
this->osd_networks = mask;
|
||||
#endif
|
||||
if (!osd_num)
|
||||
this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value();
|
||||
|
|
|
@ -165,8 +165,9 @@ protected:
|
|||
|
||||
#ifdef WITH_RDMA
|
||||
bool use_rdma = true;
|
||||
std::vector<std::string> osd_networks;
|
||||
std::string rdma_device;
|
||||
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
|
||||
uint64_t rdma_port_num = 1, rdma_gid_index = -1, rdma_mtu = 0;
|
||||
msgr_rdma_context_t *rdma_context = NULL;
|
||||
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
|
||||
uint64_t rdma_max_msg = 0;
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "addr_util.h"
|
||||
#include "msgr_rdma.h"
|
||||
#include "messenger.h"
|
||||
|
||||
|
@ -69,7 +70,126 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
|||
send_out_size = 0;
|
||||
}
|
||||
|
||||
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
|
||||
// Tell whether a GID table entry holds an IPv4-mapped IPv6 address (::ffff:a.b.c.d):
// 10 zero bytes followed by 0xff 0xff, with the IPv4 address in the last 4 bytes.
// The prefix is compared byte by byte, so the result does not depend on the host
// byte order (a 32-bit compare against 0xffff0000 is only correct on little-endian).
static bool is_ipv4_gid(ibv_gid_entry *gidx)
{
    const uint8_t *raw = gidx->gid.raw;
    for (int i = 0; i < 10; i++)
    {
        if (raw[i] != 0)
        {
            return false;
        }
    }
    return raw[10] == 0xff && raw[11] == 0xff;
}
|
||||
|
||||
// Check whether a GID table entry belongs to one of the <nnet> networks in <networks>.
// Only non-empty RoCEv1/RoCEv2 GIDs are considered. IPv4-mapped GIDs are compared
// with AF_INET networks (using the last 4 bytes of the GID), all other GIDs are
// compared with AF_INET6 networks (using all 16 bytes).
static bool match_gid(ibv_gid_entry *gidx, addr_mask_t *networks, int nnet)
{
    const bool is_roce = (gidx->gid_type == IBV_GID_TYPE_ROCE_V1 ||
        gidx->gid_type == IBV_GID_TYPE_ROCE_V2);
    if (!is_roce)
    {
        return false;
    }
    const uint64_t *halves = (uint64_t*)gidx->gid.raw;
    if (halves[0] == 0 && halves[1] == 0)
    {
        // All-zero GID
        return false;
    }
    const bool mapped_v4 = is_ipv4_gid(gidx);
    for (int n = 0; n < nnet; n++)
    {
        addr_mask_t & net = networks[n];
        if (mapped_v4)
        {
            if (net.family == AF_INET && cidr_match(*(in_addr*)(gidx->gid.raw+12), net.ipv4, net.bits))
            {
                return true;
            }
        }
        else if (net.family == AF_INET6 && cidr6_match(*(in6_addr*)gidx->gid.raw, net.ipv6, net.bits))
        {
            return true;
        }
    }
    return false;
}
|
||||
|
||||
// Result of searching the RDMA device list by network address.
// All indexes stay at -1 until match_device() finds a matching GID.
struct matched_dev
{
    // Index of the device in the device list given to match_device()
    int dev{-1};
    // Port number of the matched GID on that device
    int port{-1};
    // Index of the matched GID in the port's GID table
    int gid{-1};
    // true if the matched GID has type IBV_GID_TYPE_ROCE_V2
    bool rocev2{false};
};
|
||||
|
||||
// Print the automatically selected RDMA device, port and GID index to stderr,
// together with the RoCE version (2 for IBV_GID_TYPE_ROCE_V2, otherwise 1) and
// the IP address stored in the GID in text form.
static void log_rdma_dev_port_gid(ibv_device *dev, int ib_port, int gid_index, ibv_gid_entry & gidx)
{
    // Use the same IPv4-mapped GID check as GID matching does, so the logged
    // address family can never disagree with the one used for selection
    const bool is4 = is_ipv4_gid(&gidx);
    char buf[256];
    // For IPv4-mapped GIDs the IPv4 address is in the last 4 bytes of the GID
    if (!inet_ntop(is4 ? AF_INET : AF_INET6, is4 ? gidx.gid.raw+12 : gidx.gid.raw, buf, sizeof(buf)))
    {
        // inet_ntop() leaves the buffer unspecified on failure - never print it as is
        snprintf(buf, sizeof(buf), "<unknown>");
    }
    fprintf(
        stderr, "Auto-selected RDMA device %s port %d GID %d - ROCEv%d IPv%d %s\n",
        ibv_get_device_name(dev), ib_port, gid_index,
        gidx.gid_type == IBV_GID_TYPE_ROCE_V2 ? 2 : 1, is4 ? 4 : 6, buf
    );
}
|
||||
|
||||
// Search <dev_list> (NULL-terminated) for a device, port and GID index whose GID
// address belongs to one of the <nnet> networks in <networks>.
//
// Every device is opened, all of its ports (numbered from 1) and GID table entries
// are scanned, and the device is closed again. A RoCEv2 GID is preferred: a RoCEv1
// match may be replaced by any later match, and the search stops after the device
// on which a RoCEv2 match was found. If a device can't be opened it is skipped;
// if a query fails, the rest of that device is skipped.
//
// Returns the match; <dev> is -1 in the result when nothing matched.
// The selected device/port/GID is logged to stderr when <log_level> is positive.
static matched_dev match_device(ibv_device **dev_list, addr_mask_t *networks, int nnet, int log_level)
{
    matched_dev best;
    ibv_device_attr attr;
    ibv_port_attr portinfo;
    // Only meaningful when best.dev >= 0
    ibv_gid_entry best_gidx = {};
    int res;
    for (int i = 0; dev_list[i]; ++i)
    {
        auto dev = dev_list[i];
        ibv_context *context = ibv_open_device(dev);
        if (!context)
        {
            // ibv_open_device() returns NULL on failure, and a NULL context
            // must not be passed to ibv_query_device() or ibv_close_device()
            fprintf(stderr, "Couldn't get RDMA context for %s\n", ibv_get_device_name(dev));
            continue;
        }
        if ((res = ibv_query_device(context, &attr)) != 0)
        {
            fprintf(stderr, "Couldn't query RDMA device %s for its features: %s\n", ibv_get_device_name(dev), strerror(res));
            goto cleanup;
        }
        for (int j = 1; j <= attr.phys_port_cnt; j++)
        {
            // Try to find a port with matching address
            if ((res = ibv_query_port(context, j, &portinfo)) != 0)
            {
                fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(dev), j, strerror(res));
                goto cleanup;
            }
            for (int k = 0; k < portinfo.gid_tbl_len; k++)
            {
                ibv_gid_entry gidx;
                if ((res = ibv_query_gid_ex(context, j, k, &gidx, 0)) != 0)
                {
                    if (res != ENODATA)
                    {
                        fprintf(stderr, "Couldn't read RDMA device %s GID index %d: %s\n", ibv_get_device_name(dev), k, strerror(res));
                        goto cleanup;
                    }
                    else
                    {
                        // ENODATA: stop scanning the GID table of this port
                        break;
                    }
                }
                if (match_gid(&gidx, networks, nnet))
                {
                    // Prefer RoCEv2: once a RoCEv2 GID is selected it is never replaced
                    if (!best.rocev2)
                    {
                        best.dev = i;
                        best.port = j;
                        best.gid = k;
                        best.rocev2 = (gidx.gid_type == IBV_GID_TYPE_ROCE_V2);
                        best_gidx = gidx;
                    }
                }
            }
        }
cleanup:
        ibv_close_device(context);
        if (best.rocev2)
        {
            break;
        }
    }
    if (best.dev >= 0 && log_level > 0)
    {
        log_rdma_dev_port_gid(dev_list[best.dev], best.port, best.gid, best_gidx);
    }
    return best;
}
|
||||
|
||||
msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
|
||||
{
|
||||
int res;
|
||||
ibv_device **dev_list = NULL;
|
||||
|
@ -80,28 +200,23 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
|
|||
clock_gettime(CLOCK_REALTIME, &tv);
|
||||
srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
|
||||
dev_list = ibv_get_device_list(NULL);
|
||||
if (!dev_list)
|
||||
if (!dev_list || !*dev_list)
|
||||
{
|
||||
if (errno == -ENOSYS || errno == ENOSYS)
|
||||
{
|
||||
if (log_level > 0)
|
||||
fprintf(stderr, "No RDMA devices found (RDMA device list returned ENOSYS)\n");
|
||||
}
|
||||
else if (!*dev_list)
|
||||
{
|
||||
if (log_level > 0)
|
||||
fprintf(stderr, "No RDMA devices found\n");
|
||||
}
|
||||
else
|
||||
fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
|
||||
goto cleanup;
|
||||
}
|
||||
if (!ib_devname)
|
||||
{
|
||||
ctx->dev = *dev_list;
|
||||
if (!ctx->dev)
|
||||
{
|
||||
if (log_level > 0)
|
||||
fprintf(stderr, "No RDMA devices found\n");
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
else
|
||||
if (ib_devname)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; dev_list[i]; ++i)
|
||||
|
@ -114,6 +229,31 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
|
|||
goto cleanup;
|
||||
}
|
||||
}
|
||||
else if (osd_networks.size())
|
||||
{
|
||||
std::vector<addr_mask_t> nets;
|
||||
for (auto & netstr: osd_networks)
|
||||
{
|
||||
nets.push_back(cidr_parse(netstr));
|
||||
}
|
||||
auto best = match_device(dev_list, nets.data(), nets.size(), log_level);
|
||||
if (best.dev < 0)
|
||||
{
|
||||
if (log_level > 0)
|
||||
fprintf(stderr, "RDMA device matching osd_network is not found, using first available device\n");
|
||||
best.dev = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
ib_port = best.port;
|
||||
gid_index = best.gid;
|
||||
}
|
||||
ctx->dev = dev_list[best.dev];
|
||||
}
|
||||
else
|
||||
{
|
||||
ctx->dev = *dev_list;
|
||||
}
|
||||
|
||||
ctx->context = ibv_open_device(ctx->dev);
|
||||
if (!ctx->context)
|
||||
|
@ -123,7 +263,6 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
|
|||
}
|
||||
|
||||
ctx->ib_port = ib_port;
|
||||
ctx->gid_index = gid_index;
|
||||
if ((res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo)) != 0)
|
||||
{
|
||||
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res));
|
||||
|
@ -135,11 +274,48 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
|
|||
fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
|
||||
goto cleanup;
|
||||
}
|
||||
if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
|
||||
|
||||
if (gid_index != -1)
|
||||
{
|
||||
ctx->gid_index = gid_index;
|
||||
if (ibv_query_gid_ex(ctx->context, ib_port, gid_index, &ctx->my_gid, 0))
|
||||
{
|
||||
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Auto-guess GID
|
||||
for (int k = 0; k < ctx->portinfo.gid_tbl_len; k++)
|
||||
{
|
||||
ibv_gid_entry gidx;
|
||||
if (ibv_query_gid_ex(ctx->context, ib_port, k, &gidx, 0) != 0)
|
||||
{
|
||||
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), k);
|
||||
goto cleanup;
|
||||
}
|
||||
// Skip empty GID
|
||||
if (((uint64_t*)gidx.gid.raw)[0] == 0 &&
|
||||
((uint64_t*)gidx.gid.raw)[1] == 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// Prefer IPv4 RoCEv2 GID by default
|
||||
if (gid_index == -1 ||
|
||||
gidx.gid_type == IBV_GID_TYPE_ROCE_V2 &&
|
||||
(ctx->my_gid.gid_type != IBV_GID_TYPE_ROCE_V2 || is_ipv4_gid(&gidx)))
|
||||
{
|
||||
gid_index = k;
|
||||
ctx->my_gid = gidx;
|
||||
}
|
||||
}
|
||||
ctx->gid_index = gid_index = (gid_index == -1 ? 0 : gid_index);
|
||||
if (log_level > 0)
|
||||
{
|
||||
log_rdma_dev_port_gid(ctx->dev, ctx->ib_port, ctx->gid_index, ctx->my_gid);
|
||||
}
|
||||
}
|
||||
|
||||
ctx->pd = ibv_alloc_pd(ctx->context);
|
||||
if (!ctx->pd)
|
||||
|
@ -255,7 +431,7 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
|
|||
}
|
||||
|
||||
conn->addr.lid = ctx->my_lid;
|
||||
conn->addr.gid = ctx->my_gid;
|
||||
conn->addr.gid = ctx->my_gid.gid;
|
||||
conn->addr.qpn = conn->qp->qp_num;
|
||||
conn->addr.psn = lrand48() & 0xffffff;
|
||||
|
||||
|
|
|
@ -31,12 +31,12 @@ struct msgr_rdma_context_t
|
|||
uint8_t ib_port;
|
||||
uint8_t gid_index;
|
||||
uint16_t my_lid;
|
||||
ibv_gid my_gid;
|
||||
ibv_gid_entry my_gid;
|
||||
uint32_t mtu;
|
||||
int max_cqe = 0;
|
||||
int used_max_cqe = 0;
|
||||
|
||||
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
|
||||
static msgr_rdma_context_t *create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
|
||||
~msgr_rdma_context_t();
|
||||
};
|
||||
|
||||
|
|
|
@ -216,7 +216,7 @@ resume_1:
|
|||
for (uint64_t osd_num: node.child_osds)
|
||||
{
|
||||
auto & osd = placement_tree->osds.at(osd_num);
|
||||
fmt_items.push_back(json11::Json::object{
|
||||
auto json_osd = json11::Json::object{
|
||||
{ "type", "osd" },
|
||||
{ "name", osd.num },
|
||||
{ "parent", node.name },
|
||||
|
@ -230,7 +230,16 @@ resume_1:
|
|||
{ "bitmap", (uint64_t)osd.bitmap_granularity },
|
||||
{ "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") },
|
||||
{ "op_stats", osd_stats[osd_num]["op_stats"] },
|
||||
});
|
||||
};
|
||||
if (osd_stats[osd_num]["slow_ops_primary"].uint64_value() > 0)
|
||||
{
|
||||
json_osd["slow_ops_primary"] = osd_stats[osd_num]["slow_ops_primary"];
|
||||
}
|
||||
if (osd_stats[osd_num]["slow_ops_secondary"].uint64_value() > 0)
|
||||
{
|
||||
json_osd["slow_ops_secondary"] = osd_stats[osd_num]["slow_ops_secondary"];
|
||||
}
|
||||
fmt_items.push_back(json_osd);
|
||||
}
|
||||
}
|
||||
result.data = fmt_items;
|
||||
|
|
|
@ -134,6 +134,7 @@ resume_2:
|
|||
}
|
||||
int osd_count = 0, osd_up = 0;
|
||||
uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
|
||||
std::vector<uint64_t> slow_op_primary_osds, slow_op_secondary_osds;
|
||||
parent->iterate_kvs_1(osd_stats, "/osd/stats/", [&](uint64_t stat_osd_num, json11::Json value)
|
||||
{
|
||||
osd_count++;
|
||||
|
@ -153,6 +154,14 @@ resume_2:
|
|||
if (peer_it != parent->cli->st_cli.peer_states.end())
|
||||
{
|
||||
osd_up++;
|
||||
if (value["slow_ops_primary"].uint64_value() > 0)
|
||||
{
|
||||
slow_op_primary_osds.push_back(stat_osd_num);
|
||||
}
|
||||
if (value["slow_ops_secondary"].uint64_value() > 0)
|
||||
{
|
||||
slow_op_secondary_osds.push_back(stat_osd_num);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -216,6 +225,10 @@ resume_2:
|
|||
{ "mon_master", mon_master },
|
||||
{ "osd_up", osd_up },
|
||||
{ "osd_count", osd_count },
|
||||
{ "osds_full", osds_full },
|
||||
{ "osds_nearfull", osds_nearfull },
|
||||
{ "osds_primary_slow_ops", slow_op_primary_osds },
|
||||
{ "osds_secondary_slow_ops", slow_op_secondary_osds },
|
||||
{ "total_raw", total_raw },
|
||||
{ "free_raw", free_raw },
|
||||
{ "down_raw", down_raw },
|
||||
|
@ -300,6 +313,26 @@ resume_2:
|
|||
warning_str += " "+std::to_string(osds_nearfull)+
|
||||
(osds_nearfull > 1 ? " osds are almost full\n" : " osd is almost full\n");
|
||||
}
|
||||
if (slow_op_primary_osds.size() > 0)
|
||||
{
|
||||
warning_str += " "+std::to_string(slow_op_primary_osds.size());
|
||||
warning_str += (slow_op_primary_osds.size() > 1 ? " osds have" : " osd has");
|
||||
warning_str += " slow client ops: ";
|
||||
for (int i = 0; i < slow_op_primary_osds.size(); i++)
|
||||
{
|
||||
warning_str += (i > 0 ? ", " : "")+std::to_string(slow_op_primary_osds[i])+"\n";
|
||||
}
|
||||
}
|
||||
if (slow_op_secondary_osds.size() > 0)
|
||||
{
|
||||
warning_str += " "+std::to_string(slow_op_secondary_osds.size());
|
||||
warning_str += (slow_op_secondary_osds.size() > 1 ? " osds have" : " osd has");
|
||||
warning_str += " slow replication ops: ";
|
||||
for (int i = 0; i < slow_op_secondary_osds.size(); i++)
|
||||
{
|
||||
warning_str += (i > 0 ? ", " : "")+std::to_string(slow_op_secondary_osds[i])+"\n";
|
||||
}
|
||||
}
|
||||
if (warning_str != "")
|
||||
{
|
||||
warning_str = "\n warning:\n"+warning_str;
|
||||
|
|
|
@ -535,10 +535,12 @@ void osd_t::print_stats()
|
|||
|
||||
void osd_t::print_slow()
|
||||
{
|
||||
bool has_slow = false;
|
||||
cur_slow_op_primary = 0;
|
||||
cur_slow_op_secondary = 0;
|
||||
char alloc[1024];
|
||||
timespec now;
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
// FIXME: Also track slow local blockstore ops and recovery/flush/scrub ops
|
||||
for (auto & kv: msgr.clients)
|
||||
{
|
||||
for (auto op: kv.second->received_ops)
|
||||
|
@ -608,6 +610,7 @@ void osd_t::print_slow()
|
|||
op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ||
|
||||
op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
|
||||
{
|
||||
cur_slow_op_secondary++;
|
||||
bufprintf(" state=%d", op->bs_op ? PRIV(op->bs_op)->op_state : -1);
|
||||
int wait_for = op->bs_op ? PRIV(op->bs_op)->wait_for : 0;
|
||||
if (wait_for)
|
||||
|
@ -618,15 +621,19 @@ void osd_t::print_slow()
|
|||
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
|
||||
op->req.hdr.opcode == OSD_OP_SYNC || op->req.hdr.opcode == OSD_OP_DELETE)
|
||||
{
|
||||
cur_slow_op_primary++;
|
||||
bufprintf(" state=%d", !op->op_data ? -1 : op->op_data->st);
|
||||
}
|
||||
else
|
||||
{
|
||||
cur_slow_op_primary++;
|
||||
}
|
||||
#undef bufprintf
|
||||
printf("%s\n", alloc);
|
||||
has_slow = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (has_slow && bs)
|
||||
if ((cur_slow_op_primary+cur_slow_op_secondary) > 0 && bs)
|
||||
{
|
||||
bs->dump_diagnostics();
|
||||
}
|
||||
|
|
|
@ -151,6 +151,8 @@ class osd_t
|
|||
bool etcd_reporting_pg_state = false;
|
||||
bool etcd_reporting_stats = false;
|
||||
int print_stats_timer_id = -1, slow_log_timer_id = -1;
|
||||
uint64_t cur_slow_op_primary = 0;
|
||||
uint64_t cur_slow_op_secondary = 0;
|
||||
|
||||
// peers and PGs
|
||||
|
||||
|
|
|
@ -201,6 +201,14 @@ json11::Json osd_t::get_statistics()
|
|||
st["immediate_commit"] = immediate_commit == IMMEDIATE_ALL ? "all" : (immediate_commit == IMMEDIATE_SMALL ? "small" : "none");
|
||||
st["host"] = self_state["host"];
|
||||
st["version"] = VITASTOR_VERSION;
|
||||
if (cur_slow_op_primary > 0)
|
||||
{
|
||||
st["slow_ops_primary"] = cur_slow_op_primary;
|
||||
}
|
||||
if (cur_slow_op_secondary > 0)
|
||||
{
|
||||
st["slow_ops_secondary"] = cur_slow_op_secondary;
|
||||
}
|
||||
json11::Json::object op_stats, subop_stats;
|
||||
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
|
||||
{
|
||||
|
|
|
@ -65,7 +65,7 @@ std::string addr_to_string(const sockaddr_storage &addr)
|
|||
return std::string(peer_str)+":"+std::to_string(port);
|
||||
}
|
||||
|
||||
static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
|
||||
bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
|
||||
{
|
||||
if (bits == 0)
|
||||
{
|
||||
|
@ -75,7 +75,7 @@ static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
|
|||
return !((addr.s_addr ^ net.s_addr) & htonl(0xFFFFFFFFu << (32 - bits)));
|
||||
}
|
||||
|
||||
static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
|
||||
bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
|
||||
{
|
||||
const uint32_t *a = address.s6_addr32;
|
||||
const uint32_t *n = network.s6_addr32;
|
||||
|
@ -93,47 +93,49 @@ static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_
|
|||
return true;
|
||||
}
|
||||
|
||||
struct addr_mask_t
|
||||
addr_mask_t cidr_parse(std::string mask)
|
||||
{
|
||||
sa_family_t family;
|
||||
in_addr ipv4;
|
||||
in6_addr ipv6;
|
||||
uint8_t bits;
|
||||
};
|
||||
|
||||
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool include_v6)
|
||||
{
|
||||
std::vector<addr_mask_t> masks;
|
||||
for (auto mask: mask_cfg)
|
||||
{
|
||||
unsigned bits = 0;
|
||||
unsigned bits = 255;
|
||||
int p = mask.find('/');
|
||||
if (p != std::string::npos)
|
||||
{
|
||||
char null_byte = 0;
|
||||
if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
|
||||
{
|
||||
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
|
||||
}
|
||||
throw std::runtime_error("Invalid IP address mask: " + mask);
|
||||
mask = mask.substr(0, p);
|
||||
}
|
||||
in_addr ipv4;
|
||||
in6_addr ipv6;
|
||||
if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
|
||||
{
|
||||
if (bits == 255)
|
||||
bits = 32;
|
||||
if (bits > 32)
|
||||
{
|
||||
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
|
||||
throw std::runtime_error("Invalid IP address mask: " + mask);
|
||||
return (addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)(bits ? bits : 32) };
|
||||
}
|
||||
masks.push_back((addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)bits });
|
||||
}
|
||||
else if (include_v6 && inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
|
||||
else if (inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
|
||||
{
|
||||
masks.push_back((addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits });
|
||||
if (bits == 255)
|
||||
bits = 128;
|
||||
return (addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits };
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
|
||||
throw std::runtime_error("Invalid IP address mask: " + mask);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool include_v6)
|
||||
{
|
||||
std::vector<addr_mask_t> masks;
|
||||
for (auto mask: mask_cfg)
|
||||
{
|
||||
masks.push_back(cidr_parse(mask));
|
||||
if (masks[masks.size()-1].family == AF_INET6)
|
||||
{
|
||||
// Auto-enable IPv6 addresses
|
||||
include_v6 = true;
|
||||
}
|
||||
}
|
||||
std::set<std::string> addresses;
|
||||
|
|
|
@ -1,10 +1,22 @@
|
|||
#pragma once
|
||||
|
||||
#include <netinet/in.h>
|
||||
#include <sys/socket.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Parsed network address with a prefix length, as returned by cidr_parse().
// Member order is significant: the struct is filled with designated initializers.
struct addr_mask_t
{
    // Address family: AF_INET or AF_INET6
    sa_family_t family;
    // Network address, set when family is AF_INET
    struct in_addr ipv4;
    // Network address, set when family is AF_INET6
    struct in6_addr ipv6;
    // Prefix length in bits
    uint8_t bits;
};
|
||||
|
||||
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr);
|
||||
std::string addr_to_string(const sockaddr_storage &addr);
|
||||
addr_mask_t cidr_parse(std::string mask);
|
||||
bool cidr_match(const in_addr &address, const in_addr &network, uint8_t bits);
|
||||
bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits);
|
||||
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);
|
||||
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port);
|
||||
|
|
Loading…
Reference in New Issue