Compare commits

...

10 Commits

18 changed files with 233 additions and 69 deletions

View File

@@ -3,7 +3,7 @@ VITASTOR_VERSION ?= v2.1.0
 all: build push
 build:
-	@docker build --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
+	@docker build --no-cache --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
 push:
 	@docker push vitalif/vitastor:$(VITASTOR_VERSION)
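For reference, a hedged usage sketch of this Makefile: `VITASTOR_VERSION` is set with `?=`, so it can be overridden per invocation (push access to the registry is assumed):

```
# rebuild the image from scratch (--no-cache now forces that) and push it
make build push VITASTOR_VERSION=v2.1.0
```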

View File

@@ -1 +1,2 @@
 deb http://vitastor.io/debian bookworm main
+deb http://http.debian.net/debian/ bookworm-backports main

View File

@@ -14,6 +14,9 @@ Commands:
 - [upgrade](#upgrade)
 - [defrag](#defrag)
+
+⚠️ Important: follow the instructions from [Linux NFS write size](#linux-nfs-write-size)
+for optimal Vitastor NFS performance if you use EC and HDD and mount your NFS from Linux.

 ## Pseudo-FS

 Simplified pseudo-FS proxy is used for file-based image access emulation. It's not
@@ -100,6 +103,62 @@ Other notable missing features which should be addressed in the future:
   in the DB. The FS is implemented in such a way that this garbage doesn't affect its
   function, but having a tool to clean it up still seems a right thing to do.
+
+## Linux NFS write size
+
+The Linux NFS client (nfs/nfsv3/nfsv4 kernel modules) has a hard-coded maximum I/O size,
+currently set to 1 MB - see `rsize` and `wsize` in [man 5 nfs](https://linux.die.net/man/5/nfs).
+This means that when you write to a file in an FS mounted over NFS, the maximum write
+request size is 1 MB, even in O_DIRECT mode and even if the original write request
+is larger.
+
+However, for optimal linear write performance in Vitastor EC (erasure-coded) pools,
+the size of write requests should be a multiple of [block_size](../config/layout-cluster.en.md#block_size)
+multiplied by the data chunk count of the pool ([pg_size](../config/pool.en.md#pg_size)-[parity_chunks](../config/pool.en.md#parity_chunks)).
+When write requests are smaller than or not a multiple of this number, Vitastor has to
+first read the paired data blocks from disk, calculate new parity blocks and only then
+write them back. Obviously this is 2-3 times slower than a plain disk write.
+
+Vitastor HDD setups use a 1 MB block_size by default. So, for optimal performance, if
+you use EC 2+1 and HDD, you need your NFS client to send 2 MB write requests; if you
+use EC 4+1 - 4 MB, and so on.
+
+But the Linux NFS client only writes in 1 MB chunks. 😢
+
+The good news is that you can fix this by rebuilding the Linux NFS kernel modules 😉 🤩!
+You need to change NFS_MAX_FILE_IO_SIZE in nfs_xdr.h and then rebuild and reload the modules.
+
+Instructions, using Debian as an example (run as root):
+
+```
+# install the Linux kernel headers required to build modules for the current kernel
+apt-get install linux-headers-`uname -r`
+# replace NFS_MAX_FILE_IO_SIZE with the desired number (here 4194304 - 4 MB)
+sed -i 's/NFS_MAX_FILE_IO_SIZE\s*.*/NFS_MAX_FILE_IO_SIZE\t(4194304U)/' /lib/modules/`uname -r`/source/include/linux/nfs_xdr.h
+# download the source of the current Linux kernel
+mkdir linux_src
+cd linux_src
+apt-get source linux-image-`uname -r`-unsigned
+# build the NFS modules
+cd linux-*/fs/nfs
+make -C /lib/modules/`uname -r`/build M=$PWD -j8 modules
+make -C /lib/modules/`uname -r`/build M=$PWD modules_install
+# move the default NFS modules out of the way
+mv /lib/modules/`uname -r`/kernel/fs/nfs ~/nfs_orig_`uname -r`
+depmod -a
+# unload the old modules and load the new ones
+rmmod nfsv3 nfs
+modprobe nfsv3
+```
+
+After these (not so complicated 🙂) manipulations, NFS is mounted with the new wsize
+and rsize by default, which fixes Vitastor-NFS linear write performance.

 ## Horizontal scaling

 Linux NFS 3.0 client doesn't support built-in scaling or failover, i.e. you can't
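As a quick sanity check after reloading the rebuilt modules, the effective I/O size can be read back from the live mount options. A minimal sketch, assuming an NFS mount already exists at the hypothetical mountpoint /mnt/vitastor:

```
# rsize/wsize appear among the mount options; after the rebuild and a remount
# they should default to 4194304 (4 MB) instead of 1048576 (1 MB)
findmnt -no OPTIONS /mnt/vitastor | tr ',' '\n' | grep -E '^(rsize|wsize)='
```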

View File

@@ -14,6 +14,9 @@
 - [upgrade](#upgrade)
 - [defrag](#defrag)
+
+⚠️ Important: for optimal Vitastor NFS performance in Linux when using HDD and EC
+(erasure codes), follow the instructions from [Размер записи Linux NFS](#размер-записи-linux-nfs).

 ## Pseudo-FS

 A simplified pseudo-FS implementation is used to emulate file-based access to block
@@ -104,6 +107,66 @@ the JSON format :-). To inspect the DB contents
 entries. The FS is designed so that they don't affect its operation, but for tidiness
 it's still worth being able to clean them up.
+
+## Linux NFS write size
+
+The Linux NFS client (nfs/nfsv3/nfsv4 kernel modules) has a maximum I/O request size
+fixed in the code, equal to 1 MB - see `rsize` and `wsize` in [man 5 nfs](https://linux.die.net/man/5/nfs).
+This means that when you write to a file in a filesystem mounted over NFS, the maximum
+write request size is 1 MB, even in O_DIRECT mode and even if the original write
+request was larger.
+
+However, for optimal linear write speed in Vitastor when using EC pools (pools with
+erasure codes), write requests should be a multiple in size of
+[block_size](../config/layout-cluster.ru.md#block_size) multiplied by the number of
+data chunks of the pool ([pg_size](../config/pool.ru.md#pg_size)-[parity_chunks](../config/pool.ru.md#parity_chunks)).
+If write requests are smaller than or not a multiple of this, Vitastor first has to
+read the old versions of the paired data blocks from disk, calculate the new parity
+blocks and only then write them to disk. Naturally, this is 2-3 times slower than a
+plain disk write.
+
+At the same time, block_size on hard drives is set to 1 MB by default.
+So if you use EC 2+1 and HDD, for optimal write speed you need the NFS client to write
+in 2 MB chunks; with EC 4+1 and HDD - in 4 MB chunks, and so on.
+
+But the Linux NFS client only writes in 1 MB chunks. 😢
+
+This can be fixed, though, by rebuilding the Linux NFS kernel modules 😉 🤩! To do that,
+you need to change the value of the NFS_MAX_FILE_IO_SIZE variable in the header file
+nfs_xdr.h and then rebuild the NFS modules.
+
+Rebuild instructions, using Debian as an example (run as root):
+
+```
+# install the headers needed to build modules for the current Linux kernel
+apt-get install linux-headers-`uname -r`
+# replace NFS_MAX_FILE_IO_SIZE in the headers with the desired value (here 4194304 - 4 MB)
+sed -i 's/NFS_MAX_FILE_IO_SIZE\s*.*/NFS_MAX_FILE_IO_SIZE\t(4194304U)/' /lib/modules/`uname -r`/source/include/linux/nfs_xdr.h
+# download the source of the current kernel
+mkdir linux_src
+cd linux_src
+apt-get source linux-image-`uname -r`-unsigned
+# build the NFS modules
+cd linux-*/fs/nfs
+make -C /lib/modules/`uname -r`/build M=$PWD -j8 modules
+make -C /lib/modules/`uname -r`/build M=$PWD modules_install
+# move the stock NFS modules out of the way
+mv /lib/modules/`uname -r`/kernel/fs/nfs ~/nfs_orig_`uname -r`
+depmod -a
+# unload the old modules and load the new ones
+rmmod nfsv3 nfs
+modprobe nfsv3
+```
+
+After this (relatively simple 🙂) manipulation, NFS starts to be mounted with the new
+wsize and rsize by default, and Vitastor-NFS linear write performance is fixed.

 ## Horizontal scaling

 The Linux NFS 3.0 client doesn't support built-in scaling or failover.

View File

@@ -162,10 +162,12 @@ apt-get install linux-headers-`uname -r`
 apt-get build-dep linux-image-`uname -r`-unsigned
 apt-get source linux-image-`uname -r`-unsigned
 cd linux*/drivers/vdpa
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
 cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
 cd ../virtio
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
 depmod -a
 ```

View File

@@ -165,10 +165,12 @@ apt-get install linux-headers-`uname -r`
 apt-get build-dep linux-image-`uname -r`-unsigned
 apt-get source linux-image-`uname -r`-unsigned
 cd linux*/drivers/vdpa
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
 cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
 cd ../virtio
-make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
+make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
 depmod -a
 ```

View File

@@ -342,7 +342,7 @@ function filter_tree_by_rules(osd_tree, rules, selected)
 // Convert from
 // node_list = { id: string|number, level: string, size?: number, parent?: string|number }[]
 // to
-// node_tree = { [node_id]: { id, level, size?, parent?, children?: child_node_id[], levels: { [level]: id, ... } } }
+// node_tree = { [node_id]: { id, level, size?, parent?, children?: child_node[], levels: { [level]: id, ... } } }
 function index_tree(node_list)
 {
     const tree = { '': { children: [], levels: {} } };
@@ -357,7 +357,7 @@ function index_tree(node_list)
         tree[parent_id].children = tree[parent_id].children || [];
         tree[parent_id].children.push(tree[node.id]);
     }
-    const cur = tree[''].children;
+    const cur = [ ...tree[''].children ];
     for (let i = 0; i < cur.length; i++)
     {
         cur[i].levels[cur[i].level] = cur[i].id;

View File

@@ -15,7 +15,7 @@ function get_osd_tree(global_config, state)
     const stat = state.osd.stats[osd_num];
     const osd_cfg = state.config.osd[osd_num];
     let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
-    if (reweight < 0 || isNaN(reweight))
+    if (isNaN(reweight) || reweight < 0 || reweight > 1)
         reweight = 1;
     if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
         osd_cfg && osd_cfg.noout))

View File

@@ -40,6 +40,11 @@ async function run()
         console.log("/etc/systemd/system/vitastor-etcd.service already exists");
         process.exit(1);
     }
+    if (!in_docker && fs.existsSync("/etc/systemd/system/etcd.service"))
+    {
+        console.log("/etc/systemd/system/etcd.service already exists");
+        process.exit(1);
+    }
     const config = JSON.parse(fs.readFileSync(config_path, { encoding: 'utf-8' }));
     if (!config.etcd_address)
     {
@@ -97,8 +102,8 @@ WantedBy=multi-user.target
 `);
     await system(`useradd etcd`);
     await system(`systemctl daemon-reload`);
-    await system(`systemctl enable etcd`);
-    await system(`systemctl start etcd`);
+    await system(`systemctl enable vitastor-etcd`);
+    await system(`systemctl start vitastor-etcd`);
     process.exit(0);
 }
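A quick way to verify the result of this script on a systemd host; a sketch only, using the unit and file names from the diff above:

```
# the unit should now be registered and enabled under the new name
systemctl is-enabled vitastor-etcd
# the script refuses to run when a generic etcd.service is already present
test -e /etc/systemd/system/etcd.service && echo "conflicting etcd.service found"
```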

View File

@@ -266,6 +266,8 @@ class blockstore_impl_t
     int throttle_threshold_us = 50;
     // Maximum writes between automatically added fsync operations
     uint64_t autosync_writes = 128;
+    // Log level (0-10)
+    int log_level = 0;
     /******* END OF OPTIONS *******/
     struct ring_consumer_t ring_consumer;

View File

@@ -113,10 +113,13 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
     if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
     {
         // No space in the journal. Wait until used_start changes.
-        printf(
-            "Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
-            bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
-        );
+        if (bs->log_level > 5)
+        {
+            printf(
+                "Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
+                bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
+            );
+        }
         PRIV(op)->wait_for = WAIT_JOURNAL;
         bs->flusher->request_trim();
         PRIV(op)->wait_detail = bs->journal.used_start;

View File

@@ -101,6 +101,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
         config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
     journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
         config["inmemory_journal"] != "no";
+    log_level = strtoull(config["log_level"].c_str(), NULL, 10);
     // Validate
     if (journal.sector_count < 2)
     {
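With `log_level` now parsed from the blockstore config map, the journal-space message above becomes opt-in at `log_level > 5`. A sketch of enabling it, assuming the conventional /etc/vitastor/vitastor.conf JSON config and a `vitastor-osd@` systemd unit (both may differ per deployment):

```
# raise the log level above the threshold used by the journal-space message
jq '.log_level = 6' /etc/vitastor/vitastor.conf > /tmp/vitastor.conf.new &&
    mv /tmp/vitastor.conf.new /etc/vitastor/vitastor.conf
# restart an OSD so parse_config() picks up the new value
systemctl restart vitastor-osd@1
```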

View File

@@ -628,7 +628,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
         },
     };
 #ifdef WITH_RDMA
-    if (rdma_contexts.size())
+    if (!use_rdmacm && rdma_contexts.size())
     {
         // Choose the right context for the selected network
         msgr_rdma_context_t *selected_ctx = choose_rdma_context(cl);
@@ -701,7 +701,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
         return;
     }
 #ifdef WITH_RDMA
-    if (cl->rdma_conn && config["rdma_address"].is_string())
+    if (!use_rdmacm && cl->rdma_conn && config["rdma_address"].is_string())
     {
         msgr_rdma_address_t addr;
         if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
@@ -800,7 +800,8 @@ bool osd_messenger_t::is_rdma_enabled()
 {
     return rdma_contexts.size() > 0;
 }
 #endif
+#ifdef WITH_RDMACM
 bool osd_messenger_t::is_use_rdmacm()
 {
     return use_rdmacm;

View File

@@ -97,6 +97,7 @@ struct osd_wanted_peer_t
     json11::Json raw_address_list;
     json11::Json address_list;
     int port = 0;
+    // FIXME: Remove separate WITH_RDMACM?
 #ifdef WITH_RDMACM
     int rdmacm_port = 0;
 #endif
@@ -286,6 +287,7 @@ protected:
     msgr_rdma_context_t* rdmacm_create_qp(rdma_cm_id *cmid);
     void rdmacm_accept(rdma_cm_event *ev);
     void rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port);
+    void rdmacm_set_conn_timeout(rdmacm_connecting_t *conn);
     void rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res);
     void rdmacm_address_resolved(rdma_cm_event *ev);
     void rdmacm_route_resolved(rdma_cm_event *ev);

View File

@@ -70,6 +70,7 @@ msgr_rdma_context_t::~msgr_rdma_context_t()
 msgr_rdma_connection_t::~msgr_rdma_connection_t()
 {
     ctx->reserve_cqe(-max_send-max_recv);
+#ifdef WITH_RDMACM
     if (qp && !cmid)
         ibv_destroy_qp(qp);
     if (cmid)
@@ -79,6 +80,10 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
         rdma_destroy_qp(cmid);
         rdma_destroy_id(cmid);
     }
+#else
+    if (qp)
+        ibv_destroy_qp(qp);
+#endif
     if (recv_buffers.size())
     {
         for (auto b: recv_buffers)

View File

@@ -70,7 +70,7 @@ void osd_messenger_t::rdmacm_destroy_listener(rdma_cm_id *listener)
 void osd_messenger_t::handle_rdmacm_events()
 {
-    // rdma_destroy_id infinitely waits for pthread_cond if called before all events are acked :-(
+    // rdma_destroy_id infinitely waits for pthread_cond if called before all events are acked :-(...
     std::vector<rdma_cm_event> events_copy;
     while (1)
     {
@@ -83,7 +83,15 @@ void osd_messenger_t::handle_rdmacm_events()
             fprintf(stderr, "Failed to get RDMA-CM event: %s (code %d)\n", strerror(errno), errno);
             exit(1);
         }
-        events_copy.push_back(*ev);
+        // ...so we save a copy of all events EXCEPT connection requests, otherwise they sometimes fail with EVENT_DISCONNECT
+        if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
+        {
+            rdmacm_accept(ev);
+        }
+        else
+        {
+            events_copy.push_back(*ev);
+        }
         r = rdma_ack_cm_event(ev);
         if (r != 0)
         {
@@ -96,7 +104,7 @@ void osd_messenger_t::handle_rdmacm_events()
         auto ev = &evl;
         if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
         {
-            rdmacm_accept(ev);
+            // Do nothing, handled above
         }
         else if (ev->event == RDMA_CM_EVENT_CONNECT_ERROR ||
             ev->event == RDMA_CM_EVENT_REJECTED ||
@@ -287,29 +295,34 @@ void osd_messenger_t::rdmacm_accept(rdma_cm_event *ev)
         rdma_destroy_id(ev->id);
         return;
     }
-    rdma_context->cm_refs++;
-    // Wrap into a new msgr_rdma_connection_t
-    msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
-    conn->ctx = rdma_context;
-    conn->max_send = rdma_max_send;
-    conn->max_recv = rdma_max_recv;
-    conn->max_sge = rdma_max_sge > rdma_context->attrx.orig_attr.max_sge
-        ? rdma_context->attrx.orig_attr.max_sge : rdma_max_sge;
-    conn->max_msg = rdma_max_msg;
+    // Wait for RDMA_CM_ESTABLISHED, and enable the connection only after it
+    auto conn = new rdmacm_connecting_t;
     conn->cmid = ev->id;
-    conn->qp = ev->id->qp;
-    auto cl = new osd_client_t();
-    cl->peer_fd = fake_fd;
-    cl->peer_state = PEER_RDMA;
-    cl->peer_addr = *(sockaddr_storage*)rdma_get_peer_addr(ev->id);
-    cl->in_buf = malloc_or_die(receive_buffer_size);
-    cl->rdma_conn = conn;
-    clients[fake_fd] = cl;
-    rdmacm_connections[ev->id] = cl;
-    // Add initial receive request(s)
-    try_recv_rdma(cl);
-    fprintf(stderr, "[OSD %ju] new client %d: connection from %s via RDMA-CM\n", this->osd_num, fake_fd,
-        addr_to_string(cl->peer_addr).c_str());
+    conn->peer_fd = fake_fd;
+    conn->parsed_addr = *(sockaddr_storage*)rdma_get_peer_addr(ev->id);
+    conn->rdma_context = rdma_context;
+    rdmacm_set_conn_timeout(conn);
+    rdmacm_connecting[ev->id] = conn;
+    fprintf(stderr, "[OSD %ju] new client %d: connection from %s via RDMA-CM\n", this->osd_num, conn->peer_fd,
+        addr_to_string(conn->parsed_addr).c_str());
+}
+
+void osd_messenger_t::rdmacm_set_conn_timeout(rdmacm_connecting_t *conn)
+{
+    conn->timeout_ms = peer_connect_timeout*1000;
+    if (peer_connect_timeout > 0)
+    {
+        conn->timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, cmid = conn->cmid](int timer_id)
+        {
+            auto conn = rdmacm_connecting.at(cmid);
+            conn->timeout_id = -1;
+            if (conn->peer_osd)
+                fprintf(stderr, "RDMA-CM connection to %s timed out\n", conn->addr.c_str());
+            else
+                fprintf(stderr, "Incoming RDMA-CM connection from %s timed out\n", addr_to_string(conn->parsed_addr).c_str());
+            rdmacm_on_connect_peer_error(cmid, -EPIPE);
+        });
+    }
 }
@@ -332,15 +345,18 @@ void osd_messenger_t::rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res)
     }
     rdmacm_connecting.erase(cmid);
     delete conn;
-    if (!disable_tcp)
+    if (peer_osd)
     {
-        // Fall back to TCP instead of just reporting the error to on_connect_peer()
-        try_connect_peer_tcp(peer_osd, addr.c_str(), tcp_port);
-    }
-    else
-    {
-        // TCP is disabled
-        on_connect_peer(peer_osd, res == 0 ? -EINVAL : (res > 0 ? -res : res));
+        if (!disable_tcp)
+        {
+            // Fall back to TCP instead of just reporting the error to on_connect_peer()
+            try_connect_peer_tcp(peer_osd, addr.c_str(), tcp_port);
+        }
+        else
+        {
+            // TCP is disabled
+            on_connect_peer(peer_osd, res == 0 ? -EINVAL : (res > 0 ? -res : res));
+        }
     }
 }
@@ -374,6 +390,8 @@ void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port)
         on_connect_peer(peer_osd, res);
         return;
     }
+    if (log_level > 0)
+        fprintf(stderr, "Trying to connect to OSD %ju at %s:%d via RDMA-CM\n", peer_osd, addr.c_str(), rdmacm_port);
     auto conn = new rdmacm_connecting_t;
     rdmacm_connecting[cmid] = conn;
     conn->cmid = cmid;
@@ -383,19 +401,7 @@ void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port)
     conn->parsed_addr = sa;
     conn->rdmacm_port = rdmacm_port;
     conn->tcp_port = fallback_tcp_port;
-    conn->timeout_ms = peer_connect_timeout*1000;
-    conn->timeout_id = -1;
-    if (peer_connect_timeout > 0)
-    {
-        conn->timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, cmid](int timer_id)
-        {
-            auto conn = rdmacm_connecting.at(cmid);
-            conn->timeout_id = -1;
-            fprintf(stderr, "RDMA-CM connection to %s timed out\n", conn->addr.c_str());
-            rdmacm_on_connect_peer_error(cmid, -EPIPE);
-            return;
-        });
-    }
+    rdmacm_set_conn_timeout(conn);
     if (rdma_resolve_addr(cmid, NULL, (sockaddr*)&conn->parsed_addr, conn->timeout_ms) != 0)
     {
         auto res = -errno;
@@ -494,7 +500,7 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
     // Wrap into a new msgr_rdma_connection_t
     msgr_rdma_connection_t *rc = new msgr_rdma_connection_t;
     rc->ctx = conn->rdma_context;
-    rc->ctx->cm_refs++;
+    rc->ctx->cm_refs++; // FIXME now unused, count also connecting_t's when used
     rc->max_send = rdma_max_send;
     rc->max_recv = rdma_max_recv;
     rc->max_sge = rdma_max_sge > rc->ctx->attrx.orig_attr.max_sge
@@ -514,14 +520,20 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
     cl->rdma_conn = rc;
     clients[conn->peer_fd] = cl;
     if (conn->timeout_id >= 0)
+    {
         tfd->clear_timer(conn->timeout_id);
+    }
     delete conn;
     rdmacm_connecting.erase(cmid);
     rdmacm_connections[cmid] = cl;
-    if (log_level > 0)
+    if (log_level > 0 && peer_osd)
+    {
         fprintf(stderr, "Successfully connected with OSD %ju using RDMA-CM\n", peer_osd);
+    }
     // Add initial receive request(s)
     try_recv_rdma(cl);
-    osd_peer_fds[peer_osd] = cl->peer_fd;
-    on_connect_peer(peer_osd, cl->peer_fd);
+    if (peer_osd)
+    {
+        check_peer_config(cl);
+    }
 }
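All of the RDMA-CM paths above are gated on the `use_rdmacm` messenger flag. How that flag is set depends on the deployment; as a hedged sketch, assuming it is exposed through the common JSON config file like the other messenger options:

```
# switch the messenger to RDMA-CM connection establishment (assumed option name)
jq '.use_rdmacm = true' /etc/vitastor/vitastor.conf > /tmp/vitastor.conf.new &&
    mv /tmp/vitastor.conf.new /etc/vitastor/vitastor.conf
systemctl restart vitastor-osd@1   # unit name may differ
```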

View File

@@ -58,6 +58,12 @@ struct osd_changer_t
             state = 100;
             return;
         }
+        if (set_reweight && new_reweight > 1)
+        {
+            result = (cli_result_t){ .err = EINVAL, .text = "Reweight can't be larger than 1" };
+            state = 100;
+            return;
+        }
         parent->etcd_txn(json11::Json::object {
             { "success", json11::Json::array {
                 json11::Json::object {
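Together with the monitor-side clamp in get_osd_tree() above, this makes reweights outside [0, 1] impossible to set. A hedged usage sketch, assuming the `vitastor-cli modify-osd` subcommand takes the OSD number as its positional argument:

```
# accepted: reweight within [0, 1]
vitastor-cli modify-osd --reweight 0.5 1
# rejected with EINVAL: "Reweight can't be larger than 1"
vitastor-cli modify-osd --reweight 2 1
```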

View File

@@ -22,8 +22,8 @@ int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop)
     {
         auto ttb = pst_it->second["total_raw_tb"].number_value();
         auto ftb = (pst_it->second["total_raw_tb"].number_value() - pst_it->second["used_raw_tb"].number_value());
-        tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
-        fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
+        tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)1<<40);
+        fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)1<<40);
     }
     *reply = (FSSTAT3res){
         .status = NFS3_OK,
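The fix in this hunk is pure arithmetic: the statistics are tracked in terabytes, and a terabyte is 2^40 bytes, but `(uint64_t)2<<40` is 2 shifted left by 40 bits, i.e. 2^41, so the old code reported doubled sizes. A quick shell check:

```
echo $((2<<40))   # 2199023255552 = 2^41 - twice too large
echo $((1<<40))   # 1099511627776 = 2^40 - one TiB, as intended
```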