Compare commits
10 Commits
Author | SHA1 | Date |
---|---|---|
|
15d0204f96 | |
|
21d6e88a1b | |
|
df2847df2d | |
|
327c98a4b6 | |
|
3cc0abfd81 | |
|
80e5f8ba76 | |
|
4b660f1ce8 | |
|
dfde0e60f0 | |
|
013f688ffe | |
|
cf9738ddbe |
docker
etc/apt/sources.list.d
docs/usage
mon
|
@ -3,7 +3,7 @@ VITASTOR_VERSION ?= v2.1.0
|
|||
all: build push
|
||||
|
||||
build:
|
||||
@docker build --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
|
||||
@docker build --no-cache --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
|
||||
|
||||
push:
|
||||
@docker push vitalif/vitastor:$(VITASTOR_VERSION)
|
||||
|
|
|
@ -1 +1,2 @@
|
|||
deb http://vitastor.io/debian bookworm main
|
||||
deb http://http.debian.net/debian/ bookworm-backports main
|
||||
|
|
|
@ -14,6 +14,9 @@ Commands:
|
|||
- [upgrade](#upgrade)
|
||||
- [defrag](#defrag)
|
||||
|
||||
⚠️ Important: follow the instructions from [Linux NFS write size](#linux-nfs-write-size)
|
||||
for optimal Vitastor NFS performance if you use EC and HDD and mount your NFS from Linux.
|
||||
|
||||
## Pseudo-FS
|
||||
|
||||
Simplified pseudo-FS proxy is used for file-based image access emulation. It's not
|
||||
|
@ -100,6 +103,62 @@ Other notable missing features which should be addressed in the future:
|
|||
in the DB. The FS is implemented in such a way that this garbage doesn't affect its
|
||||
function, but having a tool to clean it up still seems a right thing to do.
|
||||
|
||||
## Linux NFS write size
|
||||
|
||||
Linux NFS client (nfs/nfsv3/nfsv4 kernel modules) has a hard-coded maximum I/O size,
|
||||
currently set to 1 MB - see `rsize` and `wsize` in [man 5 nfs](https://linux.die.net/man/5/nfs).
|
||||
|
||||
This means that when you write to a file in an FS mounted over NFS, the maximum write
|
||||
request size is 1 MB, even in the O_DIRECT mode and even if the original write request
|
||||
is larger.
|
||||
|
||||
However, for optimal linear write performance in Vitastor EC (erasure-coded) pools,
|
||||
the size of write requests should be a multiple of [block_size](../config/layout-cluster.en.md#block_size),
|
||||
multiplied by the data chunk count of the pool ([pg_size](../config/pool.en.md#pg_size)-[parity_chunks](../config/pool.en.md#parity_chunks)).
|
||||
When write requests are smaller or not a multiple of this number, Vitastor has to first
|
||||
read paired data blocks from disks, calculate new parity blocks and only then write them
|
||||
back. Obviously this is 2-3 times slower than a simple disk write.
|
||||
|
||||
Vitastor HDD setups use 1 MB block_size by default. So, for optimal performance, if
|
||||
you use EC 2+1 and HDD, you need your NFS client to send 2 MB write requests, if you
|
||||
use EC 4+1 - 4 MB and so on.
|
||||
|
||||
But Linux NFS client only writes in 1 MB chunks. 😢
|
||||
|
||||
The good news is that you can fix it by rebuilding Linux NFS kernel modules 😉 🤩!
|
||||
You need to change NFS_MAX_FILE_IO_SIZE in nfs_xdr.h and then rebuild and reload modules.
|
||||
|
||||
Instructions, using Debian as an example (should be run as root):
|
||||
|
||||
```
|
||||
# download current Linux kernel headers required to build modules
|
||||
apt-get install linux-headers-`uname -r`
|
||||
|
||||
# replace NFS_MAX_FILE_IO_SIZE with a desired number (here it's 4194304 - 4 MB)
|
||||
sed -i 's/NFS_MAX_FILE_IO_SIZE\s*.*/NFS_MAX_FILE_IO_SIZE\t(4194304U)/' /lib/modules/`uname -r`/source/include/linux/nfs_xdr.h
|
||||
|
||||
# download current Linux kernel source
|
||||
mkdir linux_src
|
||||
cd linux_src
|
||||
apt-get source linux-image-`uname -r`-unsigned
|
||||
|
||||
# build NFS modules
|
||||
cd linux-*/fs/nfs
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD -j8 modules
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD modules_install
|
||||
|
||||
# move default NFS modules away
|
||||
mv /lib/modules/`uname -r`/kernel/fs/nfs ~/nfs_orig_`uname -r`
|
||||
depmod -a
|
||||
|
||||
# unload old modules and load the new ones
|
||||
rmmod nfsv3 nfs
|
||||
modprobe nfsv3
|
||||
```
|
||||
|
||||
After these (not too complicated 🙂) manipulations, NFS starts being mounted
|
||||
with the new wsize and rsize by default, which fixes Vitastor-NFS linear write performance.
|
||||
|
||||
## Horizontal scaling
|
||||
|
||||
Linux NFS 3.0 client doesn't support built-in scaling or failover, i.e. you can't
|
||||
|
|
|
@ -14,6 +14,9 @@
|
|||
- [upgrade](#upgrade)
|
||||
- [defrag](#defrag)
|
||||
|
||||
⚠️ Важно: для оптимальной производительности Vitastor NFS в Linux при использовании
|
||||
HDD и EC (erasure кодов) выполните инструкции из раздела [Размер записи Linux NFS](#размер-записи-linux-nfs).
|
||||
|
||||
## Псевдо-ФС
|
||||
|
||||
Упрощённая реализация псевдо-ФС используется для эмуляции файлового доступа к блочным
|
||||
|
@ -104,6 +107,66 @@ JSON-формате :-). Для инспекции содержимого БД
|
|||
записи. ФС устроена так, что на работу они не влияют, но для порядка и их стоит
|
||||
уметь подчищать.
|
||||
|
||||
## Размер записи Linux NFS
|
||||
|
||||
Клиент Linux NFS (модули ядра nfs/nfsv3/nfsv4) имеет фиксированный в коде максимальный
|
||||
размер запроса ввода-вывода, равный 1 МБ - см. `rsize` и `wsize` в [man 5 nfs](https://linux.die.net/man/5/nfs).
|
||||
|
||||
Это означает, что когда вы записываете в файл в примонтированной по NFS файловой системе,
|
||||
максимальный размер запроса записи составляет 1 МБ, даже в режиме O_DIRECT и даже если
|
||||
исходный запрос записи был больше.
|
||||
|
||||
Однако для оптимальной скорости линейной записи в Vitastor при использовании EC-пулов
|
||||
(пулов с кодами коррекции ошибок) запросы записи должны быть по размеру кратны
|
||||
[block_size](../config/layout-cluster.ru.md#block_size), умноженному на число частей
|
||||
данных пула ([pg_size](../config/pool.ru.md#pg_size)-[parity_chunks](../config/pool.ru.md#parity_chunks)).
|
||||
Если запросы записи меньше или не кратны, то Vitastor приходится сначала прочитать
|
||||
с дисков старые версии парных блоков данных, рассчитать новые блоки чётности и только
|
||||
после этого записать их на диски. Естественно, это в 2-3 раза медленнее простой записи
|
||||
на диск.
|
||||
|
||||
При этом block_size на жёстких дисках по умолчанию устанавливается равным 1 МБ.
|
||||
Таким образом, если вы используете EC 2+1 и HDD, для оптимальной скорости записи вам
|
||||
нужно, чтобы NFS-клиент писал по 2 МБ, если EC 4+1 и HDD - то по 4 МБ, и т.п.
|
||||
|
||||
А Linux NFS-клиент пишет только по 1 МБ. 😢
|
||||
|
||||
Но это можно исправить, пересобрав модули ядра Linux NFS 😉 🤩! Для этого нужно
|
||||
поменять значение переменной NFS_MAX_FILE_IO_SIZE в заголовочном файле nfs_xdr.h,
|
||||
после чего пересобрать модули NFS.
|
||||
|
||||
Инструкция по пересборке на примере Debian (выполнять под root):
|
||||
|
||||
```
|
||||
# скачиваем заголовки для сборки модулей для текущего ядра Linux
|
||||
apt-get install linux-headers-`uname -r`
|
||||
|
||||
# заменяем в заголовках NFS_MAX_FILE_IO_SIZE на желаемый (здесь 4194304 - 4 МБ)
|
||||
sed -i 's/NFS_MAX_FILE_IO_SIZE\s*.*/NFS_MAX_FILE_IO_SIZE\t(4194304U)/' /lib/modules/`uname -r`/source/include/linux/nfs_xdr.h
|
||||
|
||||
# скачиваем исходный код текущего ядра
|
||||
mkdir linux_src
|
||||
cd linux_src
|
||||
apt-get source linux-image-`uname -r`-unsigned
|
||||
|
||||
# собираем модули NFS
|
||||
cd linux-*/fs/nfs
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD -j8 modules
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD modules_install
|
||||
|
||||
# убираем в сторону штатные модули NFS
|
||||
mv /lib/modules/`uname -r`/kernel/fs/nfs ~/nfs_orig_`uname -r`
|
||||
depmod -a
|
||||
|
||||
# выгружаем старые модули и загружаем новые
|
||||
rmmod nfsv3 nfs
|
||||
modprobe nfsv3
|
||||
```
|
||||
|
||||
После такой (относительно нехитрой 🙂) манипуляции NFS начинает по умолчанию
|
||||
монтироваться с новыми wsize и rsize, и производительность линейной записи в Vitastor-NFS
|
||||
исправляется.
|
||||
|
||||
## Горизонтальное масштабирование
|
||||
|
||||
Клиент Linux NFS 3.0 не поддерживает встроенное масштабирование или отказоустойчивость.
|
||||
|
|
|
@ -162,10 +162,12 @@ apt-get install linux-headers-`uname -r`
|
|||
apt-get build-dep linux-image-`uname -r`-unsigned
|
||||
apt-get source linux-image-`uname -r`-unsigned
|
||||
cd linux*/drivers/vdpa
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
|
||||
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
|
||||
cd ../virtio
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
|
||||
depmod -a
|
||||
```
|
||||
|
||||
|
|
|
@ -165,10 +165,12 @@ apt-get install linux-headers-`uname -r`
|
|||
apt-get build-dep linux-image-`uname -r`-unsigned
|
||||
apt-get source linux-image-`uname -r`-unsigned
|
||||
cd linux*/drivers/vdpa
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
|
||||
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
|
||||
cd ../virtio
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules
|
||||
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m modules_install
|
||||
depmod -a
|
||||
```
|
||||
|
||||
|
|
|
@ -342,7 +342,7 @@ function filter_tree_by_rules(osd_tree, rules, selected)
|
|||
// Convert from
|
||||
// node_list = { id: string|number, level: string, size?: number, parent?: string|number }[]
|
||||
// to
|
||||
// node_tree = { [node_id]: { id, level, size?, parent?, children?: child_node_id[], levels: { [level]: id, ... } } }
|
||||
// node_tree = { [node_id]: { id, level, size?, parent?, children?: child_node[], levels: { [level]: id, ... } } }
|
||||
function index_tree(node_list)
|
||||
{
|
||||
const tree = { '': { children: [], levels: {} } };
|
||||
|
@ -357,7 +357,7 @@ function index_tree(node_list)
|
|||
tree[parent_id].children = tree[parent_id].children || [];
|
||||
tree[parent_id].children.push(tree[node.id]);
|
||||
}
|
||||
const cur = tree[''].children;
|
||||
const cur = [ ...tree[''].children ];
|
||||
for (let i = 0; i < cur.length; i++)
|
||||
{
|
||||
cur[i].levels[cur[i].level] = cur[i].id;
|
||||
|
|
|
@ -15,7 +15,7 @@ function get_osd_tree(global_config, state)
|
|||
const stat = state.osd.stats[osd_num];
|
||||
const osd_cfg = state.config.osd[osd_num];
|
||||
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
|
||||
if (reweight < 0 || isNaN(reweight))
|
||||
if (isNaN(reweight) || reweight < 0 || reweight > 0)
|
||||
reweight = 1;
|
||||
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
|
||||
osd_cfg && osd_cfg.noout))
|
||||
|
|
|
@ -40,6 +40,11 @@ async function run()
|
|||
console.log("/etc/systemd/system/vitastor-etcd.service already exists");
|
||||
process.exit(1);
|
||||
}
|
||||
if (!in_docker && fs.existsSync("/etc/systemd/system/etcd.service"))
|
||||
{
|
||||
console.log("/etc/systemd/system/etcd.service already exists");
|
||||
process.exit(1);
|
||||
}
|
||||
const config = JSON.parse(fs.readFileSync(config_path, { encoding: 'utf-8' }));
|
||||
if (!config.etcd_address)
|
||||
{
|
||||
|
@ -97,8 +102,8 @@ WantedBy=multi-user.target
|
|||
`);
|
||||
await system(`useradd etcd`);
|
||||
await system(`systemctl daemon-reload`);
|
||||
await system(`systemctl enable etcd`);
|
||||
await system(`systemctl start etcd`);
|
||||
await system(`systemctl enable vitastor-etcd`);
|
||||
await system(`systemctl start vitastor-etcd`);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
|
|
|
@ -266,6 +266,8 @@ class blockstore_impl_t
|
|||
int throttle_threshold_us = 50;
|
||||
// Maximum writes between automatically added fsync operations
|
||||
uint64_t autosync_writes = 128;
|
||||
// Log level (0-10)
|
||||
int log_level = 0;
|
||||
/******* END OF OPTIONS *******/
|
||||
|
||||
struct ring_consumer_t ring_consumer;
|
||||
|
|
|
@ -113,10 +113,13 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
|
|||
if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
|
||||
{
|
||||
// No space in the journal. Wait until used_start changes.
|
||||
printf(
|
||||
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
|
||||
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
|
||||
);
|
||||
if (bs->log_level > 5)
|
||||
{
|
||||
printf(
|
||||
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
|
||||
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
|
||||
);
|
||||
}
|
||||
PRIV(op)->wait_for = WAIT_JOURNAL;
|
||||
bs->flusher->request_trim();
|
||||
PRIV(op)->wait_detail = bs->journal.used_start;
|
||||
|
|
|
@ -101,6 +101,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
|
|||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
|
||||
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
|
||||
config["inmemory_journal"] != "no";
|
||||
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
|
||||
// Validate
|
||||
if (journal.sector_count < 2)
|
||||
{
|
||||
|
|
|
@ -628,7 +628,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
|||
},
|
||||
};
|
||||
#ifdef WITH_RDMA
|
||||
if (rdma_contexts.size())
|
||||
if (!use_rdmacm && rdma_contexts.size())
|
||||
{
|
||||
// Choose the right context for the selected network
|
||||
msgr_rdma_context_t *selected_ctx = choose_rdma_context(cl);
|
||||
|
@ -701,7 +701,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
|
|||
return;
|
||||
}
|
||||
#ifdef WITH_RDMA
|
||||
if (cl->rdma_conn && config["rdma_address"].is_string())
|
||||
if (!use_rdmacm && cl->rdma_conn && config["rdma_address"].is_string())
|
||||
{
|
||||
msgr_rdma_address_t addr;
|
||||
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
|
||||
|
@ -800,7 +800,8 @@ bool osd_messenger_t::is_rdma_enabled()
|
|||
{
|
||||
return rdma_contexts.size() > 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
#ifdef WITH_RDMACM
|
||||
bool osd_messenger_t::is_use_rdmacm()
|
||||
{
|
||||
return use_rdmacm;
|
||||
|
|
|
@ -97,6 +97,7 @@ struct osd_wanted_peer_t
|
|||
json11::Json raw_address_list;
|
||||
json11::Json address_list;
|
||||
int port = 0;
|
||||
// FIXME: Remove separate WITH_RDMACM?
|
||||
#ifdef WITH_RDMACM
|
||||
int rdmacm_port = 0;
|
||||
#endif
|
||||
|
@ -286,6 +287,7 @@ protected:
|
|||
msgr_rdma_context_t* rdmacm_create_qp(rdma_cm_id *cmid);
|
||||
void rdmacm_accept(rdma_cm_event *ev);
|
||||
void rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port);
|
||||
void rdmacm_set_conn_timeout(rdmacm_connecting_t *conn);
|
||||
void rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res);
|
||||
void rdmacm_address_resolved(rdma_cm_event *ev);
|
||||
void rdmacm_route_resolved(rdma_cm_event *ev);
|
||||
|
|
|
@ -70,6 +70,7 @@ msgr_rdma_context_t::~msgr_rdma_context_t()
|
|||
msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
||||
{
|
||||
ctx->reserve_cqe(-max_send-max_recv);
|
||||
#ifdef WITH_RDMACM
|
||||
if (qp && !cmid)
|
||||
ibv_destroy_qp(qp);
|
||||
if (cmid)
|
||||
|
@ -79,6 +80,10 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
|
|||
rdma_destroy_qp(cmid);
|
||||
rdma_destroy_id(cmid);
|
||||
}
|
||||
#else
|
||||
if (qp)
|
||||
ibv_destroy_qp(qp);
|
||||
#endif
|
||||
if (recv_buffers.size())
|
||||
{
|
||||
for (auto b: recv_buffers)
|
||||
|
|
|
@ -70,7 +70,7 @@ void osd_messenger_t::rdmacm_destroy_listener(rdma_cm_id *listener)
|
|||
|
||||
void osd_messenger_t::handle_rdmacm_events()
|
||||
{
|
||||
// rdma_destroy_id infinitely waits for pthread_cond if called before all events are acked :-(
|
||||
// rdma_destroy_id infinitely waits for pthread_cond if called before all events are acked :-(...
|
||||
std::vector<rdma_cm_event> events_copy;
|
||||
while (1)
|
||||
{
|
||||
|
@ -83,7 +83,15 @@ void osd_messenger_t::handle_rdmacm_events()
|
|||
fprintf(stderr, "Failed to get RDMA-CM event: %s (code %d)\n", strerror(errno), errno);
|
||||
exit(1);
|
||||
}
|
||||
events_copy.push_back(*ev);
|
||||
// ...so we save a copy of all events EXCEPT connection requests, otherwise they sometimes fail with EVENT_DISCONNECT
|
||||
if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
|
||||
{
|
||||
rdmacm_accept(ev);
|
||||
}
|
||||
else
|
||||
{
|
||||
events_copy.push_back(*ev);
|
||||
}
|
||||
r = rdma_ack_cm_event(ev);
|
||||
if (r != 0)
|
||||
{
|
||||
|
@ -96,7 +104,7 @@ void osd_messenger_t::handle_rdmacm_events()
|
|||
auto ev = &evl;
|
||||
if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
|
||||
{
|
||||
rdmacm_accept(ev);
|
||||
// Do nothing, handled above
|
||||
}
|
||||
else if (ev->event == RDMA_CM_EVENT_CONNECT_ERROR ||
|
||||
ev->event == RDMA_CM_EVENT_REJECTED ||
|
||||
|
@ -287,29 +295,34 @@ void osd_messenger_t::rdmacm_accept(rdma_cm_event *ev)
|
|||
rdma_destroy_id(ev->id);
|
||||
return;
|
||||
}
|
||||
rdma_context->cm_refs++;
|
||||
// Wrap into a new msgr_rdma_connection_t
|
||||
msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
|
||||
conn->ctx = rdma_context;
|
||||
conn->max_send = rdma_max_send;
|
||||
conn->max_recv = rdma_max_recv;
|
||||
conn->max_sge = rdma_max_sge > rdma_context->attrx.orig_attr.max_sge
|
||||
? rdma_context->attrx.orig_attr.max_sge : rdma_max_sge;
|
||||
conn->max_msg = rdma_max_msg;
|
||||
// Wait for RDMA_CM_ESTABLISHED, and enable the connection only after it
|
||||
auto conn = new rdmacm_connecting_t;
|
||||
conn->cmid = ev->id;
|
||||
conn->qp = ev->id->qp;
|
||||
auto cl = new osd_client_t();
|
||||
cl->peer_fd = fake_fd;
|
||||
cl->peer_state = PEER_RDMA;
|
||||
cl->peer_addr = *(sockaddr_storage*)rdma_get_peer_addr(ev->id);
|
||||
cl->in_buf = malloc_or_die(receive_buffer_size);
|
||||
cl->rdma_conn = conn;
|
||||
clients[fake_fd] = cl;
|
||||
rdmacm_connections[ev->id] = cl;
|
||||
// Add initial receive request(s)
|
||||
try_recv_rdma(cl);
|
||||
fprintf(stderr, "[OSD %ju] new client %d: connection from %s via RDMA-CM\n", this->osd_num, fake_fd,
|
||||
addr_to_string(cl->peer_addr).c_str());
|
||||
conn->peer_fd = fake_fd;
|
||||
conn->parsed_addr = *(sockaddr_storage*)rdma_get_peer_addr(ev->id);
|
||||
conn->rdma_context = rdma_context;
|
||||
rdmacm_set_conn_timeout(conn);
|
||||
rdmacm_connecting[ev->id] = conn;
|
||||
fprintf(stderr, "[OSD %ju] new client %d: connection from %s via RDMA-CM\n", this->osd_num, conn->peer_fd,
|
||||
addr_to_string(conn->parsed_addr).c_str());
|
||||
}
|
||||
|
||||
// Arm the connect timeout for a pending RDMA-CM connection.
//
// Stores the timeout (peer_connect_timeout converted from seconds to
// milliseconds) in conn->timeout_ms and, when peer_connect_timeout is
// positive, registers a timer whose id is saved in conn->timeout_id.
// When the timer fires, the pending connection is looked up again by its
// cmid, a message is logged to stderr and the connection is failed through
// rdmacm_on_connect_peer_error(cmid, -EPIPE).
//
// NOTE(review): when peer_connect_timeout <= 0 this function does not touch
// conn->timeout_id; presumably rdmacm_connecting_t default-initializes it
// to -1 - confirm against the struct declaration.
void osd_messenger_t::rdmacm_set_conn_timeout(rdmacm_connecting_t *conn)
|
||||
{
|
||||
// Seconds -> milliseconds
conn->timeout_ms = peer_connect_timeout*1000;
|
||||
if (peer_connect_timeout > 0)
|
||||
{
|
||||
// Capture the cmid by value (not the conn pointer) and re-resolve the
// connection inside the callback.
// NOTE(review): the `false` argument presumably means a one-shot
// (non-repeating) timer - confirm against the set_timer() declaration.
conn->timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, cmid = conn->cmid](int timer_id)
|
||||
{
|
||||
// NOTE(review): .at() presumably throws if cmid is no longer registered,
// so the entry is expected to still exist when the timer fires - verify.
auto conn = rdmacm_connecting.at(cmid);
|
||||
// Mark the timer as no longer pending before running the error path
conn->timeout_id = -1;
|
||||
// A non-zero peer_osd selects the "connection to" (outgoing) message,
// zero selects the "Incoming ... connection from" message
if (conn->peer_osd)
|
||||
fprintf(stderr, "RDMA-CM connection to %s timed out\n", conn->addr.c_str());
|
||||
else
|
||||
fprintf(stderr, "Incoming RDMA-CM connection from %s timed out\n", addr_to_string(conn->parsed_addr).c_str());
|
||||
// Report the timeout through the common connect-error path with -EPIPE
rdmacm_on_connect_peer_error(cmid, -EPIPE);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void osd_messenger_t::rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res)
|
||||
|
@ -332,15 +345,18 @@ void osd_messenger_t::rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res)
|
|||
}
|
||||
rdmacm_connecting.erase(cmid);
|
||||
delete conn;
|
||||
if (!disable_tcp)
|
||||
if (peer_osd)
|
||||
{
|
||||
// Fall back to TCP instead of just reporting the error to on_connect_peer()
|
||||
try_connect_peer_tcp(peer_osd, addr.c_str(), tcp_port);
|
||||
}
|
||||
else
|
||||
{
|
||||
// TCP is disabled
|
||||
on_connect_peer(peer_osd, res == 0 ? -EINVAL : (res > 0 ? -res : res));
|
||||
if (!disable_tcp)
|
||||
{
|
||||
// Fall back to TCP instead of just reporting the error to on_connect_peer()
|
||||
try_connect_peer_tcp(peer_osd, addr.c_str(), tcp_port);
|
||||
}
|
||||
else
|
||||
{
|
||||
// TCP is disabled
|
||||
on_connect_peer(peer_osd, res == 0 ? -EINVAL : (res > 0 ? -res : res));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -374,6 +390,8 @@ void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::stri
|
|||
on_connect_peer(peer_osd, res);
|
||||
return;
|
||||
}
|
||||
if (log_level > 0)
|
||||
fprintf(stderr, "Trying to connect to OSD %ju at %s:%d via RDMA-CM\n", peer_osd, addr.c_str(), rdmacm_port);
|
||||
auto conn = new rdmacm_connecting_t;
|
||||
rdmacm_connecting[cmid] = conn;
|
||||
conn->cmid = cmid;
|
||||
|
@ -383,19 +401,7 @@ void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::stri
|
|||
conn->parsed_addr = sa;
|
||||
conn->rdmacm_port = rdmacm_port;
|
||||
conn->tcp_port = fallback_tcp_port;
|
||||
conn->timeout_ms = peer_connect_timeout*1000;
|
||||
conn->timeout_id = -1;
|
||||
if (peer_connect_timeout > 0)
|
||||
{
|
||||
conn->timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, cmid](int timer_id)
|
||||
{
|
||||
auto conn = rdmacm_connecting.at(cmid);
|
||||
conn->timeout_id = -1;
|
||||
fprintf(stderr, "RDMA-CM connection to %s timed out\n", conn->addr.c_str());
|
||||
rdmacm_on_connect_peer_error(cmid, -EPIPE);
|
||||
return;
|
||||
});
|
||||
}
|
||||
rdmacm_set_conn_timeout(conn);
|
||||
if (rdma_resolve_addr(cmid, NULL, (sockaddr*)&conn->parsed_addr, conn->timeout_ms) != 0)
|
||||
{
|
||||
auto res = -errno;
|
||||
|
@ -494,7 +500,7 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
|
|||
// Wrap into a new msgr_rdma_connection_t
|
||||
msgr_rdma_connection_t *rc = new msgr_rdma_connection_t;
|
||||
rc->ctx = conn->rdma_context;
|
||||
rc->ctx->cm_refs++;
|
||||
rc->ctx->cm_refs++; // FIXME now unused, count also connecting_t's when used
|
||||
rc->max_send = rdma_max_send;
|
||||
rc->max_recv = rdma_max_recv;
|
||||
rc->max_sge = rdma_max_sge > rc->ctx->attrx.orig_attr.max_sge
|
||||
|
@ -514,14 +520,20 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
|
|||
cl->rdma_conn = rc;
|
||||
clients[conn->peer_fd] = cl;
|
||||
if (conn->timeout_id >= 0)
|
||||
{
|
||||
tfd->clear_timer(conn->timeout_id);
|
||||
}
|
||||
delete conn;
|
||||
rdmacm_connecting.erase(cmid);
|
||||
rdmacm_connections[cmid] = cl;
|
||||
if (log_level > 0)
|
||||
if (log_level > 0 && peer_osd)
|
||||
{
|
||||
fprintf(stderr, "Successfully connected with OSD %ju using RDMA-CM\n", peer_osd);
|
||||
}
|
||||
// Add initial receive request(s)
|
||||
try_recv_rdma(cl);
|
||||
osd_peer_fds[peer_osd] = cl->peer_fd;
|
||||
on_connect_peer(peer_osd, cl->peer_fd);
|
||||
if (peer_osd)
|
||||
{
|
||||
check_peer_config(cl);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,6 +58,12 @@ struct osd_changer_t
|
|||
state = 100;
|
||||
return;
|
||||
}
|
||||
if (set_reweight && new_reweight > 1)
|
||||
{
|
||||
result = (cli_result_t){ .err = EINVAL, .text = "Reweight can't be larger than 1" };
|
||||
state = 100;
|
||||
return;
|
||||
}
|
||||
parent->etcd_txn(json11::Json::object {
|
||||
{ "success", json11::Json::array {
|
||||
json11::Json::object {
|
||||
|
|
|
@ -22,8 +22,8 @@ int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop)
|
|||
{
|
||||
auto ttb = pst_it->second["total_raw_tb"].number_value();
|
||||
auto ftb = (pst_it->second["total_raw_tb"].number_value() - pst_it->second["used_raw_tb"].number_value());
|
||||
tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
|
||||
fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
|
||||
tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)1<<40);
|
||||
fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)1<<40);
|
||||
}
|
||||
*reply = (FSSTAT3res){
|
||||
.status = NFS3_OK,
|
||||
|
|
Loading…
Reference in New Issue