Compare commits

..

8 Commits

11 changed files with 93 additions and 61 deletions

View File

@ -3,7 +3,7 @@ VITASTOR_VERSION ?= v2.1.0
all: build push
build:
@docker build --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
@docker build --no-cache --rm -t vitalif/vitastor:$(VITASTOR_VERSION) .
push:
@docker push vitalif/vitastor:$(VITASTOR_VERSION)

View File

@ -1 +1,2 @@
deb http://vitastor.io/debian bookworm main
deb http://http.debian.net/debian/ bookworm-backports main

View File

@ -342,7 +342,7 @@ function filter_tree_by_rules(osd_tree, rules, selected)
// Convert from
// node_list = { id: string|number, level: string, size?: number, parent?: string|number }[]
// to
// node_tree = { [node_id]: { id, level, size?, parent?, children?: child_node_id[], levels: { [level]: id, ... } } }
// node_tree = { [node_id]: { id, level, size?, parent?, children?: child_node[], levels: { [level]: id, ... } } }
function index_tree(node_list)
{
const tree = { '': { children: [], levels: {} } };
@ -357,7 +357,7 @@ function index_tree(node_list)
tree[parent_id].children = tree[parent_id].children || [];
tree[parent_id].children.push(tree[node.id]);
}
const cur = tree[''].children;
const cur = [ ...tree[''].children ];
for (let i = 0; i < cur.length; i++)
{
cur[i].levels[cur[i].level] = cur[i].id;

View File

@ -15,7 +15,7 @@ function get_osd_tree(global_config, state)
const stat = state.osd.stats[osd_num];
const osd_cfg = state.config.osd[osd_num];
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
if (reweight < 0 || isNaN(reweight))
if (isNaN(reweight) || reweight < 0 || reweight > 0)
reweight = 1;
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
osd_cfg && osd_cfg.noout))

View File

@ -40,6 +40,11 @@ async function run()
console.log("/etc/systemd/system/vitastor-etcd.service already exists");
process.exit(1);
}
if (!in_docker && fs.existsSync("/etc/systemd/system/etcd.service"))
{
console.log("/etc/systemd/system/etcd.service already exists");
process.exit(1);
}
const config = JSON.parse(fs.readFileSync(config_path, { encoding: 'utf-8' }));
if (!config.etcd_address)
{
@ -97,8 +102,8 @@ WantedBy=multi-user.target
`);
await system(`useradd etcd`);
await system(`systemctl daemon-reload`);
await system(`systemctl enable etcd`);
await system(`systemctl start etcd`);
await system(`systemctl enable vitastor-etcd`);
await system(`systemctl start vitastor-etcd`);
process.exit(0);
}

View File

@ -628,7 +628,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
},
};
#ifdef WITH_RDMA
if (rdma_contexts.size())
if (!use_rdmacm && rdma_contexts.size())
{
// Choose the right context for the selected network
msgr_rdma_context_t *selected_ctx = choose_rdma_context(cl);
@ -701,7 +701,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
return;
}
#ifdef WITH_RDMA
if (cl->rdma_conn && config["rdma_address"].is_string())
if (!use_rdmacm && cl->rdma_conn && config["rdma_address"].is_string())
{
msgr_rdma_address_t addr;
if (!msgr_rdma_address_t::from_string(config["rdma_address"].string_value().c_str(), &addr) ||
@ -800,7 +800,8 @@ bool osd_messenger_t::is_rdma_enabled()
{
return rdma_contexts.size() > 0;
}
#endif
#ifdef WITH_RDMACM
bool osd_messenger_t::is_use_rdmacm()
{
return use_rdmacm;

View File

@ -97,6 +97,7 @@ struct osd_wanted_peer_t
json11::Json raw_address_list;
json11::Json address_list;
int port = 0;
// FIXME: Remove separate WITH_RDMACM?
#ifdef WITH_RDMACM
int rdmacm_port = 0;
#endif
@ -286,6 +287,7 @@ protected:
msgr_rdma_context_t* rdmacm_create_qp(rdma_cm_id *cmid);
void rdmacm_accept(rdma_cm_event *ev);
void rdmacm_try_connect_peer(uint64_t peer_osd, const std::string & addr, int rdmacm_port, int fallback_tcp_port);
void rdmacm_set_conn_timeout(rdmacm_connecting_t *conn);
void rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res);
void rdmacm_address_resolved(rdma_cm_event *ev);
void rdmacm_route_resolved(rdma_cm_event *ev);

View File

@ -70,6 +70,7 @@ msgr_rdma_context_t::~msgr_rdma_context_t()
msgr_rdma_connection_t::~msgr_rdma_connection_t()
{
ctx->reserve_cqe(-max_send-max_recv);
#ifdef WITH_RDMACM
if (qp && !cmid)
ibv_destroy_qp(qp);
if (cmid)
@ -79,6 +80,10 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
rdma_destroy_qp(cmid);
rdma_destroy_id(cmid);
}
#else
if (qp)
ibv_destroy_qp(qp);
#endif
if (recv_buffers.size())
{
for (auto b: recv_buffers)

View File

@ -70,7 +70,7 @@ void osd_messenger_t::rdmacm_destroy_listener(rdma_cm_id *listener)
void osd_messenger_t::handle_rdmacm_events()
{
// rdma_destroy_id infinitely waits for pthread_cond if called before all events are acked :-(
// rdma_destroy_id infinitely waits for pthread_cond if called before all events are acked :-(...
std::vector<rdma_cm_event> events_copy;
while (1)
{
@ -83,7 +83,15 @@ void osd_messenger_t::handle_rdmacm_events()
fprintf(stderr, "Failed to get RDMA-CM event: %s (code %d)\n", strerror(errno), errno);
exit(1);
}
events_copy.push_back(*ev);
// ...so we save a copy of all events EXCEPT connection requests, otherwise they sometimes fail with EVENT_DISCONNECT
if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
{
rdmacm_accept(ev);
}
else
{
events_copy.push_back(*ev);
}
r = rdma_ack_cm_event(ev);
if (r != 0)
{
@ -96,7 +104,7 @@ void osd_messenger_t::handle_rdmacm_events()
auto ev = &evl;
if (ev->event == RDMA_CM_EVENT_CONNECT_REQUEST)
{
rdmacm_accept(ev);
// Do nothing, handled above
}
else if (ev->event == RDMA_CM_EVENT_CONNECT_ERROR ||
ev->event == RDMA_CM_EVENT_REJECTED ||
@ -287,29 +295,34 @@ void osd_messenger_t::rdmacm_accept(rdma_cm_event *ev)
rdma_destroy_id(ev->id);
return;
}
rdma_context->cm_refs++;
// Wrap into a new msgr_rdma_connection_t
msgr_rdma_connection_t *conn = new msgr_rdma_connection_t;
conn->ctx = rdma_context;
conn->max_send = rdma_max_send;
conn->max_recv = rdma_max_recv;
conn->max_sge = rdma_max_sge > rdma_context->attrx.orig_attr.max_sge
? rdma_context->attrx.orig_attr.max_sge : rdma_max_sge;
conn->max_msg = rdma_max_msg;
// Wait for RDMA_CM_ESTABLISHED, and enable the connection only after it
auto conn = new rdmacm_connecting_t;
conn->cmid = ev->id;
conn->qp = ev->id->qp;
auto cl = new osd_client_t();
cl->peer_fd = fake_fd;
cl->peer_state = PEER_RDMA;
cl->peer_addr = *(sockaddr_storage*)rdma_get_peer_addr(ev->id);
cl->in_buf = malloc_or_die(receive_buffer_size);
cl->rdma_conn = conn;
clients[fake_fd] = cl;
rdmacm_connections[ev->id] = cl;
// Add initial receive request(s)
try_recv_rdma(cl);
fprintf(stderr, "[OSD %ju] new client %d: connection from %s via RDMA-CM\n", this->osd_num, fake_fd,
addr_to_string(cl->peer_addr).c_str());
conn->peer_fd = fake_fd;
conn->parsed_addr = *(sockaddr_storage*)rdma_get_peer_addr(ev->id);
conn->rdma_context = rdma_context;
rdmacm_set_conn_timeout(conn);
rdmacm_connecting[ev->id] = conn;
fprintf(stderr, "[OSD %ju] new client %d: connection from %s via RDMA-CM\n", this->osd_num, conn->peer_fd,
addr_to_string(conn->parsed_addr).c_str());
}
void osd_messenger_t::rdmacm_set_conn_timeout(rdmacm_connecting_t *conn)
{
conn->timeout_ms = peer_connect_timeout*1000;
if (peer_connect_timeout > 0)
{
conn->timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, cmid = conn->cmid](int timer_id)
{
auto conn = rdmacm_connecting.at(cmid);
conn->timeout_id = -1;
if (conn->peer_osd)
fprintf(stderr, "RDMA-CM connection to %s timed out\n", conn->addr.c_str());
else
fprintf(stderr, "Incoming RDMA-CM connection from %s timed out\n", addr_to_string(conn->parsed_addr).c_str());
rdmacm_on_connect_peer_error(cmid, -EPIPE);
});
}
}
void osd_messenger_t::rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res)
@ -332,15 +345,18 @@ void osd_messenger_t::rdmacm_on_connect_peer_error(rdma_cm_id *cmid, int res)
}
rdmacm_connecting.erase(cmid);
delete conn;
if (!disable_tcp)
if (peer_osd)
{
// Fall back to TCP instead of just reporting the error to on_connect_peer()
try_connect_peer_tcp(peer_osd, addr.c_str(), tcp_port);
}
else
{
// TCP is disabled
on_connect_peer(peer_osd, res == 0 ? -EINVAL : (res > 0 ? -res : res));
if (!disable_tcp)
{
// Fall back to TCP instead of just reporting the error to on_connect_peer()
try_connect_peer_tcp(peer_osd, addr.c_str(), tcp_port);
}
else
{
// TCP is disabled
on_connect_peer(peer_osd, res == 0 ? -EINVAL : (res > 0 ? -res : res));
}
}
}
@ -374,6 +390,8 @@ void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::stri
on_connect_peer(peer_osd, res);
return;
}
if (log_level > 0)
fprintf(stderr, "Trying to connect to OSD %ju at %s:%d via RDMA-CM\n", peer_osd, addr.c_str(), rdmacm_port);
auto conn = new rdmacm_connecting_t;
rdmacm_connecting[cmid] = conn;
conn->cmid = cmid;
@ -383,19 +401,7 @@ void osd_messenger_t::rdmacm_try_connect_peer(uint64_t peer_osd, const std::stri
conn->parsed_addr = sa;
conn->rdmacm_port = rdmacm_port;
conn->tcp_port = fallback_tcp_port;
conn->timeout_ms = peer_connect_timeout*1000;
conn->timeout_id = -1;
if (peer_connect_timeout > 0)
{
conn->timeout_id = tfd->set_timer(1000*peer_connect_timeout, false, [this, cmid](int timer_id)
{
auto conn = rdmacm_connecting.at(cmid);
conn->timeout_id = -1;
fprintf(stderr, "RDMA-CM connection to %s timed out\n", conn->addr.c_str());
rdmacm_on_connect_peer_error(cmid, -EPIPE);
return;
});
}
rdmacm_set_conn_timeout(conn);
if (rdma_resolve_addr(cmid, NULL, (sockaddr*)&conn->parsed_addr, conn->timeout_ms) != 0)
{
auto res = -errno;
@ -494,7 +500,7 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
// Wrap into a new msgr_rdma_connection_t
msgr_rdma_connection_t *rc = new msgr_rdma_connection_t;
rc->ctx = conn->rdma_context;
rc->ctx->cm_refs++;
rc->ctx->cm_refs++; // FIXME now unused, count also connecting_t's when used
rc->max_send = rdma_max_send;
rc->max_recv = rdma_max_recv;
rc->max_sge = rdma_max_sge > rc->ctx->attrx.orig_attr.max_sge
@ -514,14 +520,20 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
cl->rdma_conn = rc;
clients[conn->peer_fd] = cl;
if (conn->timeout_id >= 0)
{
tfd->clear_timer(conn->timeout_id);
}
delete conn;
rdmacm_connecting.erase(cmid);
rdmacm_connections[cmid] = cl;
if (log_level > 0)
if (log_level > 0 && peer_osd)
{
fprintf(stderr, "Successfully connected with OSD %ju using RDMA-CM\n", peer_osd);
}
// Add initial receive request(s)
try_recv_rdma(cl);
osd_peer_fds[peer_osd] = cl->peer_fd;
on_connect_peer(peer_osd, cl->peer_fd);
if (peer_osd)
{
check_peer_config(cl);
}
}

View File

@ -58,6 +58,12 @@ struct osd_changer_t
state = 100;
return;
}
if (set_reweight && new_reweight > 1)
{
result = (cli_result_t){ .err = EINVAL, .text = "Reweight can't be larger than 1" };
state = 100;
return;
}
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {

View File

@ -22,8 +22,8 @@ int nfs3_fsstat_proc(void *opaque, rpc_op_t *rop)
{
auto ttb = pst_it->second["total_raw_tb"].number_value();
auto ftb = (pst_it->second["total_raw_tb"].number_value() - pst_it->second["used_raw_tb"].number_value());
tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)2<<40);
tbytes = ttb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)1<<40);
fbytes = ftb / pst_it->second["raw_to_usable"].number_value() * ((uint64_t)1<<40);
}
*reply = (FSSTAT3res){
.status = NFS3_OK,