Re-peer PGs after stopping the peer

Fixes the bug where killing two peers at once led to PG state PG_DEGRADED|PG_HAS_INCOMPLETE instead of PG_INCOMPLETE
trace-sqes
Vitaliy Filippov 2020-05-23 18:41:28 +03:00
parent e614a98543
commit 7df384031a
1 changed file with 8 additions and 4 deletions

12
osd.cpp
View File

@ -401,12 +401,12 @@ void osd_t::cancel_op(osd_op_t *op)
void osd_t::stop_client(int peer_fd) void osd_t::stop_client(int peer_fd)
{ {
// FIXME Fix the bug where sometimes a dead peer is undetected which leads to PG DEGRADED|HAS_INCOMPLETE!
auto it = clients.find(peer_fd); auto it = clients.find(peer_fd);
if (it == clients.end()) if (it == clients.end())
{ {
return; return;
} }
uint64_t repeer_osd = 0;
osd_client_t cl = it->second; osd_client_t cl = it->second;
if (cl.peer_state == PEER_CONNECTED) if (cl.peer_state == PEER_CONNECTED)
{ {
@ -415,7 +415,8 @@ void osd_t::stop_client(int peer_fd)
// Reload configuration from etcd when the connection is dropped // Reload configuration from etcd when the connection is dropped
printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num); printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
st_cli.peer_states.erase(cl.osd_num); st_cli.peer_states.erase(cl.osd_num);
repeer_pgs(cl.osd_num); repeer_osd = cl.osd_num;
peering_state |= OSD_CONNECTING_PEERS;
} }
else else
{ {
@ -429,10 +430,9 @@ void osd_t::stop_client(int peer_fd)
} }
if (cl.osd_num) if (cl.osd_num)
{ {
osd_peer_fds.erase(cl.osd_num);
// Cancel outbound operations // Cancel outbound operations
cancel_osd_ops(cl); cancel_osd_ops(cl);
osd_peer_fds.erase(cl.osd_num);
peering_state |= OSD_CONNECTING_PEERS;
} }
if (cl.read_op) if (cl.read_op)
{ {
@ -456,6 +456,10 @@ void osd_t::stop_client(int peer_fd)
} }
free(cl.in_buf); free(cl.in_buf);
close(peer_fd); close(peer_fd);
if (repeer_osd)
{
repeer_pgs(repeer_osd);
}
} }
void osd_t::exec_op(osd_op_t *cur_op) void osd_t::exec_op(osd_op_t *cur_op)