Retry consul connection attempts and then die

trace-sqes
Vitaliy Filippov 2020-04-15 15:33:18 +03:00
parent d78ce509c6
commit 089b4eb208
4 changed files with 33 additions and 1 deletions

View File

@ -30,7 +30,7 @@ libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o
g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring
OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_flush.o osd_peering_pg.o \
osd_primary.o osd_cluster.o osd_rmw.o json11.o timerfd_interval.o base64.o
osd_primary.o osd_cluster.o osd_rmw.o json11.o timerfd_interval.o base64.o timerfd_manager.o
base64.o: base64.cpp base64.h
g++ $(CXXFLAGS) -c -o $@ $<
osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h

View File

@ -42,6 +42,8 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
print_stats();
});
this->tfd = new timerfd_manager_t(ringloop);
if (run_primary)
init_primary();
@ -51,6 +53,11 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
osd_t::~osd_t()
{
if (tfd)
{
delete tfd;
tfd = NULL;
}
if (stats_tfd)
{
delete stats_tfd;

7
osd.h
View File

@ -16,6 +16,7 @@
#include "blockstore.h"
#include "ringloop.h"
#include "timerfd_interval.h"
#include "timerfd_manager.h"
#include "osd_ops.h"
#include "osd_peering_pg.h"
#include "json11/json11.hpp"
@ -48,6 +49,9 @@
#define MAX_RECOVERY_QUEUE 2048
#define DEFAULT_RECOVERY_QUEUE 4
#define MAX_CONSUL_ATTEMPTS 5
#define CONSUL_RETRY_INTERVAL 1000
//#define OSD_STUB
extern const char* osd_op_names[];
@ -210,6 +214,8 @@ class osd_t
// peer OSDs
std::vector<std::string> bind_addresses;
int consul_failed_attempts = 0;
std::map<uint64_t, int> osd_peer_fds;
std::map<pg_num_t, pg_t> pgs;
std::set<pg_num_t> dirty_pgs;
@ -233,6 +239,7 @@ class osd_t
uint64_t pg_stripe_size = 4*1024*1024; // 4 MB by default
ring_loop_t *ringloop;
timerfd_interval *stats_tfd = NULL, *sync_tfd = NULL, *consul_tfd = NULL;
timerfd_manager_t *tfd = NULL;
int wait_state = 0;
int epoll_fd = 0;

View File

@ -132,9 +132,27 @@ void osd_t::report_status()
{
int pos = res.find("\r\n\r\n");
if (pos >= 0)
{
res = res.substr(pos+4);
}
if (err != 0 || res != "true")
{
consul_failed_attempts++;
printf("Error reporting state to Consul: code %d (%s), response text: %s\n", err, strerror(err), res.c_str());
if (consul_failed_attempts > MAX_CONSUL_ATTEMPTS)
{
throw std::runtime_error("Cluster connection failed");
}
// Retry
tfd->set_timer(CONSUL_RETRY_INTERVAL, false, [this](int timer_id)
{
report_status();
});
}
else
{
consul_failed_attempts = 0;
}
});
}