diff --git a/Makefile b/Makefile index 637289a2..839bb79b 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,7 @@ libfio_blockstore.so: ./libblockstore.so fio_engine.cpp json11.o g++ $(CXXFLAGS) -shared -o libfio_blockstore.so fio_engine.cpp json11.o ./libblockstore.so -ltcmalloc_minimal -luring OSD_OBJS := osd.o osd_secondary.o osd_receive.o osd_send.o osd_peering.o osd_flush.o osd_peering_pg.o \ - osd_primary.o osd_cluster.o osd_rmw.o json11.o timerfd_interval.o base64.o + osd_primary.o osd_cluster.o osd_rmw.o json11.o timerfd_interval.o base64.o timerfd_manager.o base64.o: base64.cpp base64.h g++ $(CXXFLAGS) -c -o $@ $< osd_secondary.o: osd_secondary.cpp osd.h osd_ops.h ringloop.h diff --git a/osd.cpp b/osd.cpp index fee1c981..d6577ce7 100644 --- a/osd.cpp +++ b/osd.cpp @@ -42,6 +42,8 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo print_stats(); }); + this->tfd = new timerfd_manager_t(ringloop); + if (run_primary) init_primary(); @@ -51,6 +53,11 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo osd_t::~osd_t() { + if (tfd) + { + delete tfd; + tfd = NULL; + } if (stats_tfd) { delete stats_tfd; diff --git a/osd.h b/osd.h index d2ddd457..f33f068a 100644 --- a/osd.h +++ b/osd.h @@ -16,6 +16,7 @@ #include "blockstore.h" #include "ringloop.h" #include "timerfd_interval.h" +#include "timerfd_manager.h" #include "osd_ops.h" #include "osd_peering_pg.h" #include "json11/json11.hpp" @@ -48,6 +49,9 @@ #define MAX_RECOVERY_QUEUE 2048 #define DEFAULT_RECOVERY_QUEUE 4 +#define MAX_CONSUL_ATTEMPTS 5 +#define CONSUL_RETRY_INTERVAL 1000 + //#define OSD_STUB extern const char* osd_op_names[]; @@ -210,6 +214,8 @@ class osd_t // peer OSDs std::vector bind_addresses; + int consul_failed_attempts = 0; + std::map osd_peer_fds; std::map pgs; std::set dirty_pgs; @@ -233,6 +239,7 @@ class osd_t uint64_t pg_stripe_size = 4*1024*1024; // 4 MB by default ring_loop_t *ringloop; timerfd_interval *stats_tfd = NULL, *sync_tfd = NULL, *consul_tfd = NULL; + timerfd_manager_t *tfd = NULL; int wait_state = 0; int epoll_fd = 0; diff --git a/osd_cluster.cpp b/osd_cluster.cpp index 0da3f7bb..c3efe4c0 100644 --- a/osd_cluster.cpp +++ b/osd_cluster.cpp @@ -132,9 +132,27 @@ void osd_t::report_status() { int pos = res.find("\r\n\r\n"); if (pos >= 0) + { res = res.substr(pos+4); + } if (err != 0 || res != "true") + { + consul_failed_attempts++; printf("Error reporting state to Consul: code %d (%s), response text: %s\n", err, strerror(err), res.c_str()); + if (consul_failed_attempts > MAX_CONSUL_ATTEMPTS) + { + throw std::runtime_error("Cluster connection failed"); + } + // Retry + tfd->set_timer(CONSUL_RETRY_INTERVAL, false, [this](int timer_id) + { + report_status(); + }); + } + else + { + consul_failed_attempts = 0; + } }); }