forked from vitalif/vitastor
Support reloading state when etcd says "revisions were compacted"
Before this change, OSDs almost always died when one of the etcds was restarted, even though the rest of them were still in quorum and the lease was still active.
cached-reads
parent
2f999d8607
commit
161a23c966
|
@ -357,7 +357,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||
watch_id == ETCD_OSD_STATE_WATCH_ID)
|
||||
etcd_watches_initialised++;
|
||||
if (etcd_watches_initialised == 4 && this->log_level > 0)
|
||||
fprintf(stderr, "Successfully subscribed to etcd at %s\n", selected_etcd_address.c_str());
|
||||
fprintf(stderr, "Successfully subscribed to etcd at %s\n", cur_addr.c_str());
|
||||
}
|
||||
if (data["result"]["canceled"].bool_value())
|
||||
{
|
||||
|
@ -368,15 +368,17 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||
// so we should restart from the beginning if we can
|
||||
if (on_reload_hook != NULL)
|
||||
{
|
||||
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
|
||||
data["result"]["compact_revision"].uint64_value());
|
||||
if (etcd_watch_ws)
|
||||
// check to not trigger on_reload_hook multiple times
|
||||
if (etcd_watch_ws != NULL)
|
||||
{
|
||||
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
|
||||
data["result"]["compact_revision"].uint64_value());
|
||||
http_close(etcd_watch_ws);
|
||||
etcd_watch_ws = NULL;
|
||||
etcd_watch_revision = 0;
|
||||
on_reload_hook();
|
||||
}
|
||||
etcd_watch_revision = 0;
|
||||
on_reload_hook();
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -423,13 +425,9 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||
}
|
||||
if (msg->eof)
|
||||
{
|
||||
fprintf(stderr, "Disconnected from etcd %s\n", cur_addr.c_str());
|
||||
if (cur_addr == selected_etcd_address)
|
||||
{
|
||||
fprintf(stderr, "Disconnected from etcd %s\n", selected_etcd_address.c_str());
|
||||
selected_etcd_address = "";
|
||||
}
|
||||
else
|
||||
fprintf(stderr, "Disconnected from etcd\n");
|
||||
if (etcd_watch_ws)
|
||||
{
|
||||
http_close(etcd_watch_ws);
|
||||
|
@ -446,6 +444,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||
else if (etcd_watches_initialised > 0)
|
||||
{
|
||||
// Connection was live, retry immediately
|
||||
etcd_watches_initialised = 0;
|
||||
start_etcd_watcher();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -198,13 +198,14 @@ class osd_t
|
|||
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
||||
void on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes);
|
||||
void on_load_config_hook(json11::Json::object & changes);
|
||||
void on_reload_config_hook(json11::Json::object & changes);
|
||||
json11::Json on_load_pgs_checks_hook();
|
||||
void on_load_pgs_hook(bool success);
|
||||
void bind_socket();
|
||||
void acquire_lease();
|
||||
json11::Json get_osd_state();
|
||||
void create_osd_state();
|
||||
void renew_lease();
|
||||
void renew_lease(bool reload);
|
||||
void print_stats();
|
||||
void print_slow();
|
||||
void reset_stats();
|
||||
|
|
|
@ -70,6 +70,7 @@ void osd_t::init_cluster()
|
|||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||
st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); };
|
||||
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
|
||||
st_cli.on_reload_hook = [this]() { st_cli.load_global_config(); };
|
||||
peering_state = OSD_LOADING_PGS;
|
||||
st_cli.load_global_config();
|
||||
}
|
||||
|
@ -395,6 +396,14 @@ void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
|||
parse_config(true);
|
||||
bind_socket();
|
||||
acquire_lease();
|
||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_reload_config_hook(cfg); };
|
||||
}
|
||||
|
||||
// Config hook used for every global-config load after the first one:
// osd_t::on_load_config_hook re-points st_cli.on_load_config_hook at this
// method once the initial load has bound the socket and acquired the lease.
// Unlike the initial hook, it does not call bind_socket() or acquire_lease().
//
// global_config: the freshly loaded global configuration object.
void osd_t::on_reload_config_hook(json11::Json::object & global_config)
|
||||
{
|
||||
// Replace the stored global configuration with the newly loaded one
etcd_global_config = global_config;
|
||||
// The initial-load path calls parse_config(true); here the flag is false.
// NOTE(review): the flag's exact meaning is defined outside this view - confirm.
parse_config(false);
|
||||
// reload=true: after a successful keepalive, renew_lease() calls
// st_cli.load_pgs() when run_primary is set
renew_lease(true);
|
||||
}
|
||||
|
||||
// Acquire lease
|
||||
|
@ -424,7 +433,7 @@ void osd_t::acquire_lease()
|
|||
);
|
||||
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
|
||||
{
|
||||
renew_lease();
|
||||
renew_lease(false);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -499,11 +508,11 @@ void osd_t::create_osd_state()
|
|||
}
|
||||
|
||||
// Renew lease
|
||||
void osd_t::renew_lease()
|
||||
void osd_t::renew_lease(bool reload)
|
||||
{
|
||||
st_cli.etcd_call("/lease/keepalive", json11::Json::object {
|
||||
{ "ID", etcd_lease_id }
|
||||
}, st_cli.etcd_quick_timeout, 0, 0, [this](std::string err, json11::Json data)
|
||||
}, st_cli.etcd_quick_timeout, 0, 0, [this, reload](std::string err, json11::Json data)
|
||||
{
|
||||
if (err == "" && data["result"]["TTL"].string_value() == "")
|
||||
{
|
||||
|
@ -522,15 +531,20 @@ void osd_t::renew_lease()
|
|||
force_stop(1);
|
||||
}
|
||||
// Retry
|
||||
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
|
||||
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this, reload](int timer_id)
|
||||
{
|
||||
renew_lease();
|
||||
renew_lease(reload);
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
etcd_failed_attempts = 0;
|
||||
report_statistics();
|
||||
// Reload PGs
|
||||
if (reload && run_primary)
|
||||
{
|
||||
st_cli.load_pgs();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -560,7 +574,6 @@ void osd_t::force_stop(int exitcode)
|
|||
|
||||
json11::Json osd_t::on_load_pgs_checks_hook()
|
||||
{
|
||||
assert(this->pgs.size() == 0);
|
||||
json11::Json::array checks = {
|
||||
json11::Json::object {
|
||||
{ "target", "LEASE" },
|
||||
|
|
Loading…
Reference in New Issue