Support reloading state when etcd says "revisions were compacted"
Test / test_etcd_fail (push) Successful in 46s
Details
Test / buildenv (push) Successful in 9s
Details
Test / build (push) Successful in 2m38s
Details
Test / make_test (push) Successful in 33s
Details
Test / test_add_osd (push) Successful in 2m38s
Details
Test / test_cas (push) Successful in 7s
Details
Test / test_change_pg_count (push) Successful in 40s
Details
Test / test_change_pg_count_ec (push) Successful in 40s
Details
Test / test_change_pg_size (push) Successful in 7s
Details
Test / test_create_nomaxid (push) Successful in 7s
Details
Test / test_interrupted_rebalance (push) Successful in 3m9s
Details
Test / test_interrupted_rebalance_imm (push) Successful in 1m38s
Details
Test / test_interrupted_rebalance_ec (push) Successful in 1m54s
Details
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m36s
Details
Test / test_failure_domain (push) Successful in 9s
Details
Test / test_snapshot (push) Successful in 23s
Details
Test / test_snapshot_ec (push) Successful in 22s
Details
Test / test_minsize_1 (push) Successful in 14s
Details
Test / test_move_reappear (push) Successful in 19s
Details
Test / test_rm (push) Successful in 12s
Details
Test / test_snapshot_chain (push) Successful in 2m2s
Details
Test / test_snapshot_chain_ec (push) Successful in 2m38s
Details
Test / test_snapshot_down (push) Successful in 21s
Details
Test / test_snapshot_down_ec (push) Successful in 24s
Details
Test / test_splitbrain (push) Successful in 15s
Details
Test / test_rebalance_verify (push) Successful in 3m10s
Details
Test / test_rebalance_verify_imm (push) Successful in 3m10s
Details
Test / test_rebalance_verify_ec (push) Successful in 3m27s
Details
Test / test_rebalance_verify_ec_imm (push) Successful in 6m2s
Details
Test / test_write (push) Successful in 35s
Details
Test / test_write_xor (push) Successful in 45s
Details
Test / test_write_no_same (push) Successful in 22s
Details
Test / test_heal_pg_size_2 (push) Successful in 4m0s
Details
Test / test_heal_ec (push) Successful in 3m52s
Details
Test / test_scrub (push) Successful in 1m1s
Details
Test / test_scrub_zero_osd_2 (push) Successful in 42s
Details
Test / test_scrub_xor (push) Successful in 34s
Details
Test / test_scrub_pg_size_3 (push) Successful in 53s
Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 45s
Details
Test / test_scrub_ec (push) Successful in 26s
Details
Test / test_etcd_fail (push) Successful in 46s
Details
Test / buildenv (push) Successful in 9s
Details
Test / build (push) Successful in 2m38s
Details
Test / make_test (push) Successful in 33s
Details
Test / test_add_osd (push) Successful in 2m38s
Details
Test / test_cas (push) Successful in 7s
Details
Test / test_change_pg_count (push) Successful in 40s
Details
Test / test_change_pg_count_ec (push) Successful in 40s
Details
Test / test_change_pg_size (push) Successful in 7s
Details
Test / test_create_nomaxid (push) Successful in 7s
Details
Test / test_interrupted_rebalance (push) Successful in 3m9s
Details
Test / test_interrupted_rebalance_imm (push) Successful in 1m38s
Details
Test / test_interrupted_rebalance_ec (push) Successful in 1m54s
Details
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m36s
Details
Test / test_failure_domain (push) Successful in 9s
Details
Test / test_snapshot (push) Successful in 23s
Details
Test / test_snapshot_ec (push) Successful in 22s
Details
Test / test_minsize_1 (push) Successful in 14s
Details
Test / test_move_reappear (push) Successful in 19s
Details
Test / test_rm (push) Successful in 12s
Details
Test / test_snapshot_chain (push) Successful in 2m2s
Details
Test / test_snapshot_chain_ec (push) Successful in 2m38s
Details
Test / test_snapshot_down (push) Successful in 21s
Details
Test / test_snapshot_down_ec (push) Successful in 24s
Details
Test / test_splitbrain (push) Successful in 15s
Details
Test / test_rebalance_verify (push) Successful in 3m10s
Details
Test / test_rebalance_verify_imm (push) Successful in 3m10s
Details
Test / test_rebalance_verify_ec (push) Successful in 3m27s
Details
Test / test_rebalance_verify_ec_imm (push) Successful in 6m2s
Details
Test / test_write (push) Successful in 35s
Details
Test / test_write_xor (push) Successful in 45s
Details
Test / test_write_no_same (push) Successful in 22s
Details
Test / test_heal_pg_size_2 (push) Successful in 4m0s
Details
Test / test_heal_ec (push) Successful in 3m52s
Details
Test / test_scrub (push) Successful in 1m1s
Details
Test / test_scrub_zero_osd_2 (push) Successful in 42s
Details
Test / test_scrub_xor (push) Successful in 34s
Details
Test / test_scrub_pg_size_3 (push) Successful in 53s
Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 45s
Details
Test / test_scrub_ec (push) Successful in 26s
Details
Before this change, OSDs almost always died when one of the etcds was restarted, even though the rest of them was still in quorum and the lease was still activecached-reads
parent
2f999d8607
commit
161a23c966
|
@ -357,7 +357,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||
watch_id == ETCD_OSD_STATE_WATCH_ID)
|
||||
etcd_watches_initialised++;
|
||||
if (etcd_watches_initialised == 4 && this->log_level > 0)
|
||||
fprintf(stderr, "Successfully subscribed to etcd at %s\n", selected_etcd_address.c_str());
|
||||
fprintf(stderr, "Successfully subscribed to etcd at %s\n", cur_addr.c_str());
|
||||
}
|
||||
if (data["result"]["canceled"].bool_value())
|
||||
{
|
||||
|
@ -368,15 +368,17 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||
// so we should restart from the beginning if we can
|
||||
if (on_reload_hook != NULL)
|
||||
{
|
||||
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
|
||||
data["result"]["compact_revision"].uint64_value());
|
||||
if (etcd_watch_ws)
|
||||
// check to not trigger on_reload_hook multiple times
|
||||
if (etcd_watch_ws != NULL)
|
||||
{
|
||||
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
|
||||
data["result"]["compact_revision"].uint64_value());
|
||||
http_close(etcd_watch_ws);
|
||||
etcd_watch_ws = NULL;
|
||||
etcd_watch_revision = 0;
|
||||
on_reload_hook();
|
||||
}
|
||||
etcd_watch_revision = 0;
|
||||
on_reload_hook();
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -423,13 +425,9 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||
}
|
||||
if (msg->eof)
|
||||
{
|
||||
fprintf(stderr, "Disconnected from etcd %s\n", cur_addr.c_str());
|
||||
if (cur_addr == selected_etcd_address)
|
||||
{
|
||||
fprintf(stderr, "Disconnected from etcd %s\n", selected_etcd_address.c_str());
|
||||
selected_etcd_address = "";
|
||||
}
|
||||
else
|
||||
fprintf(stderr, "Disconnected from etcd\n");
|
||||
if (etcd_watch_ws)
|
||||
{
|
||||
http_close(etcd_watch_ws);
|
||||
|
@ -446,6 +444,7 @@ void etcd_state_client_t::start_etcd_watcher()
|
|||
else if (etcd_watches_initialised > 0)
|
||||
{
|
||||
// Connection was live, retry immediately
|
||||
etcd_watches_initialised = 0;
|
||||
start_etcd_watcher();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -198,13 +198,14 @@ class osd_t
|
|||
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
|
||||
void on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes);
|
||||
void on_load_config_hook(json11::Json::object & changes);
|
||||
void on_reload_config_hook(json11::Json::object & changes);
|
||||
json11::Json on_load_pgs_checks_hook();
|
||||
void on_load_pgs_hook(bool success);
|
||||
void bind_socket();
|
||||
void acquire_lease();
|
||||
json11::Json get_osd_state();
|
||||
void create_osd_state();
|
||||
void renew_lease();
|
||||
void renew_lease(bool reload);
|
||||
void print_stats();
|
||||
void print_slow();
|
||||
void reset_stats();
|
||||
|
|
|
@ -70,6 +70,7 @@ void osd_t::init_cluster()
|
|||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
|
||||
st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); };
|
||||
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
|
||||
st_cli.on_reload_hook = [this]() { st_cli.load_global_config(); };
|
||||
peering_state = OSD_LOADING_PGS;
|
||||
st_cli.load_global_config();
|
||||
}
|
||||
|
@ -395,6 +396,14 @@ void osd_t::on_load_config_hook(json11::Json::object & global_config)
|
|||
parse_config(true);
|
||||
bind_socket();
|
||||
acquire_lease();
|
||||
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_reload_config_hook(cfg); };
|
||||
}
|
||||
|
||||
void osd_t::on_reload_config_hook(json11::Json::object & global_config)
|
||||
{
|
||||
etcd_global_config = global_config;
|
||||
parse_config(false);
|
||||
renew_lease(true);
|
||||
}
|
||||
|
||||
// Acquire lease
|
||||
|
@ -424,7 +433,7 @@ void osd_t::acquire_lease()
|
|||
);
|
||||
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
|
||||
{
|
||||
renew_lease();
|
||||
renew_lease(false);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -499,11 +508,11 @@ void osd_t::create_osd_state()
|
|||
}
|
||||
|
||||
// Renew lease
|
||||
void osd_t::renew_lease()
|
||||
void osd_t::renew_lease(bool reload)
|
||||
{
|
||||
st_cli.etcd_call("/lease/keepalive", json11::Json::object {
|
||||
{ "ID", etcd_lease_id }
|
||||
}, st_cli.etcd_quick_timeout, 0, 0, [this](std::string err, json11::Json data)
|
||||
}, st_cli.etcd_quick_timeout, 0, 0, [this, reload](std::string err, json11::Json data)
|
||||
{
|
||||
if (err == "" && data["result"]["TTL"].string_value() == "")
|
||||
{
|
||||
|
@ -522,15 +531,20 @@ void osd_t::renew_lease()
|
|||
force_stop(1);
|
||||
}
|
||||
// Retry
|
||||
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
|
||||
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this, reload](int timer_id)
|
||||
{
|
||||
renew_lease();
|
||||
renew_lease(reload);
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
etcd_failed_attempts = 0;
|
||||
report_statistics();
|
||||
// Reload PGs
|
||||
if (reload && run_primary)
|
||||
{
|
||||
st_cli.load_pgs();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -560,7 +574,6 @@ void osd_t::force_stop(int exitcode)
|
|||
|
||||
json11::Json osd_t::on_load_pgs_checks_hook()
|
||||
{
|
||||
assert(this->pgs.size() == 0);
|
||||
json11::Json::array checks = {
|
||||
json11::Json::object {
|
||||
{ "target", "LEASE" },
|
||||
|
|
Loading…
Reference in New Issue