Support reloading state when etcd says "revisions were compacted"
Test / test_etcd_fail (push) Successful in 46s Details
Test / buildenv (push) Successful in 9s Details
Test / build (push) Successful in 2m38s Details
Test / make_test (push) Successful in 33s Details
Test / test_add_osd (push) Successful in 2m38s Details
Test / test_cas (push) Successful in 7s Details
Test / test_change_pg_count (push) Successful in 40s Details
Test / test_change_pg_count_ec (push) Successful in 40s Details
Test / test_change_pg_size (push) Successful in 7s Details
Test / test_create_nomaxid (push) Successful in 7s Details
Test / test_interrupted_rebalance (push) Successful in 3m9s Details
Test / test_interrupted_rebalance_imm (push) Successful in 1m38s Details
Test / test_interrupted_rebalance_ec (push) Successful in 1m54s Details
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m36s Details
Test / test_failure_domain (push) Successful in 9s Details
Test / test_snapshot (push) Successful in 23s Details
Test / test_snapshot_ec (push) Successful in 22s Details
Test / test_minsize_1 (push) Successful in 14s Details
Test / test_move_reappear (push) Successful in 19s Details
Test / test_rm (push) Successful in 12s Details
Test / test_snapshot_chain (push) Successful in 2m2s Details
Test / test_snapshot_chain_ec (push) Successful in 2m38s Details
Test / test_snapshot_down (push) Successful in 21s Details
Test / test_snapshot_down_ec (push) Successful in 24s Details
Test / test_splitbrain (push) Successful in 15s Details
Test / test_rebalance_verify (push) Successful in 3m10s Details
Test / test_rebalance_verify_imm (push) Successful in 3m10s Details
Test / test_rebalance_verify_ec (push) Successful in 3m27s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 6m2s Details
Test / test_write (push) Successful in 35s Details
Test / test_write_xor (push) Successful in 45s Details
Test / test_write_no_same (push) Successful in 22s Details
Test / test_heal_pg_size_2 (push) Successful in 4m0s Details
Test / test_heal_ec (push) Successful in 3m52s Details
Test / test_scrub (push) Successful in 1m1s Details
Test / test_scrub_zero_osd_2 (push) Successful in 42s Details
Test / test_scrub_xor (push) Successful in 34s Details
Test / test_scrub_pg_size_3 (push) Successful in 53s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 45s Details
Test / test_scrub_ec (push) Successful in 26s Details

Before this change, OSDs almost always died when one of the etcds was restarted,
even though the rest of them were still in quorum and the lease was still active.
cached-reads
Vitaliy Filippov 2023-07-07 01:16:26 +03:00
parent 2f999d8607
commit 161a23c966
3 changed files with 31 additions and 18 deletions

View File

@ -357,7 +357,7 @@ void etcd_state_client_t::start_etcd_watcher()
watch_id == ETCD_OSD_STATE_WATCH_ID)
etcd_watches_initialised++;
if (etcd_watches_initialised == 4 && this->log_level > 0)
fprintf(stderr, "Successfully subscribed to etcd at %s\n", selected_etcd_address.c_str());
fprintf(stderr, "Successfully subscribed to etcd at %s\n", cur_addr.c_str());
}
if (data["result"]["canceled"].bool_value())
{
@ -368,15 +368,17 @@ void etcd_state_client_t::start_etcd_watcher()
// so we should restart from the beginning if we can
if (on_reload_hook != NULL)
{
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
data["result"]["compact_revision"].uint64_value());
if (etcd_watch_ws)
// check to not trigger on_reload_hook multiple times
if (etcd_watch_ws != NULL)
{
fprintf(stderr, "Revisions before %lu were compacted by etcd, reloading state\n",
data["result"]["compact_revision"].uint64_value());
http_close(etcd_watch_ws);
etcd_watch_ws = NULL;
etcd_watch_revision = 0;
on_reload_hook();
}
etcd_watch_revision = 0;
on_reload_hook();
return;
}
else
{
@ -423,13 +425,9 @@ void etcd_state_client_t::start_etcd_watcher()
}
if (msg->eof)
{
fprintf(stderr, "Disconnected from etcd %s\n", cur_addr.c_str());
if (cur_addr == selected_etcd_address)
{
fprintf(stderr, "Disconnected from etcd %s\n", selected_etcd_address.c_str());
selected_etcd_address = "";
}
else
fprintf(stderr, "Disconnected from etcd\n");
if (etcd_watch_ws)
{
http_close(etcd_watch_ws);
@ -446,6 +444,7 @@ void etcd_state_client_t::start_etcd_watcher()
else if (etcd_watches_initialised > 0)
{
// Connection was live, retry immediately
etcd_watches_initialised = 0;
start_etcd_watcher();
}
}

View File

@ -198,13 +198,14 @@ class osd_t
void on_change_pg_history_hook(pool_id_t pool_id, pg_num_t pg_num);
void on_change_etcd_state_hook(std::map<std::string, etcd_kv_t> & changes);
void on_load_config_hook(json11::Json::object & changes);
void on_reload_config_hook(json11::Json::object & changes);
json11::Json on_load_pgs_checks_hook();
void on_load_pgs_hook(bool success);
void bind_socket();
void acquire_lease();
json11::Json get_osd_state();
void create_osd_state();
void renew_lease();
void renew_lease(bool reload);
void print_stats();
void print_slow();
void reset_stats();

View File

@ -70,6 +70,7 @@ void osd_t::init_cluster()
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
st_cli.load_pgs_checks_hook = [this]() { return on_load_pgs_checks_hook(); };
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
st_cli.on_reload_hook = [this]() { st_cli.load_global_config(); };
peering_state = OSD_LOADING_PGS;
st_cli.load_global_config();
}
@ -395,6 +396,14 @@ void osd_t::on_load_config_hook(json11::Json::object & global_config)
parse_config(true);
bind_socket();
acquire_lease();
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_reload_config_hook(cfg); };
}
// Config hook used for reloads: on_load_config_hook() replaces
// st_cli.on_load_config_hook with a lambda that forwards to this method once
// the initial load has run.
// Stores the received global configuration, re-parses it and renews the
// lease with reload=true. Unlike on_load_config_hook(), it does not call
// bind_socket() or acquire_lease().
void osd_t::on_reload_config_hook(json11::Json::object & global_config)
{
etcd_global_config = global_config;
// NOTE(review): parse_config() is defined outside this view; the initial load passes true, the reload passes false - confirm the exact meaning of the flag
parse_config(false);
// reload=true: in the visible part of renew_lease(), the non-error branch calls st_cli.load_pgs() when run_primary is set
renew_lease(true);
}
// Acquire lease
@ -424,7 +433,7 @@ void osd_t::acquire_lease()
);
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
{
renew_lease();
renew_lease(false);
});
}
@ -499,11 +508,11 @@ void osd_t::create_osd_state()
}
// Renew lease
void osd_t::renew_lease()
void osd_t::renew_lease(bool reload)
{
st_cli.etcd_call("/lease/keepalive", json11::Json::object {
{ "ID", etcd_lease_id }
}, st_cli.etcd_quick_timeout, 0, 0, [this](std::string err, json11::Json data)
}, st_cli.etcd_quick_timeout, 0, 0, [this, reload](std::string err, json11::Json data)
{
if (err == "" && data["result"]["TTL"].string_value() == "")
{
@ -522,15 +531,20 @@ void osd_t::renew_lease()
force_stop(1);
}
// Retry
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this](int timer_id)
tfd->set_timer(st_cli.etcd_quick_timeout, false, [this, reload](int timer_id)
{
renew_lease();
renew_lease(reload);
});
}
else
{
etcd_failed_attempts = 0;
report_statistics();
// Reload PGs
if (reload && run_primary)
{
st_cli.load_pgs();
}
}
});
}
@ -560,7 +574,6 @@ void osd_t::force_stop(int exitcode)
json11::Json osd_t::on_load_pgs_checks_hook()
{
assert(this->pgs.size() == 0);
json11::Json::array checks = {
json11::Json::object {
{ "target", "LEASE" },