From 086667f568b6682757bb5ccbbf0cd372f3c7026f Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Mon, 8 Mar 2021 00:45:18 +0300 Subject: [PATCH] Do not check PG state key ownership if it doesn't exist yet This fixes a bug where OSDs would sometimes retry reporting updated PG states indefinitely without success when PGs transitioned from 'starting' to 'peering' too quickly --- src/osd_cluster.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/osd_cluster.cpp b/src/osd_cluster.cpp index 72164be8..860634b1 100644 --- a/src/osd_cluster.cpp +++ b/src/osd_cluster.cpp @@ -665,7 +665,21 @@ void osd_t::report_pg_states() auto & pg = pg_it->second; reporting_pgs.push_back({ *it, pg.history_changed }); std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)); - if (pg.state == PG_STARTING) + bool pg_state_exists = false; + if (pg.state != PG_STARTING) + { + auto pool_it = st_cli.pool_config.find(pg.pool_id); + if (pool_it != st_cli.pool_config.end()) + { + auto pg_it = pool_it->second.pg_config.find(pg.pg_num); + if (pg_it != pool_it->second.pg_config.end() && + pg_it->second.cur_state != 0) + { + pg_state_exists = true; + } + } + } + if (!pg_state_exists) { // Check that the PG key does not exist // Failed check indicates an unsuccessful PG lock attempt in this case @@ -677,9 +691,7 @@ void osd_t::report_pg_states() } else { - // Check that the key is ours - // Failed check indicates success for OFFLINE pgs (PG lock is already deleted) - // and an unexpected race condition for started pgs (PG lock is held by someone else) + // Check that the key is ours if it already exists checks.push_back(json11::Json::object { { "target", "LEASE" }, { "lease", etcd_lease_id },