From 6442010f932d72242ce2639d11bbf7f462cc5296 Mon Sep 17 00:00:00 2001
From: Vitaliy Filippov
Date: Fri, 12 May 2023 23:12:32 +0300
Subject: [PATCH] Skip offline PGs during state reporting when the state is already deleted or taken over by another OSD

This fixes OSDs being unable to report PG states in rare conditions
---
 src/osd_cluster.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/osd_cluster.cpp b/src/osd_cluster.cpp
index 7415e3f1..0aeba06a 100644
--- a/src/osd_cluster.cpp
+++ b/src/osd_cluster.cpp
@@ -812,11 +812,21 @@ void osd_t::report_pg_states()
                     pg_it->second.cur_state != 0)
                 {
                     pg_state_exists = true;
+                    if (pg.state == PG_OFFLINE && pg_it->second.cur_primary != this->osd_num)
+                    {
+                        // Nothing to check or report, PG is already taken over by another OSD
+                        continue;
+                    }
                 }
             }
         }
         if (!pg_state_exists)
         {
+            if (pg.state == PG_OFFLINE)
+            {
+                // Nothing to check or report, PG is already stopped
+                continue;
+            }
             // Check that the PG key does not exist
             // Failed check indicates an unsuccessful PG lock attempt in this case
             checks.push_back(json11::Json::object {
@@ -901,6 +911,15 @@ void osd_t::report_pg_states()
     {
         etcd_reporting_pg_state = false;
         if (!data["succeeded"].bool_value())
+        {
+            std::string rpgnames = "";
+            for (auto pp: reporting_pgs)
+            {
+                rpgnames += (rpgnames.size() ? ", " : "")+std::to_string(pp.pool_pg_num.pool_id)+"/"+std::to_string(pp.pool_pg_num.pg_num);
+            }
+            printf("Error reporting PG %s states, will repeat the attempt: %s\n", rpgnames.c_str(), err.c_str());
+        }
+        if (!data["succeeded"].bool_value())
         {
             // One of PG state updates failed, put dirty flags back
             for (auto pp: reporting_pgs)