Add a minimum interval for etcd_state_client to reload state

(To prevent excessive load on etcd during outages)
Wait for all "up" OSDs to be connected before starting PG
2025-03-19 02:36:09 +03:00 · 2025-03-19 02:36:09 +03:00
7 changed files with 77 additions and 0 deletions
--- a/docs/config/network.en.md
+++ b/docs/config/network.en.md
@ -30,6 +30,7 @@ between clients, OSDs and etcd.
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
 - [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
+- [etcd_min_reload_interval](#etcd_min_reload_interval)

 ## tcp_header_buffer_size

@ -261,3 +262,13 @@ etcd_report_interval to guarantee that keepalive actually works.

 etcd websocket ping interval required to keep the connection alive and
 detect disconnections quickly.
+
+## etcd_min_reload_interval
+
+- Type: milliseconds
+- Default: 1000
+- Can be changed online: yes
+
+Minimum interval between full etcd state reloads. Introduced to prevent
+excessive load on etcd during outages, when etcd can't keep up with event
+streams and cancels them.
--- a/docs/config/network.ru.md
+++ b/docs/config/network.ru.md
@ -30,6 +30,7 @@
 - [etcd_slow_timeout](#etcd_slow_timeout)
 - [etcd_keepalive_timeout](#etcd_keepalive_timeout)
 - [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
+- [etcd_min_reload_interval](#etcd_min_reload_interval)

 ## tcp_header_buffer_size

@ -271,3 +272,13 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
 - Можно менять на лету: да

 Интервал проверки живости вебсокет-подключений к etcd.
+
+## etcd_min_reload_interval
+
+- Тип: миллисекунды
+- Значение по умолчанию: 1000
+- Можно менять на лету: да
+
+Минимальный интервал полной перезагрузки состояния из etcd. Добавлено для
+предотвращения избыточной нагрузки на etcd во время отказов, когда etcd не
+успевает рассылать потоки событий и отменяет их.
--- a/docs/config/src/network.yml
+++ b/docs/config/src/network.yml
@ -306,3 +306,15 @@
    detect disconnections quickly.
  info_ru: |
    Интервал проверки живости вебсокет-подключений к etcd.
+- name: etcd_min_reload_interval
+  type: ms
+  default: 1000
+  online: true
+  info: |
+    Minimum interval between full etcd state reloads. Introduced to prevent
+    excessive load on etcd during outages, when etcd can't keep up with event
+    streams and cancels them.
+  info_ru: |
+    Минимальный интервал полной перезагрузки состояния из etcd. Добавлено для
+    предотвращения избыточной нагрузки на etcd во время отказов, когда etcd не
+    успевает рассылать потоки событий и отменяет их.
--- a/docs/usage/cli.ru.md
+++ b/docs/usage/cli.ru.md
@ -22,6 +22,8 @@ vitastor-cli - интерфейс командной строки для адм
 - [flatten](#flatten)
 - [rm-data](#rm-data)
 - [merge-data](#merge-data)
+- [describe](#describe)
+- [fix](#fix)
 - [alloc-osd](#alloc-osd)
 - [rm-osd](#rm-osd)
 - [osd-tree](#osd-tree)
--- a/src/client/etcd_state_client.cpp
+++ b/src/client/etcd_state_client.cpp
@ -31,6 +31,11 @@ etcd_state_client_t::~etcd_state_client_t()
        keepalive_client = NULL;
    }
 #endif
+    if (load_pgs_timer_id >= 0)
+    {
+        tfd->clear_timer(load_pgs_timer_id);
+        load_pgs_timer_id = -1;
+    }
 }

 #ifndef __MOCK__
@ -143,6 +148,7 @@ void etcd_state_client_t::etcd_call(std::string api, json11::Json payload, int t
                }
                if (interval > 0)
                {
+                    // FIXME: Prevent destruction of etcd_state_client if timers or requests are active
                    tfd->set_timer(interval, false, [this, api, payload, timeout, retries, interval, callback](int)
                    {
                        etcd_call(api, payload, timeout, retries, interval, callback);
@ -271,6 +277,11 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
    {
        this->etcd_quick_timeout = 1000;
    }
+    this->etcd_min_reload_interval = config["etcd_min_reload_interval"].uint64_value();
+    if (this->etcd_min_reload_interval <= 0)
+    {
+        this->etcd_min_reload_interval = 50;
+    }
    if (this->etcd_ws_keepalive_interval != old_etcd_ws_keepalive_interval && ws_keepalive_timer >= 0)
    {
 #ifndef __MOCK__
@ -603,6 +614,23 @@ void etcd_state_client_t::load_global_config()

 void etcd_state_client_t::load_pgs()
 {
+    timespec tv;
+    clock_gettime(CLOCK_REALTIME, &tv);
+    uint64_t ms_passed = (tv.tv_sec-etcd_last_reload.tv_sec)*1000 + (tv.tv_nsec-etcd_last_reload.tv_nsec)/1000000;
+    if (ms_passed < etcd_min_reload_interval)
+    {
+        if (load_pgs_timer_id < 0)
+        {
+            load_pgs_timer_id = tfd->set_timer(etcd_min_reload_interval+50-ms_passed, false, [this](int) { load_pgs(); });
+        }
+        return;
+    }
+    etcd_last_reload = tv;
+    if (load_pgs_timer_id >= 0)
+    {
+        tfd->clear_timer(load_pgs_timer_id);
+        load_pgs_timer_id = -1;
+    }
    json11::Json::array txn = {
        json11::Json::object {
            { "request_range", json11::Json::object {
--- a/src/client/etcd_state_client.h
+++ b/src/client/etcd_state_client.h
@ -108,6 +108,7 @@ public:
    int max_etcd_attempts = 5;
    int etcd_quick_timeout = 1000;
    int etcd_slow_timeout = 5000;
+    int etcd_min_reload_interval = 1000;
    bool infinite_start = true;
    uint64_t global_block_size = DEFAULT_BLOCK_SIZE;
    uint32_t global_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
@ -122,6 +123,8 @@ public:
    uint64_t etcd_watch_revision_config = 0;
    uint64_t etcd_watch_revision_osd = 0;
    uint64_t etcd_watch_revision_pg = 0;
+    timespec etcd_last_reload = {};
+    int load_pgs_timer_id = -1;
    std::map<pool_id_t, pool_config_t> pool_config;
    std::map<osd_num_t, json11::Json> peer_states;
    std::set<osd_num_t> seen_peers;
--- a/src/osd/osd_peering.cpp
+++ b/src/osd/osd_peering.cpp
@ -199,6 +199,7 @@ void osd_t::start_pg_peering(pg_t & pg)
    drop_dirty_pg_connections({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
    // Try to connect with current peers if they're up, but we don't have connections to them
    // Otherwise we may erroneously decide that the pg is incomplete :-)
+    bool all_connected = true;
    for (auto pg_osd: pg.all_peers)
    {
        if (pg_osd != this->osd_num &&
@ -206,8 +207,17 @@ void osd_t::start_pg_peering(pg_t & pg)
            msgr.wanted_peers.find(pg_osd) == msgr.wanted_peers.end())
        {
            msgr.connect_peer(pg_osd, st_cli.peer_states[pg_osd]);
+            if (!st_cli.peer_states[pg_osd].is_null())
+                all_connected = false;
        }
    }
+    if (!all_connected)
+    {
+        // Wait until all OSDs are either connected or their /osd/state disappears from etcd
+        pg.state = PG_INCOMPLETE;
+        report_pg_state(pg);
+        return;
+    }
    // Calculate current write OSD set
    pg.pg_cursize = 0;
    pg.cur_set.resize(pg.target_set.size());
Author	SHA1	Message	Date
Vitaliy Filippov	0f80c87b43	Add a minimum interval for etcd_state_client to reload state (To prevent excessive load on etcd during outages)	2025-03-19 02:36:09 +03:00
Vitaliy Filippov	e0953fd502	Wait for all "up" OSDs to be connected before starting PG	2025-03-19 02:36:09 +03:00