Add a new recovery_pg_switch setting to mix all PGs during recovery

2022-12-30 01:53:29 +03:00 · 2022-12-30 01:53:29 +03:00 · 998e24adf8
parent d7bd36dc32
commit 998e24adf8
8 changed files with 98 additions and 25 deletions
--- a/docs/config/osd.en.md
+++ b/docs/config/osd.en.md
@ -17,6 +17,7 @@ initialization and can be changed with an OSD restart.
 - [autosync_interval](#autosync_interval)
 - [autosync_writes](#autosync_writes)
 - [recovery_queue_depth](#recovery_queue_depth)
+- [recovery_pg_switch](#recovery_pg_switch)
 - [recovery_sync_batch](#recovery_sync_batch)
 - [readonly](#readonly)
 - [no_recovery](#no_recovery)
@ -115,6 +116,16 @@ Maximum recovery operations per one primary OSD at any given moment of time.
 Currently it's the only parameter available to tune the speed or recovery
 and rebalancing, but it's planned to implement more.

+## recovery_pg_switch
+
+- Type: integer
+- Default: 128
+
+Number of recovery operations before switching to recovery of the next PG.
+The idea is to mix all PGs during recovery for more even space and load
+distribution but still benefit from recovery queue depth greater than 1.
+Degraded PGs are scanned first in any case.
+
 ## recovery_sync_batch

 - Type: integer
--- a/docs/config/osd.ru.md
+++ b/docs/config/osd.ru.md
@ -18,6 +18,7 @@
 - [autosync_interval](#autosync_interval)
 - [autosync_writes](#autosync_writes)
 - [recovery_queue_depth](#recovery_queue_depth)
+- [recovery_pg_switch](#recovery_pg_switch)
 - [recovery_sync_batch](#recovery_sync_batch)
 - [readonly](#readonly)
 - [no_recovery](#no_recovery)
@ -119,6 +120,17 @@ OSD, чтобы успевать очищать журнал - без них OSD
 для ускорения или замедления восстановления и перебалансировки данных, но
 в планах реализация других параметров.

+## recovery_pg_switch
+
+- Тип: целое число
+- Значение по умолчанию: 128
+
+Число операций восстановления перед переключением на восстановление другой PG.
+Идея заключается в том, чтобы восстанавливать все PG одновременно для более
+равномерного распределения места и нагрузки, но при этом всё равно выигрывать
+от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
+случае сканируются первыми.
+
 ## recovery_sync_batch

 - Тип: целое число
--- a/docs/config/src/osd.yml
+++ b/docs/config/src/osd.yml
@ -102,6 +102,20 @@
    момент времени. На данный момент единственный параметр, который можно менять
    для ускорения или замедления восстановления и перебалансировки данных, но
    в планах реализация других параметров.
+- name: recovery_pg_switch
+  type: int
+  default: 128
+  info: |
+    Number of recovery operations before switching to recovery of the next PG.
+    The idea is to mix all PGs during recovery for more even space and load
+    distribution but still benefit from recovery queue depth greater than 1.
+    Degraded PGs are scanned first in any case.
+  info_ru: |
+    Число операций восстановления перед переключением на восстановление другой PG.
+    Идея заключается в том, чтобы восстанавливать все PG одновременно для более
+    равномерного распределения места и нагрузки, но при этом всё равно выигрывать
+    от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
+    случае сканируются первыми.
 - name: recovery_sync_batch
  type: int
  default: 16
--- a/src/osd.cpp
+++ b/src/osd.cpp
@ -163,6 +163,9 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
    recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
    if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
        recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
+    if (recovery_pg_switch < 1)
+        recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
    recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
    if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
        recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
--- a/src/osd.h
+++ b/src/osd.h
@ -34,6 +34,7 @@
 #define DEFAULT_AUTOSYNC_WRITES 128
 #define MAX_RECOVERY_QUEUE 2048
 #define DEFAULT_RECOVERY_QUEUE 4
+#define DEFAULT_RECOVERY_PG_SWITCH 128
 #define DEFAULT_RECOVERY_BATCH 16

 //#define OSD_STUB
@ -108,6 +109,7 @@ class osd_t
    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
    int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
    int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
+    int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
    int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
    int inode_vanish_time = 60;
    int log_level = 0;
@ -135,7 +137,10 @@ class osd_t
    uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
    int peering_state = 0;
    std::map<object_id, osd_recovery_op_t> recovery_ops;
-    int recovery_done = 0;
+    bool recovery_last_degraded = true;
+    pool_pg_num_t recovery_last_pg;
+    object_id recovery_last_oid;
+    int recovery_pg_done = 0, recovery_done = 0;
    osd_op_t *autosync_op = NULL;

    // Unstable writes
--- a/src/osd_flush.cpp
+++ b/src/osd_flush.cpp
@ -226,42 +226,51 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t

 bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
 {
-    if (!no_recovery)
+    if (!pgs.size())
    {
-        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
-        {
-            if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
-            {
-                for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
-                {
-                    if (recovery_ops.find(obj_it->first) == recovery_ops.end())
-                    {
-                        op.degraded = true;
-                        op.oid = obj_it->first;
-                        return true;
-                    }
-                }
-            }
-        }
+        return false;
    }
-    if (!no_rebalance)
+    // Restart scanning from the same degraded/misplaced status as the last time
+    for (int tried_degraded = 0; tried_degraded < 2; tried_degraded++)
    {
-        for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
+        if (recovery_last_degraded ? !no_recovery : !no_rebalance)
        {
            // Don't try to "recover" misplaced objects if "recovery" would make them degraded
-            if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
+            auto mask = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED);
+            auto check = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_HAS_MISPLACED);
+            // Restart scanning from the same PG as the last time
+            for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
            {
-                for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
+                if ((pg_it->second.state & mask) == check)
                {
-                    if (recovery_ops.find(obj_it->first) == recovery_ops.end())
+                    auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
+                    assert(src.size() > 0);
+                    // Restart scanning from the next object
+                    for (auto obj_it = src.upper_bound(recovery_last_oid); obj_it != src.end(); obj_it++)
                    {
-                        op.degraded = false;
-                        op.oid = obj_it->first;
-                        return true;
+                        if (recovery_ops.find(obj_it->first) == recovery_ops.end())
+                        {
+                            op.degraded = recovery_last_degraded;
+                            recovery_last_oid = op.oid = obj_it->first;
+                            recovery_pg_done++;
+                            // Switch to another PG after recovery_pg_switch operations
+                            // to always mix all PGs during recovery but still benefit
+                            // from recovery queue depth greater than 1
+                            if (recovery_pg_done >= recovery_pg_switch)
+                            {
+                                recovery_pg_done = 0;
+                                recovery_last_pg.pg_num++;
+                                recovery_last_oid = {};
+                            }
+                            return true;
+                        }
                    }
                }
            }
        }
+        recovery_last_degraded = !recovery_last_degraded;
+        recovery_last_pg = {};
+        recovery_last_oid = {};
    }
    return false;
 }
--- a/src/osd_id.h
+++ b/src/osd_id.h
@ -28,3 +28,13 @@ inline bool operator < (const pool_pg_num_t & a, const pool_pg_num_t & b)
 {
    return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
 }
+
+// Two pool/PG identifiers are equal when both the pool ID and the PG number match
+inline bool operator == (const pool_pg_num_t & a, const pool_pg_num_t & b)
+{
+    if (a.pool_id != b.pool_id)
+    {
+        return false;
+    }
+    return a.pg_num == b.pg_num;
+}
+
+// Two pool/PG identifiers differ when either the pool ID or the PG number differs
+inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b)
+{
+    return !(a.pool_id == b.pool_id && a.pg_num == b.pg_num);
+}
--- a/src/osd_peering.cpp
+++ b/src/osd_peering.cpp
@ -32,7 +32,16 @@ void osd_t::handle_peers()
                    if (p.second.state & PG_HAS_UNCLEAN)
                        peering_state = peering_state | OSD_FLUSHING_PGS;
                    else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
+                    {
                        peering_state = peering_state | OSD_RECOVERING;
+                        if (p.second.state & PG_HAS_DEGRADED)
+                        {
+                            // Restart recovery from degraded objects
+                            recovery_last_degraded = true;
+                            recovery_last_pg = {};
+                            recovery_last_oid = {};
+                        }
+                    }
                    ringloop->wakeup();
                    return;
                }