diff --git a/docs/config/osd.en.md b/docs/config/osd.en.md index c436e233..6f7d2dbd 100644 --- a/docs/config/osd.en.md +++ b/docs/config/osd.en.md @@ -17,6 +17,7 @@ initialization and can be changed with an OSD restart. - [autosync_interval](#autosync_interval) - [autosync_writes](#autosync_writes) - [recovery_queue_depth](#recovery_queue_depth) +- [recovery_pg_switch](#recovery_pg_switch) - [recovery_sync_batch](#recovery_sync_batch) - [readonly](#readonly) - [no_recovery](#no_recovery) @@ -115,6 +116,16 @@ Maximum recovery operations per one primary OSD at any given moment of time. Currently it's the only parameter available to tune the speed or recovery and rebalancing, but it's planned to implement more. +## recovery_pg_switch + +- Type: integer +- Default: 128 + +Number of recovery operations before switching to recovery of the next PG. +The idea is to mix all PGs during recovery for more even space and load +distribution but still benefit from recovery queue depth greater than 1. +Degraded PGs are scanned first in any case. + ## recovery_sync_batch - Type: integer diff --git a/docs/config/osd.ru.md b/docs/config/osd.ru.md index 0faea608..f52ef7fb 100644 --- a/docs/config/osd.ru.md +++ b/docs/config/osd.ru.md @@ -18,6 +18,7 @@ - [autosync_interval](#autosync_interval) - [autosync_writes](#autosync_writes) - [recovery_queue_depth](#recovery_queue_depth) +- [recovery_pg_switch](#recovery_pg_switch) - [recovery_sync_batch](#recovery_sync_batch) - [readonly](#readonly) - [no_recovery](#no_recovery) @@ -119,6 +120,17 @@ OSD, чтобы успевать очищать журнал - без них OSD для ускорения или замедления восстановления и перебалансировки данных, но в планах реализация других параметров. +## recovery_pg_switch + +- Тип: целое число +- Значение по умолчанию: 128 + +Число операций восстановления перед переключением на восстановление другой PG. 
+Идея заключается в том, чтобы восстанавливать все PG одновременно для более +равномерного распределения места и нагрузки, но при этом всё равно выигрывать +от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом +случае сканируются первыми. + ## recovery_sync_batch - Тип: целое число diff --git a/docs/config/src/osd.yml b/docs/config/src/osd.yml index e2144a1b..c5ee2489 100644 --- a/docs/config/src/osd.yml +++ b/docs/config/src/osd.yml @@ -102,6 +102,20 @@ момент времени. На данный момент единственный параметр, который можно менять для ускорения или замедления восстановления и перебалансировки данных, но в планах реализация других параметров. +- name: recovery_pg_switch + type: int + default: 128 + info: | + Number of recovery operations before switching to recovery of the next PG. + The idea is to mix all PGs during recovery for more even space and load + distribution but still benefit from recovery queue depth greater than 1. + Degraded PGs are scanned first in any case. + info_ru: | + Число операций восстановления перед переключением на восстановление другой PG. + Идея заключается в том, чтобы восстанавливать все PG одновременно для более + равномерного распределения места и нагрузки, но при этом всё равно выигрывать + от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом + случае сканируются первыми. 
- name: recovery_sync_batch type: int default: 16 diff --git a/src/osd.cpp b/src/osd.cpp index 435bbdb2..0fc0cb55 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -163,6 +163,9 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params) recovery_queue_depth = config["recovery_queue_depth"].uint64_value(); if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE) recovery_queue_depth = DEFAULT_RECOVERY_QUEUE; + recovery_pg_switch = config["recovery_pg_switch"].uint64_value(); + if (recovery_pg_switch < 1) + recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH; recovery_sync_batch = config["recovery_sync_batch"].uint64_value(); if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE) recovery_sync_batch = DEFAULT_RECOVERY_BATCH; diff --git a/src/osd.h b/src/osd.h index c788ce16..37810216 100644 --- a/src/osd.h +++ b/src/osd.h @@ -34,6 +34,7 @@ #define DEFAULT_AUTOSYNC_WRITES 128 #define MAX_RECOVERY_QUEUE 2048 #define DEFAULT_RECOVERY_QUEUE 4 +#define DEFAULT_RECOVERY_PG_SWITCH 128 #define DEFAULT_RECOVERY_BATCH 16 //#define OSD_STUB @@ -108,6 +109,7 @@ class osd_t int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds int autosync_writes = DEFAULT_AUTOSYNC_WRITES; int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE; + int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH; int recovery_sync_batch = DEFAULT_RECOVERY_BATCH; int inode_vanish_time = 60; int log_level = 0; @@ -135,7 +137,10 @@ class osd_t uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0; int peering_state = 0; std::map recovery_ops; - int recovery_done = 0; + bool recovery_last_degraded = true; + pool_pg_num_t recovery_last_pg; + object_id recovery_last_oid; + int recovery_pg_done = 0, recovery_done = 0; osd_op_t *autosync_op = NULL; // Unstable writes diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index 83b345ea..9050dbb0 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -226,42 +226,51 @@ 
bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t bool osd_t::pick_next_recovery(osd_recovery_op_t &op) { - if (!no_recovery) + if (!pgs.size()) { - for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++) - { - if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED)) - { - for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++) - { - if (recovery_ops.find(obj_it->first) == recovery_ops.end()) - { - op.degraded = true; - op.oid = obj_it->first; - return true; - } - } - } - } + return false; } - if (!no_rebalance) + // Restart scanning from the same degraded/misplaced status as the last time + for (int tried_degraded = 0; tried_degraded < 2; tried_degraded++) { - for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++) + if (recovery_last_degraded ? !no_recovery : !no_rebalance) { // Don't try to "recover" misplaced objects if "recovery" would make them degraded - if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED)) + auto mask = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED); + auto check = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_HAS_MISPLACED); + // Restart scanning from the same PG as the last time + for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++) { - for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++) + if ((pg_it->second.state & mask) == check) { - if (recovery_ops.find(obj_it->first) == recovery_ops.end()) + auto & src = recovery_last_degraded ? 
pg_it->second.degraded_objects : pg_it->second.misplaced_objects; + assert(src.size() > 0); + // Restart scanning from the next object + for (auto obj_it = src.upper_bound(recovery_last_oid); obj_it != src.end(); obj_it++) { - op.degraded = false; - op.oid = obj_it->first; - return true; + if (recovery_ops.find(obj_it->first) == recovery_ops.end()) + { + op.degraded = recovery_last_degraded; + recovery_last_oid = op.oid = obj_it->first; + recovery_pg_done++; + // Switch to another PG after recovery_pg_switch operations + // to always mix all PGs during recovery but still benefit + // from recovery queue depth greater than 1 + if (recovery_pg_done >= recovery_pg_switch) + { + recovery_pg_done = 0; + recovery_last_pg.pg_num++; + recovery_last_oid = {}; + } + return true; + } } } } } + recovery_last_degraded = !recovery_last_degraded; + recovery_last_pg = {}; + recovery_last_oid = {}; } return false; } diff --git a/src/osd_id.h b/src/osd_id.h index 78b8c7c2..2ebeb817 100644 --- a/src/osd_id.h +++ b/src/osd_id.h @@ -28,3 +28,13 @@ inline bool operator < (const pool_pg_num_t & a, const pool_pg_num_t & b) { return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num; } + +inline bool operator == (const pool_pg_num_t & a, const pool_pg_num_t & b) +{ + return a.pool_id == b.pool_id && a.pg_num == b.pg_num; +} + +inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b) +{ + return a.pool_id != b.pool_id || a.pg_num != b.pg_num; +} diff --git a/src/osd_peering.cpp b/src/osd_peering.cpp index 576191db..69a3b1a7 100644 --- a/src/osd_peering.cpp +++ b/src/osd_peering.cpp @@ -32,7 +32,16 @@ void osd_t::handle_peers() if (p.second.state & PG_HAS_UNCLEAN) peering_state = peering_state | OSD_FLUSHING_PGS; else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED)) + { peering_state = peering_state | OSD_RECOVERING; + if (p.second.state & PG_HAS_DEGRADED) + { + // Restart recovery from degraded objects + recovery_last_degraded = true; + 
recovery_last_pg = {}; + recovery_last_oid = {}; + } + } ringloop->wakeup(); return; }