forked from vitalif/vitastor
Add a new recovery_pg_switch setting to mix all PGs during recovery
parent
d7bd36dc32
commit
998e24adf8
|
@ -17,6 +17,7 @@ initialization and can be changed with an OSD restart.
|
|||
- [autosync_interval](#autosync_interval)
|
||||
- [autosync_writes](#autosync_writes)
|
||||
- [recovery_queue_depth](#recovery_queue_depth)
|
||||
- [recovery_pg_switch](#recovery_pg_switch)
|
||||
- [recovery_sync_batch](#recovery_sync_batch)
|
||||
- [readonly](#readonly)
|
||||
- [no_recovery](#no_recovery)
|
||||
|
@ -115,6 +116,16 @@ Maximum recovery operations per one primary OSD at any given moment of time.
|
|||
Currently it's the only parameter available to tune the speed or recovery
|
||||
and rebalancing, but it's planned to implement more.
|
||||
|
||||
## recovery_pg_switch
|
||||
|
||||
- Type: integer
|
||||
- Default: 128
|
||||
|
||||
Number of recovery operations before switching to recovery of the next PG.
|
||||
The idea is to mix all PGs during recovery for more even space and load
|
||||
distribution but still benefit from recovery queue depth greater than 1.
|
||||
Degraded PGs are anyway scanned first.
|
||||
|
||||
## recovery_sync_batch
|
||||
|
||||
- Type: integer
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
- [autosync_interval](#autosync_interval)
|
||||
- [autosync_writes](#autosync_writes)
|
||||
- [recovery_queue_depth](#recovery_queue_depth)
|
||||
- [recovery_pg_switch](#recovery_pg_switch)
|
||||
- [recovery_sync_batch](#recovery_sync_batch)
|
||||
- [readonly](#readonly)
|
||||
- [no_recovery](#no_recovery)
|
||||
|
@ -119,6 +120,17 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
|||
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||
в планах реализация других параметров.
|
||||
|
||||
## recovery_pg_switch
|
||||
|
||||
- Тип: целое число
|
||||
- Значение по умолчанию: 128
|
||||
|
||||
Число операций восстановления перед переключением на восстановление другой PG.
|
||||
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
|
||||
равномерного распределения места и нагрузки, но при этом всё равно выигрывать
|
||||
от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
|
||||
случае сканируются первыми.
|
||||
|
||||
## recovery_sync_batch
|
||||
|
||||
- Тип: целое число
|
||||
|
|
|
@ -102,6 +102,20 @@
|
|||
момент времени. На данный момент единственный параметр, который можно менять
|
||||
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||
в планах реализация других параметров.
|
||||
- name: recovery_pg_switch
|
||||
type: int
|
||||
default: 128
|
||||
info: |
|
||||
Number of recovery operations before switching to recovery of the next PG.
|
||||
The idea is to mix all PGs during recovery for more even space and load
|
||||
distribution but still benefit from recovery queue depth greater than 1.
|
||||
Degraded PGs are anyway scanned first.
|
||||
info_ru: |
|
||||
Число операций восстановления перед переключением на восстановление другой PG.
|
||||
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
|
||||
равномерного распределения места и нагрузки, но при этом всё равно выигрывать
|
||||
от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
|
||||
случае сканируются первыми.
|
||||
- name: recovery_sync_batch
|
||||
type: int
|
||||
default: 16
|
||||
|
|
|
@ -163,6 +163,9 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
|
|||
recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
|
||||
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
|
||||
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
|
||||
if (recovery_pg_switch < 1)
|
||||
recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
||||
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
||||
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
#define DEFAULT_AUTOSYNC_WRITES 128
|
||||
#define MAX_RECOVERY_QUEUE 2048
|
||||
#define DEFAULT_RECOVERY_QUEUE 4
|
||||
#define DEFAULT_RECOVERY_PG_SWITCH 128
|
||||
#define DEFAULT_RECOVERY_BATCH 16
|
||||
|
||||
//#define OSD_STUB
|
||||
|
@ -108,6 +109,7 @@ class osd_t
|
|||
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
|
||||
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
|
||||
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||
int inode_vanish_time = 60;
|
||||
int log_level = 0;
|
||||
|
@ -135,7 +137,10 @@ class osd_t
|
|||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
|
||||
int peering_state = 0;
|
||||
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
||||
int recovery_done = 0;
|
||||
bool recovery_last_degraded = true;
|
||||
pool_pg_num_t recovery_last_pg;
|
||||
object_id recovery_last_oid;
|
||||
int recovery_pg_done = 0, recovery_done = 0;
|
||||
osd_op_t *autosync_op = NULL;
|
||||
|
||||
// Unstable writes
|
||||
|
|
|
@ -226,42 +226,51 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
|||
|
||||
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
||||
{
|
||||
if (!no_recovery)
|
||||
if (!pgs.size())
|
||||
{
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
|
||||
{
|
||||
for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
|
||||
{
|
||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||
{
|
||||
op.degraded = true;
|
||||
op.oid = obj_it->first;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (!no_rebalance)
|
||||
// Restart scanning from the same degraded/misplaced status as the last time
|
||||
for (int tried_degraded = 0; tried_degraded < 2; tried_degraded++)
|
||||
{
|
||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
||||
if (recovery_last_degraded ? !no_recovery : !no_rebalance)
|
||||
{
|
||||
// Don't try to "recover" misplaced objects if "recovery" would make them degraded
|
||||
if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
|
||||
auto mask = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED);
|
||||
auto check = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_HAS_MISPLACED);
|
||||
// Restart scanning from the same PG as the last time
|
||||
for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
|
||||
{
|
||||
for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
|
||||
if ((pg_it->second.state & mask) == check)
|
||||
{
|
||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||
auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
|
||||
assert(src.size() > 0);
|
||||
// Restart scanning from the next object
|
||||
for (auto obj_it = src.upper_bound(recovery_last_oid); obj_it != src.end(); obj_it++)
|
||||
{
|
||||
op.degraded = false;
|
||||
op.oid = obj_it->first;
|
||||
return true;
|
||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||
{
|
||||
op.degraded = recovery_last_degraded;
|
||||
recovery_last_oid = op.oid = obj_it->first;
|
||||
recovery_pg_done++;
|
||||
// Switch to another PG after recovery_pg_switch operations
|
||||
// to always mix all PGs during recovery but still benefit
|
||||
// from recovery queue depth greater than 1
|
||||
if (recovery_pg_done >= recovery_pg_switch)
|
||||
{
|
||||
recovery_pg_done = 0;
|
||||
recovery_last_pg.pg_num++;
|
||||
recovery_last_oid = {};
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
recovery_last_degraded = !recovery_last_degraded;
|
||||
recovery_last_pg = {};
|
||||
recovery_last_oid = {};
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
|
10
src/osd_id.h
10
src/osd_id.h
|
@ -28,3 +28,13 @@ inline bool operator < (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
|||
{
|
||||
return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
|
||||
}
|
||||
|
||||
inline bool operator == (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
||||
{
|
||||
return a.pool_id == b.pool_id && a.pg_num == b.pg_num;
|
||||
}
|
||||
|
||||
inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
||||
{
|
||||
return a.pool_id != b.pool_id || a.pg_num != b.pg_num;
|
||||
}
|
||||
|
|
|
@ -32,7 +32,16 @@ void osd_t::handle_peers()
|
|||
if (p.second.state & PG_HAS_UNCLEAN)
|
||||
peering_state = peering_state | OSD_FLUSHING_PGS;
|
||||
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
|
||||
{
|
||||
peering_state = peering_state | OSD_RECOVERING;
|
||||
if (p.second.state & PG_HAS_DEGRADED)
|
||||
{
|
||||
// Restart recovery from degraded objects
|
||||
recovery_last_degraded = true;
|
||||
recovery_last_pg = {};
|
||||
recovery_last_oid = {};
|
||||
}
|
||||
}
|
||||
ringloop->wakeup();
|
||||
return;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue