forked from vitalif/vitastor
Add a new recovery_pg_switch setting to mix all PGs during recovery
parent
d7bd36dc32
commit
998e24adf8
|
@ -17,6 +17,7 @@ initialization and can be changed with an OSD restart.
|
||||||
- [autosync_interval](#autosync_interval)
|
- [autosync_interval](#autosync_interval)
|
||||||
- [autosync_writes](#autosync_writes)
|
- [autosync_writes](#autosync_writes)
|
||||||
- [recovery_queue_depth](#recovery_queue_depth)
|
- [recovery_queue_depth](#recovery_queue_depth)
|
||||||
|
- [recovery_pg_switch](#recovery_pg_switch)
|
||||||
- [recovery_sync_batch](#recovery_sync_batch)
|
- [recovery_sync_batch](#recovery_sync_batch)
|
||||||
- [readonly](#readonly)
|
- [readonly](#readonly)
|
||||||
- [no_recovery](#no_recovery)
|
- [no_recovery](#no_recovery)
|
||||||
|
@ -115,6 +116,16 @@ Maximum recovery operations per one primary OSD at any given moment of time.
|
||||||
Currently it's the only parameter available to tune the speed of recovery
|
Currently it's the only parameter available to tune the speed of recovery
|
||||||
and rebalancing, but it's planned to implement more.
|
and rebalancing, but it's planned to implement more.
|
||||||
|
|
||||||
|
## recovery_pg_switch
|
||||||
|
|
||||||
|
- Type: integer
|
||||||
|
- Default: 128
|
||||||
|
|
||||||
|
Number of recovery operations before switching to recovery of the next PG.
|
||||||
|
The idea is to mix all PGs during recovery for more even space and load
|
||||||
|
distribution but still benefit from recovery queue depth greater than 1.
|
||||||
|
Degraded PGs are scanned first in any case.
|
||||||
|
|
||||||
## recovery_sync_batch
|
## recovery_sync_batch
|
||||||
|
|
||||||
- Type: integer
|
- Type: integer
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
- [autosync_interval](#autosync_interval)
|
- [autosync_interval](#autosync_interval)
|
||||||
- [autosync_writes](#autosync_writes)
|
- [autosync_writes](#autosync_writes)
|
||||||
- [recovery_queue_depth](#recovery_queue_depth)
|
- [recovery_queue_depth](#recovery_queue_depth)
|
||||||
|
- [recovery_pg_switch](#recovery_pg_switch)
|
||||||
- [recovery_sync_batch](#recovery_sync_batch)
|
- [recovery_sync_batch](#recovery_sync_batch)
|
||||||
- [readonly](#readonly)
|
- [readonly](#readonly)
|
||||||
- [no_recovery](#no_recovery)
|
- [no_recovery](#no_recovery)
|
||||||
|
@ -119,6 +120,17 @@ OSD, чтобы успевать очищать журнал - без них OSD
|
||||||
для ускорения или замедления восстановления и перебалансировки данных, но
|
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||||
в планах реализация других параметров.
|
в планах реализация других параметров.
|
||||||
|
|
||||||
|
## recovery_pg_switch
|
||||||
|
|
||||||
|
- Тип: целое число
|
||||||
|
- Значение по умолчанию: 128
|
||||||
|
|
||||||
|
Число операций восстановления перед переключением на восстановление другой PG.
|
||||||
|
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
|
||||||
|
равномерного распределения места и нагрузки, но при этом всё равно выигрывать
|
||||||
|
от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
|
||||||
|
случае сканируются первыми.
|
||||||
|
|
||||||
## recovery_sync_batch
|
## recovery_sync_batch
|
||||||
|
|
||||||
- Тип: целое число
|
- Тип: целое число
|
||||||
|
|
|
@ -102,6 +102,20 @@
|
||||||
момент времени. На данный момент единственный параметр, который можно менять
|
момент времени. На данный момент единственный параметр, который можно менять
|
||||||
для ускорения или замедления восстановления и перебалансировки данных, но
|
для ускорения или замедления восстановления и перебалансировки данных, но
|
||||||
в планах реализация других параметров.
|
в планах реализация других параметров.
|
||||||
|
- name: recovery_pg_switch
|
||||||
|
type: int
|
||||||
|
default: 128
|
||||||
|
info: |
|
||||||
|
Number of recovery operations before switching to recovery of the next PG.
|
||||||
|
The idea is to mix all PGs during recovery for more even space and load
|
||||||
|
distribution but still benefit from recovery queue depth greater than 1.
|
||||||
|
Degraded PGs are scanned first in any case.
|
||||||
|
info_ru: |
|
||||||
|
Число операций восстановления перед переключением на восстановление другой PG.
|
||||||
|
Идея заключается в том, чтобы восстанавливать все PG одновременно для более
|
||||||
|
равномерного распределения места и нагрузки, но при этом всё равно выигрывать
|
||||||
|
от глубины очереди восстановления, большей, чем 1. Деградированные PG в любом
|
||||||
|
случае сканируются первыми.
|
||||||
- name: recovery_sync_batch
|
- name: recovery_sync_batch
|
||||||
type: int
|
type: int
|
||||||
default: 16
|
default: 16
|
||||||
|
|
|
@ -163,6 +163,9 @@ void osd_t::parse_config(const json11::Json & config, bool allow_disk_params)
|
||||||
recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
|
recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
|
||||||
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
|
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
|
||||||
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||||
|
recovery_pg_switch = config["recovery_pg_switch"].uint64_value();
|
||||||
|
if (recovery_pg_switch < 1)
|
||||||
|
recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||||
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
|
||||||
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
|
||||||
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||||
|
|
|
@ -34,6 +34,7 @@
|
||||||
#define DEFAULT_AUTOSYNC_WRITES 128
|
#define DEFAULT_AUTOSYNC_WRITES 128
|
||||||
#define MAX_RECOVERY_QUEUE 2048
|
#define MAX_RECOVERY_QUEUE 2048
|
||||||
#define DEFAULT_RECOVERY_QUEUE 4
|
#define DEFAULT_RECOVERY_QUEUE 4
|
||||||
|
#define DEFAULT_RECOVERY_PG_SWITCH 128
|
||||||
#define DEFAULT_RECOVERY_BATCH 16
|
#define DEFAULT_RECOVERY_BATCH 16
|
||||||
|
|
||||||
//#define OSD_STUB
|
//#define OSD_STUB
|
||||||
|
@ -108,6 +109,7 @@ class osd_t
|
||||||
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
|
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
|
||||||
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
|
int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
|
||||||
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
|
||||||
|
int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH;
|
||||||
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
|
||||||
int inode_vanish_time = 60;
|
int inode_vanish_time = 60;
|
||||||
int log_level = 0;
|
int log_level = 0;
|
||||||
|
@ -135,7 +137,10 @@ class osd_t
|
||||||
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
|
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
|
||||||
int peering_state = 0;
|
int peering_state = 0;
|
||||||
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
std::map<object_id, osd_recovery_op_t> recovery_ops;
|
||||||
int recovery_done = 0;
|
bool recovery_last_degraded = true;
|
||||||
|
pool_pg_num_t recovery_last_pg;
|
||||||
|
object_id recovery_last_oid;
|
||||||
|
int recovery_pg_done = 0, recovery_done = 0;
|
||||||
osd_op_t *autosync_op = NULL;
|
osd_op_t *autosync_op = NULL;
|
||||||
|
|
||||||
// Unstable writes
|
// Unstable writes
|
||||||
|
|
|
@ -226,42 +226,51 @@ bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t
|
||||||
|
|
||||||
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
|
||||||
{
|
{
|
||||||
if (!no_recovery)
|
if (!pgs.size())
|
||||||
{
|
{
|
||||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
return false;
|
||||||
{
|
|
||||||
if ((pg_it->second.state & (PG_ACTIVE | PG_HAS_DEGRADED)) == (PG_ACTIVE | PG_HAS_DEGRADED))
|
|
||||||
{
|
|
||||||
for (auto obj_it = pg_it->second.degraded_objects.begin(); obj_it != pg_it->second.degraded_objects.end(); obj_it++)
|
|
||||||
{
|
|
||||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
|
||||||
{
|
|
||||||
op.degraded = true;
|
|
||||||
op.oid = obj_it->first;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (!no_rebalance)
|
// Restart scanning from the same degraded/misplaced status as the last time
|
||||||
|
for (int tried_degraded = 0; tried_degraded < 2; tried_degraded++)
|
||||||
{
|
{
|
||||||
for (auto pg_it = pgs.begin(); pg_it != pgs.end(); pg_it++)
|
if (recovery_last_degraded ? !no_recovery : !no_rebalance)
|
||||||
{
|
{
|
||||||
// Don't try to "recover" misplaced objects if "recovery" would make them degraded
|
// Don't try to "recover" misplaced objects if "recovery" would make them degraded
|
||||||
if ((pg_it->second.state & (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED)) == (PG_ACTIVE | PG_HAS_MISPLACED))
|
auto mask = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_DEGRADED | PG_HAS_MISPLACED);
|
||||||
|
auto check = recovery_last_degraded ? (PG_ACTIVE | PG_HAS_DEGRADED) : (PG_ACTIVE | PG_HAS_MISPLACED);
|
||||||
|
// Restart scanning from the same PG as the last time
|
||||||
|
for (auto pg_it = pgs.lower_bound(recovery_last_pg); pg_it != pgs.end(); pg_it++)
|
||||||
{
|
{
|
||||||
for (auto obj_it = pg_it->second.misplaced_objects.begin(); obj_it != pg_it->second.misplaced_objects.end(); obj_it++)
|
if ((pg_it->second.state & mask) == check)
|
||||||
{
|
{
|
||||||
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
auto & src = recovery_last_degraded ? pg_it->second.degraded_objects : pg_it->second.misplaced_objects;
|
||||||
|
assert(src.size() > 0);
|
||||||
|
// Restart scanning from the next object
|
||||||
|
for (auto obj_it = src.upper_bound(recovery_last_oid); obj_it != src.end(); obj_it++)
|
||||||
{
|
{
|
||||||
op.degraded = false;
|
if (recovery_ops.find(obj_it->first) == recovery_ops.end())
|
||||||
op.oid = obj_it->first;
|
{
|
||||||
return true;
|
op.degraded = recovery_last_degraded;
|
||||||
|
recovery_last_oid = op.oid = obj_it->first;
|
||||||
|
recovery_pg_done++;
|
||||||
|
// Switch to another PG after recovery_pg_switch operations
|
||||||
|
// to always mix all PGs during recovery but still benefit
|
||||||
|
// from recovery queue depth greater than 1
|
||||||
|
if (recovery_pg_done >= recovery_pg_switch)
|
||||||
|
{
|
||||||
|
recovery_pg_done = 0;
|
||||||
|
recovery_last_pg.pg_num++;
|
||||||
|
recovery_last_oid = {};
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
recovery_last_degraded = !recovery_last_degraded;
|
||||||
|
recovery_last_pg = {};
|
||||||
|
recovery_last_oid = {};
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
10
src/osd_id.h
10
src/osd_id.h
|
@ -28,3 +28,13 @@ inline bool operator < (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
||||||
{
|
{
|
||||||
return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
|
return a.pool_id < b.pool_id || a.pool_id == b.pool_id && a.pg_num < b.pg_num;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool operator == (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
||||||
|
{
|
||||||
|
return a.pool_id == b.pool_id && a.pg_num == b.pg_num;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b)
|
||||||
|
{
|
||||||
|
return a.pool_id != b.pool_id || a.pg_num != b.pg_num;
|
||||||
|
}
|
||||||
|
|
|
@ -32,7 +32,16 @@ void osd_t::handle_peers()
|
||||||
if (p.second.state & PG_HAS_UNCLEAN)
|
if (p.second.state & PG_HAS_UNCLEAN)
|
||||||
peering_state = peering_state | OSD_FLUSHING_PGS;
|
peering_state = peering_state | OSD_FLUSHING_PGS;
|
||||||
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
|
else if (p.second.state & (PG_HAS_DEGRADED | PG_HAS_MISPLACED))
|
||||||
|
{
|
||||||
peering_state = peering_state | OSD_RECOVERING;
|
peering_state = peering_state | OSD_RECOVERING;
|
||||||
|
if (p.second.state & PG_HAS_DEGRADED)
|
||||||
|
{
|
||||||
|
// Restart recovery from degraded objects
|
||||||
|
recovery_last_degraded = true;
|
||||||
|
recovery_last_pg = {};
|
||||||
|
recovery_last_oid = {};
|
||||||
|
}
|
||||||
|
}
|
||||||
ringloop->wakeup();
|
ringloop->wakeup();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue