Plug the new PG combinator into monitor
Test / buildenv (push) Successful in 10s
Details
Test / build (push) Successful in 12s
Details
Test / test_cas (push) Successful in 8s
Details
Test / make_test (push) Successful in 37s
Details
Test / test_change_pg_count (push) Successful in 38s
Details
Test / test_change_pg_size (push) Successful in 8s
Details
Test / test_change_pg_count_ec (push) Successful in 35s
Details
Test / test_create_nomaxid (push) Successful in 7s
Details
Test / test_etcd_fail (push) Successful in 1m24s
Details
Test / test_add_osd (push) Successful in 2m46s
Details
Test / test_interrupted_rebalance (push) Successful in 2m58s
Details
Test / test_interrupted_rebalance_imm (push) Successful in 2m59s
Details
Test / test_failure_domain (push) Successful in 10s
Details
Test / test_interrupted_rebalance_ec (push) Successful in 2m15s
Details
Test / test_snapshot (push) Successful in 48s
Details
Test / test_snapshot_ec (push) Successful in 42s
Details
Test / test_minsize_1 (push) Successful in 16s
Details
Test / test_rm (push) Successful in 15s
Details
Test / test_move_reappear (push) Successful in 22s
Details
Test / test_snapshot_down (push) Successful in 25s
Details
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m46s
Details
Test / test_snapshot_down_ec (push) Successful in 24s
Details
Test / test_splitbrain (push) Successful in 17s
Details
Test / test_snapshot_chain (push) Successful in 2m36s
Details
Test / test_snapshot_chain_ec (push) Successful in 3m1s
Details
Test / test_rebalance_verify_imm (push) Successful in 3m17s
Details
Test / test_rebalance_verify (push) Successful in 3m50s
Details
Test / test_switch_primary (push) Successful in 33s
Details
Test / test_write (push) Successful in 50s
Details
Test / test_write_xor (push) Successful in 56s
Details
Test / test_write_no_same (push) Successful in 14s
Details
Test / test_rebalance_verify_ec_imm (push) Successful in 3m29s
Details
Test / test_rebalance_verify_ec (push) Successful in 5m23s
Details
Test / test_heal_pg_size_2 (push) Successful in 4m23s
Details
Test / test_heal_ec (push) Successful in 4m57s
Details
Test / test_heal_csum_32k_dmj (push) Successful in 5m21s
Details
Test / test_heal_csum_32k_dj (push) Successful in 6m33s
Details
Test / test_heal_csum_32k (push) Successful in 6m55s
Details
Test / test_heal_csum_4k_dmj (push) Successful in 6m54s
Details
Test / test_scrub (push) Successful in 1m32s
Details
Test / test_scrub_zero_osd_2 (push) Successful in 1m12s
Details
Test / test_heal_csum_4k_dj (push) Successful in 7m12s
Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m1s
Details
Test / test_scrub_pg_size_3 (push) Successful in 1m41s
Details
Test / test_heal_csum_4k (push) Successful in 6m22s
Details
Test / test_scrub_ec (push) Successful in 44s
Details
Test / test_nfs (push) Successful in 16s
Details
Test / test_scrub_xor (push) Successful in 18s
Details
Test / buildenv (push) Successful in 10s
Details
Test / build (push) Successful in 12s
Details
Test / test_cas (push) Successful in 8s
Details
Test / make_test (push) Successful in 37s
Details
Test / test_change_pg_count (push) Successful in 38s
Details
Test / test_change_pg_size (push) Successful in 8s
Details
Test / test_change_pg_count_ec (push) Successful in 35s
Details
Test / test_create_nomaxid (push) Successful in 7s
Details
Test / test_etcd_fail (push) Successful in 1m24s
Details
Test / test_add_osd (push) Successful in 2m46s
Details
Test / test_interrupted_rebalance (push) Successful in 2m58s
Details
Test / test_interrupted_rebalance_imm (push) Successful in 2m59s
Details
Test / test_failure_domain (push) Successful in 10s
Details
Test / test_interrupted_rebalance_ec (push) Successful in 2m15s
Details
Test / test_snapshot (push) Successful in 48s
Details
Test / test_snapshot_ec (push) Successful in 42s
Details
Test / test_minsize_1 (push) Successful in 16s
Details
Test / test_rm (push) Successful in 15s
Details
Test / test_move_reappear (push) Successful in 22s
Details
Test / test_snapshot_down (push) Successful in 25s
Details
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m46s
Details
Test / test_snapshot_down_ec (push) Successful in 24s
Details
Test / test_splitbrain (push) Successful in 17s
Details
Test / test_snapshot_chain (push) Successful in 2m36s
Details
Test / test_snapshot_chain_ec (push) Successful in 3m1s
Details
Test / test_rebalance_verify_imm (push) Successful in 3m17s
Details
Test / test_rebalance_verify (push) Successful in 3m50s
Details
Test / test_switch_primary (push) Successful in 33s
Details
Test / test_write (push) Successful in 50s
Details
Test / test_write_xor (push) Successful in 56s
Details
Test / test_write_no_same (push) Successful in 14s
Details
Test / test_rebalance_verify_ec_imm (push) Successful in 3m29s
Details
Test / test_rebalance_verify_ec (push) Successful in 5m23s
Details
Test / test_heal_pg_size_2 (push) Successful in 4m23s
Details
Test / test_heal_ec (push) Successful in 4m57s
Details
Test / test_heal_csum_32k_dmj (push) Successful in 5m21s
Details
Test / test_heal_csum_32k_dj (push) Successful in 6m33s
Details
Test / test_heal_csum_32k (push) Successful in 6m55s
Details
Test / test_heal_csum_4k_dmj (push) Successful in 6m54s
Details
Test / test_scrub (push) Successful in 1m32s
Details
Test / test_scrub_zero_osd_2 (push) Successful in 1m12s
Details
Test / test_heal_csum_4k_dj (push) Successful in 7m12s
Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m1s
Details
Test / test_scrub_pg_size_3 (push) Successful in 1m41s
Details
Test / test_heal_csum_4k (push) Successful in 6m22s
Details
Test / test_scrub_ec (push) Successful in 44s
Details
Test / test_nfs (push) Successful in 16s
Details
Test / test_scrub_xor (push) Successful in 18s
Details
parent
29284bef40
commit
3629dbc54d
|
@ -15,6 +15,7 @@ These parameters only apply to Monitors.
|
|||
- [mon_stats_timeout](#mon_stats_timeout)
|
||||
- [osd_out_time](#osd_out_time)
|
||||
- [placement_levels](#placement_levels)
|
||||
- [use_old_pg_combinator](#use_old_pg_combinator)
|
||||
|
||||
## etcd_mon_ttl
|
||||
|
||||
|
@ -77,3 +78,11 @@ values. Smaller priority means higher level in tree. For example,
|
|||
levels are always predefined and can't be removed. If one of them is not
|
||||
present in the configuration, then it is defined with the default priority
|
||||
(100 for "host", 101 for "osd").
|
||||
|
||||
## use_old_pg_combinator
|
||||
|
||||
- Type: boolean
|
||||
- Default: false
|
||||
|
||||
Use the old PG combination generator which doesn't support [level_placement](pool.en.md#level_placement)
|
||||
and [raw_placement](pool.en.md#raw_placement) for pools which don't use these features.
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
- [mon_stats_timeout](#mon_stats_timeout)
|
||||
- [osd_out_time](#osd_out_time)
|
||||
- [placement_levels](#placement_levels)
|
||||
- [use_old_pg_combinator](#use_old_pg_combinator)
|
||||
|
||||
## etcd_mon_ttl
|
||||
|
||||
|
@ -78,3 +79,11 @@ OSD перед обновлением агрегированной статис
|
|||
"host" и "osd" являются предопределёнными и не могут быть удалены. Если
|
||||
один из них отсутствует в конфигурации, он доопределяется с приоритетом по
|
||||
умолчанию (100 для уровня "host", 101 для "osd").
|
||||
|
||||
## use_old_pg_combinator
|
||||
|
||||
- Тип: булево (да/нет)
|
||||
- Значение по умолчанию: false
|
||||
|
||||
Использовать старый генератор комбинаций PG, не поддерживающий [level_placement](pool.ru.md#level_placement)
|
||||
и [raw_placement](pool.ru.md#raw_placement) для пулов, которые не используют данные функции.
|
||||
|
|
|
@ -32,6 +32,8 @@ Parameters:
|
|||
- [pg_minsize](#pg_minsize)
|
||||
- [pg_count](#pg_count)
|
||||
- [failure_domain](#failure_domain)
|
||||
- [level_placement](#level_placement)
|
||||
- [raw_placement](#raw_placement)
|
||||
- [max_osd_combinations](#max_osd_combinations)
|
||||
- [block_size](#block_size)
|
||||
- [bitmap_granularity](#bitmap_granularity)
|
||||
|
@ -209,6 +211,69 @@ never put on OSDs in the same failure domain (for example, on the same host).
|
|||
So failure domain specifies the unit whose failure you are protecting yourself
|
||||
from.
|
||||
|
||||
## level_placement
|
||||
|
||||
- Type: string
|
||||
|
||||
Additional failure domain rules, applied in conjunction with failure_domain.
|
||||
Must be specified in the following form:
|
||||
|
||||
`<placement level>=<sequence of characters>, <level2>=<sequence2>, ...`
|
||||
|
||||
Sequence should be exactly [pg_size](#pg_size) characters long. Each character
|
||||
corresponds to an OSD in the PG of this pool. Equal characters mean that
|
||||
corresponding items of the PG should be placed into the same placement tree
|
||||
item at this level. Different characters mean that items should be placed into
|
||||
different items.
|
||||
|
||||
For example, if you want an EC 4+2 pool and you want every 2 chunks to be stored
|
||||
in its own datacenter and you also want each chunk to be stored on a different
|
||||
host, you should set `level_placement` to `dc=112233 host=123456`.
|
||||
|
||||
Or you can set `level_placement` to `dc=112233` and leave `failure_domain` empty,
|
||||
because `host` is the default `failure_domain` and it will be applied anyway.
|
||||
|
||||
Without this rule, it may happen that 3 chunks will be stored on OSDs in the
|
||||
same datacenter, and the data will become inaccessible if that datacenter goes
|
||||
down in this case.
|
||||
|
||||
Of course, you should group your hosts into datacenters before applying the rule
|
||||
by setting [placement_levels](monitor.en.md#placement_levels) to something like
|
||||
`{"dc":90,"host":100,"osd":110}` and add DCs to [node_placement](#placement-tree),
|
||||
like `{"dc1":{"level":"dc"},"host1":{"parent":"dc1"},...}`.
|
||||
|
||||
## raw_placement
|
||||
|
||||
- Type: string
|
||||
|
||||
Raw PG placement rules, specified in the form of a DSL (domain-specific language).
|
||||
Use only if you really know what you're doing :)
|
||||
|
||||
DSL specification:
|
||||
|
||||
```
|
||||
dsl := item | item ("\n" | ",") dsl
|
||||
item := "any" | rules
|
||||
rules := rule | rule rules
|
||||
rule := level operator arg
|
||||
level := /\w+/
|
||||
operator := "!=" | "=" | ">" | "?="
|
||||
arg := value | "(" values ")"
|
||||
values := value | value "," values
|
||||
value := item_ref | constant_id
|
||||
item_ref := /\d+/
|
||||
constant_id := /"([^"]+)"/
|
||||
```
|
||||
|
||||
"?=" operator means "preferred". I.e. `dc ?= "meow"` means "prefer datacenter meow
|
||||
for this chunk, but put into another dc if it's unavailable".
|
||||
|
||||
Examples:
|
||||
|
||||
- Simple 3 replicas with failure_domain=host: `any, host!=1, host!=(1,2)`
|
||||
- EC 4+2 in 3 DC: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
|
||||
- 1 replica in fixed DC + 2 in random DCs: `dc?=meow, dc!=1, dc!=(1,2)`
|
||||
|
||||
## max_osd_combinations
|
||||
|
||||
- Type: integer
|
||||
|
|
|
@ -31,6 +31,8 @@
|
|||
- [pg_minsize](#pg_minsize)
|
||||
- [pg_count](#pg_count)
|
||||
- [failure_domain](#failure_domain)
|
||||
- [level_placement](#level_placement)
|
||||
- [raw_placement](#raw_placement)
|
||||
- [max_osd_combinations](#max_osd_combinations)
|
||||
- [block_size](#block_size)
|
||||
- [bitmap_granularity](#bitmap_granularity)
|
||||
|
@ -161,7 +163,7 @@ OSD, PG деактивируется на чтение и запись. Иным
|
|||
Для примера, разница между pg_minsize 2 и 1 в реплицированном пуле с 3 копиями
|
||||
данных (pg_size=3), проявляется следующим образом:
|
||||
- Если 2 сервера отключаются при pg_minsize=2, пул становится неактивным и
|
||||
остаётся неактивным в течение [osd_out_time](monitor.en.md#osd_out_time)
|
||||
остаётся неактивным в течение [osd_out_time](monitor.ru.md#osd_out_time)
|
||||
(10 минут), после чего монитор назначает другие OSD/серверы на замену, пул
|
||||
поднимается и начинает восстанавливать недостающие копии данных. Соответственно,
|
||||
если OSD на замену нет - то есть, если у вас всего 3 сервера с OSD и 2 из них
|
||||
|
@ -169,7 +171,7 @@ OSD, PG деактивируется на чтение и запись. Иным
|
|||
или не добавите хотя бы 1 сервер (или не переключите failure_domain на "osd").
|
||||
- Если 2 сервера отключаются при pg_minsize=1, ввод-вывод лишь приостанавливается
|
||||
на короткое время, до тех пор, пока монитор не поймёт, что OSD отключены
|
||||
(что занимает 5-10 секунд при стандартном [etcd_report_interval](osd.en.md#etcd_report_interval)).
|
||||
(что занимает 5-10 секунд при стандартном [etcd_report_interval](osd.ru.md#etcd_report_interval)).
|
||||
После этого ввод-вывод восстанавливается, но новые данные временно пишутся
|
||||
всего в 1 копии. Когда же проходит osd_out_time, монитор точно так же назначает
|
||||
другие OSD на замену выбывшим и пул начинает восстанавливать копии данных.
|
||||
|
@ -211,6 +213,71 @@ PG в Vitastor эферемерны, то есть вы можете менят
|
|||
Иными словами, домен отказа - это то, от отказа чего вы защищаете себя избыточным
|
||||
хранением.
|
||||
|
||||
## level_placement
|
||||
|
||||
- Тип: строка
|
||||
|
||||
Правила дополнительных доменов отказа, применяемые вместе с failure_domain.
|
||||
Должны задаваться в следующем виде:
|
||||
|
||||
`<уровень>=<последовательность символов>, <уровень2>=<последовательность2>, ...`
|
||||
|
||||
Каждая `<последовательность>` должна состоять ровно из [pg_size](#pg_size) символов.
|
||||
Каждый символ соответствует одному OSD (размещению одной части PG) этого пула.
|
||||
Одинаковые символы означают, что соответствующие части размещаются в один и тот же
|
||||
узел дерева OSD на заданном `<уровне>`. Разные символы означают, что части
|
||||
размещаются в разные узлы.
|
||||
|
||||
Например, если вы хотите сделать пул EC 4+2 и хотите поместить каждые 2 части
|
||||
данных в свой датацентр, и также вы хотите, чтобы каждая часть размещалась на
|
||||
другом хосте, то вы должны задать `level_placement` равным `dc=112233 host=123456`.
|
||||
|
||||
Либо вы просто можете задать `level_placement` равным `dc=112233` и оставить
|
||||
`failure_domain` пустым, т.к. `host` это его значение по умолчанию и оно также
|
||||
применится автоматически.
|
||||
|
||||
Без этого правила может получиться так, что в одном из датацентров окажется
|
||||
3 части данных одной PG и данные окажутся недоступными при временном отключении
|
||||
этого датацентра.
|
||||
|
||||
Естественно, перед установкой правила вам нужно сгруппировать ваши хосты в
|
||||
датацентры, установив [placement_levels](monitor.ru.md#placement_levels) во что-то
|
||||
типа `{"dc":90,"host":100,"osd":110}` и добавив датацентры в [node_placement](#дерево-размещения),
|
||||
примерно так: `{"dc1":{"level":"dc"},"host1":{"parent":"dc1"},...}`.
|
||||
|
||||
## raw_placement
|
||||
|
||||
- Тип: строка
|
||||
|
||||
Низкоуровневые правила генерации PG в форме DSL (доменно-специфичного языка).
|
||||
Используйте, только если действительно знаете, зачем вам это надо :)
|
||||
|
||||
Спецификация DSL:
|
||||
|
||||
```
|
||||
dsl := item | item ("\n" | ",") dsl
|
||||
item := "any" | rules
|
||||
rules := rule | rule rules
|
||||
rule := level operator arg
|
||||
level := /\w+/
|
||||
operator := "!=" | "=" | ">" | "?="
|
||||
arg := value | "(" values ")"
|
||||
values := value | value "," values
|
||||
value := item_ref | constant_id
|
||||
item_ref := /\d+/
|
||||
constant_id := /"([^"]+)"/
|
||||
```
|
||||
|
||||
Оператор "?=" означает "предпочитаемый". Т.е. `dc ?= "meow"` означает "предпочитать
|
||||
датацентр meow для этой части данных, но разместить её в другом датацентре, если
|
||||
meow недоступен".
|
||||
|
||||
Примеры:
|
||||
|
||||
- Простые 3 реплики с failure_domain=host: `any, host!=1, host!=(1,2)`
|
||||
- EC 4+2 в 3 датацентрах: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
|
||||
- 1 копия в фиксированном ДЦ + 2 в других ДЦ: `dc?=meow, dc!=1, dc!=(1,2)`
|
||||
|
||||
## max_osd_combinations
|
||||
|
||||
- Тип: целое число
|
||||
|
|
|
@ -63,3 +63,12 @@
|
|||
"host" и "osd" являются предопределёнными и не могут быть удалены. Если
|
||||
один из них отсутствует в конфигурации, он доопределяется с приоритетом по
|
||||
умолчанию (100 для уровня "host", 101 для "osd").
|
||||
- name: use_old_pg_combinator
|
||||
type: bool
|
||||
default: false
|
||||
info: |
|
||||
Use the old PG combination generator which doesn't support [level_placement](pool.en.md#level_placement)
|
||||
and [raw_placement](pool.en.md#raw_placement) for pools which don't use these features.
|
||||
info_ru: |
|
||||
Использовать старый генератор комбинаций PG, не поддерживающий [level_placement](pool.ru.md#level_placement)
|
||||
и [raw_placement](pool.ru.md#raw_placement) для пулов, которые не используют данные функции.
|
||||
|
|
|
@ -269,6 +269,8 @@ Optional parameters:
|
|||
| `--block_size 128k` | Put pool only on OSDs with this data block size |
|
||||
| `--bitmap_granularity 4k` | Put pool only on OSDs with this logical sector size |
|
||||
| `--immediate_commit none` | Put pool only on OSDs with this or larger immediate_commit (none < small < all) |
|
||||
| `--level_placement <rules>` | Use additional failure domain rules (example: "dc=112233") |
|
||||
| `--raw_placement <rules>` | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
|
||||
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
|
||||
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
|
||||
| `--used_for_fs <name>` | Mark pool as used for VitastorFS with metadata in image <name> |
|
||||
|
|
|
@ -286,6 +286,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
|
|||
| `--block_size 128k` | ...только OSD с данным размером блока |
|
||||
| `--bitmap_granularity 4k` | ...только OSD с данным размером логического сектора |
|
||||
| `--immediate_commit none` | ...только OSD с этим или большим immediate_commit (none < small < all) |
|
||||
| `--level_placement <rules>` | Задать правила дополнительных доменов отказа (пример: "dc=112233") |
|
||||
| `--raw_placement <rules>` | Задать низкоуровневые правила генерации PG ([детали](../config/pool.ru.md#raw_placement)) |
|
||||
| `--primary_affinity_tags tags` | Предпочитать OSD со всеми данными тегами для роли первичных |
|
||||
| `--scrub_interval <time>` | Включить скрабы с заданным интервалом времени (число + единица s/m/h/d/M/y) |
|
||||
| `--pg_stripe_size <number>` | Увеличить блок группировки объектов по PG |
|
||||
|
|
|
@ -2,6 +2,27 @@ const { select_murmur3 } = require('./murmur3.js');
|
|||
|
||||
const NO_OSD = 'Z';
|
||||
|
||||
// Rule-based PG combinator: keeps an indexed OSD tree together with a rule set
// and forwards to random_custom_combinations() / check_custom_combinations().
class RuleCombinator
{
    // osd_tree: object whose values are tree nodes; nodes without a truthy `id` are dropped
    // rules: placement rules handed over to the generator/checker as-is
    // max_combinations: passed to random_custom_combinations() as the combination count
    // ordered: passed through to random_custom_combinations()
    constructor(osd_tree, rules, max_combinations, ordered)
    {
        const nodes_with_id = Object.values(osd_tree).filter(node => node.id);
        this.osd_tree = index_tree(nodes_with_id);
        this.rules = rules;
        this.max_combinations = max_combinations;
        this.ordered = ordered;
    }

    // Generate combinations for the stored tree and rules
    random_combinations()
    {
        const { osd_tree, rules, max_combinations, ordered } = this;
        return random_custom_combinations(osd_tree, rules, max_combinations, ordered);
    }

    // Check the given PGs against the stored tree and rules
    check_combinations(pgs)
    {
        const { osd_tree, rules } = this;
        return check_custom_combinations(osd_tree, rules, pgs);
    }
}
|
||||
|
||||
// Convert alternative "level-index" format to rules
|
||||
// level_index = { [level: string]: string | string[] }
|
||||
// level_sequence = optional, levels from upper to lower, i.e. [ 'dc', 'host' ]
|
||||
|
@ -236,7 +257,7 @@ function random_custom_combinations(osd_tree, rules, count, ordered)
|
|||
}
|
||||
const size = selected.filter(s => s.id !== null).length;
|
||||
max_size = max_size < size ? size : max_size;
|
||||
const pg = selected.map(s => s.id === null ? NO_OSD : s.id);
|
||||
const pg = selected.map(s => s.id === null ? NO_OSD : (0|s.id));
|
||||
if (!ordered)
|
||||
pg.sort();
|
||||
r['pg_'+pg.join('_')] = pg;
|
||||
|
@ -253,7 +274,7 @@ function random_custom_combinations(osd_tree, rules, count, ordered)
|
|||
}
|
||||
const size = selected.filter(s => s.id !== null).length;
|
||||
max_size = max_size < size ? size : max_size;
|
||||
const pg = selected.map(s => s.id === null ? NO_OSD : s.id);
|
||||
const pg = selected.map(s => s.id === null ? NO_OSD : (0|s.id));
|
||||
if (!ordered)
|
||||
pg.sort();
|
||||
r['pg_'+pg.join('_')] = pg;
|
||||
|
@ -376,6 +397,7 @@ function check_custom_combinations(osd_tree, rules, pgs)
|
|||
}
|
||||
|
||||
module.exports = {
|
||||
RuleCombinator,
|
||||
NO_OSD,
|
||||
|
||||
index_tree,
|
||||
|
|
90
mon/mon.js
90
mon/mon.js
|
@ -6,7 +6,8 @@ const http = require('http');
|
|||
const crypto = require('crypto');
|
||||
const os = require('os');
|
||||
const WebSocket = require('ws');
|
||||
const { SimpleCombinator } = require('./simple_pgs.js');
|
||||
const { RuleCombinator, parse_level_indexes, parse_pg_dsl } = require('./dsl_pgs.js');
|
||||
const { SimpleCombinator, flatten_tree } = require('./simple_pgs.js');
|
||||
const LPOptimizer = require('./lp-optimizer.js');
|
||||
const stableStringify = require('./stable-stringify.js');
|
||||
const PGUtil = require('./PGUtil.js');
|
||||
|
@ -64,6 +65,7 @@ const etcd_tree = {
|
|||
mon_stats_timeout: 1000, // ms. min: 100
|
||||
osd_out_time: 600, // seconds. min: 0
|
||||
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
|
||||
use_old_pg_combinator: false,
|
||||
// client and osd
|
||||
tcp_header_buffer_size: 65536,
|
||||
use_sync_send_recv: false,
|
||||
|
@ -186,7 +188,12 @@ const etcd_tree = {
|
|||
// number of parity chunks, required for EC
|
||||
parity_chunks?: 1,
|
||||
pg_count: 100,
|
||||
failure_domain: 'host',
|
||||
// default is failure_domain=host
|
||||
failure_domain?: 'host',
|
||||
// additional failure domain rules; failure_domain=x is equivalent to x=123..N
|
||||
level_placement?: 'dc=112233 host=123456',
|
||||
raw_placement?: 'any, dc=1 host!=1, dc=1 host!=(1,2)',
|
||||
old_combinator: false,
|
||||
max_osd_combinations: 10000,
|
||||
// block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
|
||||
block_size: 131072,
|
||||
|
@ -1096,7 +1103,6 @@ class Mon
|
|||
pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize);
|
||||
pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
|
||||
pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
|
||||
pool_cfg.failure_domain = pool_cfg.failure_domain || 'host';
|
||||
pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
|
||||
if (!/^[1-9]\d*$/.exec(''+pool_id))
|
||||
{
|
||||
|
@ -1176,6 +1182,10 @@ class Mon
|
|||
console.log('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)');
|
||||
return false;
|
||||
}
|
||||
if (!this.get_pg_rules(pool_id, pool_cfg, true))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -1249,6 +1259,74 @@ class Mon
|
|||
return aff_osds;
|
||||
}
|
||||
|
||||
get_pg_rules(pool_id, pool_cfg, warn)
|
||||
{
|
||||
if (pool_cfg.level_placement)
|
||||
{
|
||||
const pg_size = (0|pool_cfg.pg_size);
|
||||
let rules = pool_cfg.level_placement;
|
||||
if (typeof rules === 'string')
|
||||
{
|
||||
rules = rules.split(/\s+/).map(s => s.split(/=/, 2)).reduce((a, c) => { a[c[0]] = c[1]; return a; }, {});
|
||||
}
|
||||
else
|
||||
{
|
||||
rules = { ...rules };
|
||||
}
|
||||
// Always add failure_domain to prevent rules from being totally incorrect
|
||||
const all_diff = [];
|
||||
for (let i = 1; i <= pg_size; i++)
|
||||
{
|
||||
all_diff.push(i);
|
||||
}
|
||||
rules[pool_cfg.failure_domain || 'host'] = all_diff;
|
||||
const levels = this.config.placement_levels||{};
|
||||
levels.host = levels.host || 100;
|
||||
levels.osd = levels.osd || 101;
|
||||
for (const k in rules)
|
||||
{
|
||||
if (!levels[k] || typeof rules[k] !== 'string' &&
|
||||
(!rules[k] instanceof Array ||
|
||||
rules[k].filter(s => typeof s !== 'string' && typeof s !== 'number').length > 0))
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' configuration is invalid: level_placement should be { [level]: string | (string|number)[] }');
|
||||
return null;
|
||||
}
|
||||
else if (rules[k].length != pg_size)
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' configuration is invalid: values in level_placement should contain exactly pg_size ('+pg_size+') items');
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return parse_level_indexes(rules);
|
||||
}
|
||||
else if (typeof pool_cfg.raw_placement === 'string')
|
||||
{
|
||||
try
|
||||
{
|
||||
return parse_pg_dsl(pool_cfg.raw_placement);
|
||||
}
|
||||
catch (e)
|
||||
{
|
||||
if (warn)
|
||||
console.log('Pool '+pool_id+' configuration is invalid: invalid raw_placement: '+e.message);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
let rules = [ [] ];
|
||||
let prev = [ 1 ];
|
||||
for (let i = 1; i < pool_cfg.pg_size; i++)
|
||||
{
|
||||
rules.push([ [ pool_cfg.failure_domain||'host', '!=', prev ] ]);
|
||||
prev = [ ...prev, i+1 ];
|
||||
}
|
||||
return rules;
|
||||
}
|
||||
}
|
||||
|
||||
async generate_pool_pgs(pool_id, osd_tree, levels)
|
||||
{
|
||||
const pool_cfg = this.state.config.pools[pool_id];
|
||||
|
@ -1282,7 +1360,11 @@ class Mon
|
|||
const old_pg_count = prev_pgs.length;
|
||||
const optimize_cfg = {
|
||||
osd_weights: Object.values(pool_tree).filter(item => item.level === 'osd').reduce((a, c) => { a[c.id] = c.size; return a; }, {}),
|
||||
combinator: new SimpleCombinator(flatten_tree(osd_tree, levels, pool_cfg.failure_domain, 'osd'), pool_cfg.pg_size, pool_cfg.max_osd_combinations),
|
||||
combinator: !this.config.use_old_pg_combinator || pool_cfg.level_placement || pool_cfg.raw_placement
|
||||
// new algorithm:
|
||||
? new RuleCombinator(osd_tree, this.get_pg_rules(pool_id, pool_cfg), pool_cfg.max_osd_combinations)
|
||||
// old algorithm:
|
||||
: new SimpleCombinator(flatten_tree(osd_tree[''].children, levels, pool_cfg.failure_domain, 'osd'), pool_cfg.pg_size, pool_cfg.max_osd_combinations),
|
||||
pg_count: pool_cfg.pg_count,
|
||||
pg_size: pool_cfg.pg_size,
|
||||
pg_minsize: pool_cfg.pg_minsize,
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
const { select_murmur3 } = require('./murmur3.js');
|
||||
|
||||
const NO_OSD = 'Z';
|
||||
|
||||
class SimpleCombinator
|
||||
|
@ -61,14 +63,6 @@ function extract_osds(osd_tree, levels, osd_level, osds = {})
|
|||
// ordered = don't treat (x,y) and (y,x) as equal
|
||||
function random_combinations(osd_tree, pg_size, count, ordered)
|
||||
{
|
||||
let seed = 0x5f020e43;
|
||||
let rng = () =>
|
||||
{
|
||||
seed ^= seed << 13;
|
||||
seed ^= seed >> 17;
|
||||
seed ^= seed << 5;
|
||||
return seed + 2147483648;
|
||||
};
|
||||
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
|
||||
const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
|
||||
const r = {};
|
||||
|
@ -82,8 +76,8 @@ function random_combinations(osd_tree, pg_size, count, ordered)
|
|||
cur_hosts.splice(h, 1);
|
||||
for (let i = 1; i < pg_size && i < hosts.length; i++)
|
||||
{
|
||||
const next_host = rng() % cur_hosts.length;
|
||||
const next_osd = rng() % osds[cur_hosts[next_host]].length;
|
||||
const next_host = select_murmur3(cur_hosts.length, i => pg[0]+':i:'+cur_hosts[i]);
|
||||
const next_osd = select_murmur3(osds[cur_hosts[next_host]].length, i => pg[0]+':i:'+osds[cur_hosts[next_host]][i]);
|
||||
pg.push(osds[cur_hosts[next_host]][next_osd]);
|
||||
cur_hosts.splice(next_host, 1);
|
||||
}
|
||||
|
@ -104,7 +98,7 @@ function random_combinations(osd_tree, pg_size, count, ordered)
|
|||
{
|
||||
for (let i = 0; i < max_hosts; i++)
|
||||
{
|
||||
const r = rng() % cur_hosts.length;
|
||||
const r = select_murmur3(cur_hosts.length, i => count+':h:'+cur_hosts[i]);
|
||||
host_idx[i] = cur_hosts[r];
|
||||
cur_hosts.splice(r, 1);
|
||||
}
|
||||
|
@ -113,12 +107,12 @@ function random_combinations(osd_tree, pg_size, count, ordered)
|
|||
{
|
||||
for (let i = 0; i < max_hosts; i++)
|
||||
{
|
||||
const r = rng() % (cur_hosts.length - (max_hosts - i - 1));
|
||||
const r = select_murmur3(cur_hosts.length - (max_hosts - i - 1), i => count+':h:'+cur_hosts[i]);
|
||||
host_idx[i] = cur_hosts[r];
|
||||
cur_hosts.splice(0, r+1);
|
||||
}
|
||||
}
|
||||
let pg = host_idx.map(h => osds[hosts[h]][rng() % osds[hosts[h]].length]);
|
||||
let pg = host_idx.map(h => osds[hosts[h]][select_murmur3(osds[hosts[h]].length, i => count+':o:'+osds[hosts[h]][i])]);
|
||||
while (pg.length < pg_size)
|
||||
{
|
||||
pg.push(NO_OSD);
|
||||
|
|
|
@ -101,4 +101,18 @@ check(
|
|||
[ 'pg_1_2', 'pg_1_3', 'pg_2_3' ]
|
||||
);
|
||||
|
||||
check(
|
||||
Object.keys(random_custom_combinations(index_tree([
|
||||
{ id: 'h1', level: 'host' },
|
||||
{ id: 'h2', level: 'host' },
|
||||
{ id: 'h3', level: 'host' },
|
||||
{ id: '1', size: 1, level: 'osd', parent: 'h1' },
|
||||
{ id: '2', size: 1, level: 'osd', parent: 'h2' },
|
||||
{ id: '3', size: 1, level: 'osd', parent: 'h2' },
|
||||
{ id: '4', size: 1, level: 'osd', parent: 'h3' },
|
||||
{ id: '5', size: 1, level: 'osd', parent: 'h3' },
|
||||
]), parse_level_indexes({ host: '1122', osd: '1234' }), 10000)).sort(),
|
||||
[ 'pg_2_3_4_5' ]
|
||||
);
|
||||
|
||||
console.log('OK');
|
||||
|
|
|
@ -129,6 +129,8 @@ static const char* help_text =
|
|||
" --block_size 128k Put pool only on OSDs with this data block size\n"
|
||||
" --bitmap_granularity 4k Put pool only on OSDs with this logical sector size\n"
|
||||
" --immediate_commit none Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
|
||||
" --level_placement <rules> Use additional failure domain rules (example: \"dc=112233\")\n"
|
||||
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
|
||||
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
|
||||
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
|
||||
" --used_for_fs <name> Mark pool as used for VitastorFS with metadata in image <name>\n"
|
||||
|
@ -145,6 +147,7 @@ static const char* help_text =
|
|||
" [-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]\n"
|
||||
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_fs <name>]\n"
|
||||
" [--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]\n"
|
||||
" [--level_placement <rules>] [--raw_placement <rules>]\n"
|
||||
" Non-modifiable parameters (changing them WILL lead to data loss):\n"
|
||||
" [--block_size <size>] [--bitmap_granularity <size>]\n"
|
||||
" [--immediate_commit <all|small|none>] [--pg_stripe_size <size>]\n"
|
||||
|
|
|
@ -82,9 +82,38 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
|||
value = value.uint64_value();
|
||||
}
|
||||
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
|
||||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs")
|
||||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs" ||
|
||||
key == "raw_placement")
|
||||
{
|
||||
// OK
|
||||
if (!value.is_string())
|
||||
{
|
||||
return key+" must be a string";
|
||||
}
|
||||
}
|
||||
else if (key == "level_placement")
|
||||
{
|
||||
// level=rule, level=rule, ...
|
||||
if (!value.is_object())
|
||||
{
|
||||
json11::Json::object obj;
|
||||
for (auto & item: explode(",", value.string_value(), true))
|
||||
{
|
||||
auto pair = explode("=", item, true);
|
||||
if (pair.size() >= 2)
|
||||
{
|
||||
obj[pair[0]] = pair[1];
|
||||
}
|
||||
}
|
||||
if (obj.size())
|
||||
{
|
||||
value = obj;
|
||||
}
|
||||
else
|
||||
{
|
||||
new_cfg.erase(kv_it++);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (key == "osd_tags" || key == "primary_affinity_tags")
|
||||
{
|
||||
|
@ -184,6 +213,38 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
|
|||
return "PG size can't be greater than 256";
|
||||
}
|
||||
|
||||
// PG rules
|
||||
if (!cfg["level_placement"].is_null())
|
||||
{
|
||||
for (auto & lr: cfg["level_placement"].object_items())
|
||||
{
|
||||
int len = 0;
|
||||
if (lr.second.is_array())
|
||||
{
|
||||
for (auto & lri: lr.second.array_items())
|
||||
{
|
||||
if (!lri.is_string() && !lri.is_number())
|
||||
{
|
||||
return "--level_placement contains an array with non-scalar value: "+lri.dump();
|
||||
}
|
||||
}
|
||||
len = lr.second.array_items().size();
|
||||
}
|
||||
else if (!lr.second.is_string())
|
||||
{
|
||||
return "--level_placement contains a non-array and non-string value: "+lr.second.dump();
|
||||
}
|
||||
else
|
||||
{
|
||||
len = lr.second.string_value().size();
|
||||
}
|
||||
if (len != pg_size)
|
||||
{
|
||||
return "values in --level_placement should be exactly pg_size ("+std::to_string(pg_size)+") long";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parity_chunks
|
||||
uint64_t parity_chunks = 1;
|
||||
if (scheme == POOL_SCHEME_EC)
|
||||
|
|
Loading…
Reference in New Issue