Compare commits

..

9 Commits

Author SHA1 Message Date
Vitaliy Filippov 59d5ba52a9 Plug the new PG combinator into monitor
Test / test_minsize_1 (push) Failing after 2m15s Details
Test / test_rm (push) Failing after 2m15s Details
Test / test_snapshot_chain (push) Failing after 2m15s Details
Test / test_snapshot_chain_ec (push) Failing after 2m15s Details
Test / test_snapshot_down (push) Failing after 2m16s Details
Test / test_snapshot_down_ec (push) Failing after 2m14s Details
Test / test_splitbrain (push) Failing after 2m16s Details
Test / test_rebalance_verify (push) Failing after 2m16s Details
Test / test_rebalance_verify_imm (push) Failing after 2m16s Details
Test / test_rebalance_verify_ec (push) Failing after 2m15s Details
Test / test_rebalance_verify_ec_imm (push) Failing after 2m15s Details
Test / test_switch_primary (push) Failing after 2m15s Details
Test / test_write (push) Failing after 2m15s Details
Test / test_write_xor (push) Failing after 2m15s Details
Test / test_write_no_same (push) Failing after 2m15s Details
Test / test_heal_pg_size_2 (push) Failing after 2m16s Details
Test / test_heal_ec (push) Failing after 2m15s Details
Test / test_heal_csum_32k_dmj (push) Failing after 2m15s Details
Test / test_heal_csum_32k_dj (push) Failing after 2m15s Details
Test / test_heal_csum_32k (push) Failing after 2m15s Details
Test / test_heal_csum_4k_dmj (push) Failing after 2m15s Details
Test / test_heal_csum_4k_dj (push) Failing after 2m14s Details
Test / test_heal_csum_4k (push) Failing after 2m15s Details
Test / test_scrub (push) Failing after 2m16s Details
Test / test_scrub_zero_osd_2 (push) Failing after 2m15s Details
Test / test_scrub_xor (push) Failing after 2m15s Details
Test / test_scrub_pg_size_3 (push) Failing after 2m15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Failing after 2m16s Details
Test / test_scrub_ec (push) Failing after 2m15s Details
Test / test_nfs (push) Failing after 2m15s Details
2024-04-07 00:43:41 +03:00
Vitaliy Filippov 29284bef40 Implement new DSL/rule-based PG generation algorithm 2024-04-07 00:36:20 +03:00
Vitaliy Filippov 6a924d6066 Extract PG combinator into a separate module 2024-04-07 00:36:20 +03:00
Vitaliy Filippov 9fe779a691 Do not die on invalid pool configurations
Test / test_rm (push) Successful in 16s Details
Test / test_move_reappear (push) Successful in 24s Details
Test / test_snapshot_down (push) Successful in 26s Details
Test / test_snapshot_down_ec (push) Successful in 31s Details
Test / test_splitbrain (push) Successful in 17s Details
Test / test_snapshot_chain (push) Successful in 2m34s Details
Test / test_snapshot_chain_ec (push) Successful in 3m12s Details
Test / test_rebalance_verify_imm (push) Successful in 2m59s Details
Test / test_rebalance_verify (push) Successful in 3m27s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write (push) Successful in 55s Details
Test / test_write_xor (push) Successful in 54s Details
Test / test_write_no_same (push) Successful in 15s Details
Test / test_rebalance_verify_ec (push) Successful in 4m37s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 4m8s Details
Test / test_heal_pg_size_2 (push) Successful in 3m48s Details
Test / test_heal_ec (push) Successful in 3m47s Details
Test / test_heal_csum_32k_dmj (push) Successful in 6m8s Details
Test / test_heal_csum_32k_dj (push) Successful in 6m18s Details
Test / test_heal_csum_32k (push) Successful in 7m9s Details
Test / test_heal_csum_4k_dmj (push) Successful in 7m7s Details
Test / test_scrub (push) Successful in 1m9s Details
Test / test_scrub_zero_osd_2 (push) Successful in 1m8s Details
Test / test_scrub_xor (push) Successful in 1m7s Details
Test / test_heal_csum_4k_dj (push) Successful in 6m20s Details
Test / test_heal_csum_4k (push) Successful in 5m58s Details
Test / test_scrub_pg_size_3 (push) Successful in 2m9s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m4s Details
Test / test_nfs (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 21s Details
2024-04-07 00:36:20 +03:00
Vitaliy Filippov 31c2751b9b Move NBD/VDUSE map/unmap functions to a separate file 2024-04-07 00:36:09 +03:00
Vitaliy Filippov c5195666cd Fix journal sequencing: make each journal write wait for all previous journal writes
Test / test_snapshot_ec (push) Successful in 31s Details
Test / test_rm (push) Successful in 15s Details
Test / test_snapshot_down (push) Successful in 30s Details
Test / test_snapshot_down_ec (push) Successful in 32s Details
Test / test_splitbrain (push) Successful in 25s Details
Test / test_snapshot_chain (push) Successful in 2m11s Details
Test / test_snapshot_chain_ec (push) Successful in 3m1s Details
Test / test_rebalance_verify_imm (push) Successful in 2m35s Details
Test / test_rebalance_verify (push) Successful in 3m10s Details
Test / test_switch_primary (push) Successful in 39s Details
Test / test_write (push) Successful in 43s Details
Test / test_write_no_same (push) Successful in 18s Details
Test / test_write_xor (push) Successful in 1m3s Details
Test / test_rebalance_verify_ec (push) Successful in 4m38s Details
Test / test_heal_pg_size_2 (push) Successful in 3m22s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 5m11s Details
Test / test_heal_ec (push) Successful in 4m23s Details
Test / test_heal_csum_32k_dmj (push) Successful in 4m55s Details
Test / test_heal_csum_32k_dj (push) Successful in 6m31s Details
Test / test_heal_csum_32k (push) Successful in 6m29s Details
Test / test_heal_csum_4k_dmj (push) Successful in 7m18s Details
Test / test_scrub_zero_osd_2 (push) Successful in 1m0s Details
Test / test_scrub (push) Failing after 3m19s Details
Test / test_heal_csum_4k_dj (push) Successful in 6m39s Details
Test / test_scrub_xor (push) Successful in 58s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m13s Details
Test / test_scrub_ec (push) Successful in 50s Details
Test / test_scrub_pg_size_3 (push) Successful in 1m51s Details
Test / test_heal_csum_4k (push) Successful in 5m13s Details
Test / test_nfs (push) Successful in 23s Details
2024-04-06 23:53:12 +03:00
Vitaliy Filippov f36d7eb76c Fix monitor thinking that OSD weight is 0 after deleting /osd/config/ key
Test / test_rm (push) Successful in 16s Details
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m30s Details
Test / test_snapshot_down (push) Successful in 31s Details
Test / test_snapshot_down_ec (push) Successful in 32s Details
Test / test_splitbrain (push) Successful in 23s Details
Test / test_snapshot_chain (push) Successful in 2m13s Details
Test / test_snapshot_chain_ec (push) Successful in 3m1s Details
Test / test_rebalance_verify_imm (push) Successful in 3m22s Details
Test / test_rebalance_verify (push) Successful in 4m5s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write (push) Successful in 52s Details
Test / test_write_xor (push) Successful in 52s Details
Test / test_write_no_same (push) Successful in 14s Details
Test / test_rebalance_verify_ec (push) Successful in 4m41s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 3m51s Details
Test / test_heal_pg_size_2 (push) Successful in 3m55s Details
Test / test_heal_ec (push) Successful in 4m35s Details
Test / test_heal_csum_32k_dmj (push) Successful in 6m0s Details
Test / test_heal_csum_32k_dj (push) Successful in 5m51s Details
Test / test_heal_csum_32k (push) Successful in 6m48s Details
Test / test_heal_csum_4k_dmj (push) Successful in 7m7s Details
Test / test_scrub (push) Successful in 1m36s Details
Test / test_scrub_zero_osd_2 (push) Successful in 1m20s Details
Test / test_scrub_xor (push) Successful in 56s Details
Test / test_heal_csum_4k_dj (push) Successful in 6m39s Details
Test / test_heal_csum_4k (push) Successful in 6m37s Details
Test / test_nfs (push) Successful in 18s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 47s Details
Test / test_scrub_ec (push) Successful in 27s Details
Test / test_scrub_pg_size_3 (push) Successful in 1m3s Details
2024-04-05 23:14:46 +03:00
Vitaliy Filippov dd7f651de1 Add --max-request-bytes=104857600 to etcd params in tests 2024-04-05 23:14:46 +03:00
Vitaliy Filippov a2994ecd0d Fix flusher possibly not trimming journal on rollback 2024-04-05 23:14:39 +03:00
18 changed files with 340 additions and 46 deletions

View File

@ -15,6 +15,7 @@ These parameters only apply to Monitors.
- [mon_stats_timeout](#mon_stats_timeout)
- [osd_out_time](#osd_out_time)
- [placement_levels](#placement_levels)
- [use_old_pg_combinator](#use_old_pg_combinator)
## etcd_mon_ttl
@ -77,3 +78,11 @@ values. Smaller priority means higher level in tree. For example,
levels are always predefined and can't be removed. If one of them is not
present in the configuration, then it is defined with the default priority
(100 for "host", 101 for "osd").
## use_old_pg_combinator
- Type: boolean
- Default: false
Use the old PG combination generator which doesn't support [level_placement](pool.en.md#level_placement)
and [raw_placement](pool.en.md#raw_placement) for pools which don't use this features.

View File

@ -15,6 +15,7 @@
- [mon_stats_timeout](#mon_stats_timeout)
- [osd_out_time](#osd_out_time)
- [placement_levels](#placement_levels)
- [use_old_pg_combinator](#use_old_pg_combinator)
## etcd_mon_ttl
@ -78,3 +79,11 @@ OSD перед обновлением агрегированной статис
"host" и "osd" являются предопределёнными и не могут быть удалены. Если
один из них отсутствует в конфигурации, он доопределяется с приоритетом по
умолчанию (100 для уровня "host", 101 для "osd").
## use_old_pg_combinator
- Тип: булево (да/нет)
- Значение по умолчанию: false
Использовать старый генератор комбинаций PG, не поддерживающий [level_placement](pool.ru.md#level_placement)
и [raw_placement](pool.ru.md#raw_placement) для пулов, которые не используют данные функции.

View File

@ -32,6 +32,8 @@ Parameters:
- [pg_minsize](#pg_minsize)
- [pg_count](#pg_count)
- [failure_domain](#failure_domain)
- [level_placement](#level_placement)
- [raw_placement](#raw_placement)
- [max_osd_combinations](#max_osd_combinations)
- [block_size](#block_size)
- [bitmap_granularity](#bitmap_granularity)
@ -209,6 +211,69 @@ never put on OSDs in the same failure domain (for example, on the same host).
So failure domain specifies the unit which failure you are protecting yourself
from.
## level_placement
- Type: string
Additional failure domain rules, applied in conjuction with failure_domain.
Must be specified in the following form:
`<placement level>=<sequence of characters>, <level2>=<sequence2>, ...`
Sequence should be exactly [pg_size](#pg_size) character long. Each character
corresponds to an OSD in the PG of this pool. Equal characters mean that
corresponding items of the PG should be placed into the same placement tree
item at this level. Different characters mean that items should be placed into
different items.
For example, if you want a EC 4+2 pool and you want every 2 chunks to be stored
in its own datacenter and you also want each chunk to be stored on a different
host, you should set `level_placement` to `dc=112233 host=123456`.
Or you can set `level_placement` to `dc=112233` and leave `failure_domain` empty,
because `host` is the default `failure_domain` and it will be applied anyway.
Without this rule, it may happen that 3 chunks will be stored on OSDs in the
same datacenter, and the data will become inaccessibly if that datacenter goes
down in this case.
Of course, you should group your hosts into datacenters before applying the rule
by setting [placement_levels](monitor.en.md#placement_levels) to something like
`{"dc":90,"host":100,"osd":110}` and add DCs to [node_placement](#placement-tree),
like `{"dc1":{"level":"dc"},"host1":{"parent":"dc1"},...}`.
## raw_placement
- Type: string
Raw PG placement rules, specified in the form of a DSL (domain-specific language).
Use only if you really know what you're doing :)
DSL specification:
```
dsl := item | item ("\n" | ",") items
item := "any" | rules
rules := rule | rule rules
rule := level operator arg
level := /\w+/
operator := "!=" | "=" | ">" | "?="
arg := value | "(" values ")"
values := value | value "," values
value := item_ref | constant_id
item_ref := /\d+/
constant_id := /"([^"]+)"/
```
"?=" operator means "preferred". I.e. `dc ?= "meow"` means "prefer datacenter meow
for this chunk, but put into another dc if it's unavailable".
Examples:
- Simple 3 replicas with failure_domain=host: `any, host!=1, host!=(1,2)`
- EC 4+2 in 3 DC: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
- 1 replica in fixed DC + 2 in random DCs: `dc?=meow, dc!=1, dc!=(1,2)`
## max_osd_combinations
- Type: integer

View File

@ -31,6 +31,8 @@
- [pg_minsize](#pg_minsize)
- [pg_count](#pg_count)
- [failure_domain](#failure_domain)
- [level_placement](#level_placement)
- [raw_placement](#raw_placement)
- [max_osd_combinations](#max_osd_combinations)
- [block_size](#block_size)
- [bitmap_granularity](#bitmap_granularity)
@ -161,7 +163,7 @@ OSD, PG деактивируется на чтение и запись. Иным
Для примера, разница между pg_minsize 2 и 1 в реплицированном пуле с 3 копиями
данных (pg_size=3), проявляется следующим образом:
- Если 2 сервера отключаются при pg_minsize=2, пул становится неактивным и
остаётся неактивным в течение [osd_out_time](monitor.en.md#osd_out_time)
остаётся неактивным в течение [osd_out_time](monitor.ru.md#osd_out_time)
(10 минут), после чего монитор назначает другие OSD/серверы на замену, пул
поднимается и начинает восстанавливать недостающие копии данных. Соответственно,
если OSD на замену нет - то есть, если у вас всего 3 сервера с OSD и 2 из них
@ -169,7 +171,7 @@ OSD, PG деактивируется на чтение и запись. Иным
или не добавите хотя бы 1 сервер (или не переключите failure_domain на "osd").
- Если 2 сервера отключаются при pg_minsize=1, ввод-вывод лишь приостанавливается
на короткое время, до тех пор, пока монитор не поймёт, что OSD отключены
(что занимает 5-10 секунд при стандартном [etcd_report_interval](osd.en.md#etcd_report_interval)).
(что занимает 5-10 секунд при стандартном [etcd_report_interval](osd.ru.md#etcd_report_interval)).
После этого ввод-вывод восстанавливается, но новые данные временно пишутся
всего в 1 копии. Когда же проходит osd_out_time, монитор точно так же назначает
другие OSD на замену выбывшим и пул начинает восстанавливать копии данных.
@ -211,6 +213,71 @@ PG в Vitastor эферемерны, то есть вы можете менят
Иными словами, домен отказа - это то, от отказа чего вы защищаете себя избыточным
хранением.
## level_placement
- Тип: строка
Правила дополнительных доменов отказа, применяемые вместе с failure_domain.
Должны задаваться в следующем виде:
`<уровень>=<последовательность символов>, <уровень2>=<последовательность2>, ...`
Каждая `<последовательность>` должна состоять ровно из [pg_size](#pg_size) символов.
Каждый символ соответствует одному OSD (размещению одной части PG) этого пула.
Одинаковые символы означают, что соответствующие части размещаются в один и тот же
узел дерева OSD на заданном `<уровне>`. Разные символы означают, что части
размещаются в разные узлы.
Например, если вы хотите сделать пул EC 4+2 и хотите поместить каждые 2 части
данных в свой датацентр, и также вы хотите, чтобы каждая часть размещалась на
другом хосте, то вы должны задать `level_placement` равным `dc=112233 host=123456`.
Либо вы просто можете задать `level_placement` равным `dc=112233` и оставить
`failure_domain` пустым, т.к. `host` это его значение по умолчанию и оно также
применится автоматически.
Без этого правила может получиться так, что в одном из датацентров окажется
3 части данных одной PG и данные окажутся недоступными при временном отключении
этого датацентра.
Естественно, перед установкой правила вам нужно сгруппировать ваши хосты в
датацентры, установив [placement_levels](monitor.ru.md#placement_levels) во что-то
типа `{"dc":90,"host":100,"osd":110}` и добавив датацентры в [node_placement](#дерево-размещения),
примерно так: `{"dc1":{"level":"dc"},"host1":{"parent":"dc1"},...}`.
## raw_placement
- Type: string
Низкоуровневые правила генерации PG в форме DSL (доменно-специфичного языка).
Используйте, только если действительно знаете, зачем вам это надо :)
Спецификация DSL:
```
dsl := item | item ("\n" | ",") items
item := "any" | rules
rules := rule | rule rules
rule := level operator arg
level := /\w+/
operator := "!=" | "=" | ">" | "?="
arg := value | "(" values ")"
values := value | value "," values
value := item_ref | constant_id
item_ref := /\d+/
constant_id := /"([^"]+)"/
```
Оператор "?=" означает "предпочитаемый". Т.е. `dc ?= "meow"` означает "предпочитать
датацентр meow для этой части данных, но разместить её в другом датацентре, если
meow недоступен".
Примеры:
- Простые 3 реплики с failure_domain=host: `any, host!=1, host!=(1,2)`
- EC 4+2 в 3 датацентрах: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
- 1 копия в фиксированном ДЦ + 2 в других ДЦ: `dc?=meow, dc!=1, dc!=(1,2)`
## max_osd_combinations
- Тип: целое число

View File

@ -63,3 +63,12 @@
"host" и "osd" являются предопределёнными и не могут быть удалены. Если
один из них отсутствует в конфигурации, он доопределяется с приоритетом по
умолчанию (100 для уровня "host", 101 для "osd").
- name: use_old_pg_combinator
type: bool
default: false
info: |
Use the old PG combination generator which doesn't support [level_placement](pool.en.md#level_placement)
and [raw_placement](pool.en.md#raw_placement) for pools which don't use this features.
info_ru: |
Использовать старый генератор комбинаций PG, не поддерживающий [level_placement](pool.ru.md#level_placement)
и [raw_placement](pool.ru.md#raw_placement) для пулов, которые не используют данные функции.

View File

@ -269,6 +269,8 @@ Optional parameters:
| `--block_size 128k` | Put pool only on OSDs with this data block size |
| `--bitmap_granularity 4k` | Put pool only on OSDs with this logical sector size |
| `--immediate_commit none` | Put pool only on OSDs with this or larger immediate_commit (none < small < all) |
| `--level_placement <rules>` | Use additional failure domain rules (example: "dc=112233") |
| `--raw_placement <rules>` | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
| `--used_for_fs <name>` | Mark pool as used for VitastorFS with metadata in image <name> |

View File

@ -286,6 +286,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
| `--block_size 128k` | ...только OSD с данным размером блока |
| `--bitmap_granularity 4k` | ...только OSD с данным размером логического сектора |
| `--immediate_commit none` | ...только OSD с этим или большим immediate_commit (none < small < all) |
| `--level_placement <rules>` | Задать правила дополнительных доменов отказа (пример: "dc=112233") |
| `--raw_placement <rules>` | Задать низкоуровневые правила генерации PG ([детали](../config/pool.ru.md#raw_placement)) |
| `--primary_affinity_tags tags` | Предпочитать OSD со всеми данными тегами для роли первичных |
| `--scrub_interval <time>` | Включить скрабы с заданным интервалом времени (число + единица s/m/h/d/M/y) |
| `--pg_stripe_size <number>` | Увеличить блок группировки объектов по PG |

View File

@ -257,7 +257,7 @@ function random_custom_combinations(osd_tree, rules, count, ordered)
}
const size = selected.filter(s => s.id !== null).length;
max_size = max_size < size ? size : max_size;
const pg = selected.map(s => s.id === null ? NO_OSD : s.id);
const pg = selected.map(s => s.id === null ? NO_OSD : (0|s.id));
if (!ordered)
pg.sort();
r['pg_'+pg.join('_')] = pg;
@ -274,7 +274,7 @@ function random_custom_combinations(osd_tree, rules, count, ordered)
}
const size = selected.filter(s => s.id !== null).length;
max_size = max_size < size ? size : max_size;
const pg = selected.map(s => s.id === null ? NO_OSD : s.id);
const pg = selected.map(s => s.id === null ? NO_OSD : (0|s.id));
if (!ordered)
pg.sort();
r['pg_'+pg.join('_')] = pg;

View File

@ -65,7 +65,7 @@ const etcd_tree = {
mon_stats_timeout: 1000, // ms. min: 100
osd_out_time: 600, // seconds. min: 0
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
force_new_pg_combinator: false,
use_old_pg_combinator: false,
// client and osd
tcp_header_buffer_size: 65536,
use_sync_send_recv: false,
@ -190,8 +190,10 @@ const etcd_tree = {
pg_count: 100,
// default is failure_domain=host
failure_domain?: 'host',
level_rules?: { host: '123' },
pg_rules?: 'any, dc=1 host!=1, dc=1 host!=(1,2)',
// additional failure domain rules; failure_domain=x is equivalent to x=123..N
level_placement?: 'dc=112233 host=123456',
raw_placement?: 'any, dc=1 host!=1, dc=1 host!=(1,2)',
old_combinator: false,
max_osd_combinations: 10000,
// block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
block_size: 131072,
@ -892,7 +894,7 @@ class Mon
{
// Numeric IDs are reserved for OSDs
const osd_cfg = this.state.config.osd[osd_num];
let reweight = osd_cfg && Number(osd_cfg.reweight);
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
if (reweight < 0 || isNaN(reweight))
reweight = 1;
if (this.state.osd.state[osd_num] && reweight > 0)
@ -1259,33 +1261,57 @@ class Mon
get_pg_rules(pool_id, pool_cfg, warn)
{
if (pool_cfg.level_rules instanceof Object)
if (pool_cfg.level_placement)
{
const pg_size = (0|pool_cfg.pg_size);
let rules = pool_cfg.level_placement;
if (typeof rules === 'string')
{
rules = rules.split(/\s+/).map(s => s.split(/=/, 2)).reduce((a, c) => { a[c[0]] = c[1]; return a; }, {});
}
else
{
rules = { ...rules };
}
// Always add failure_domain to prevent rules from being totally incorrect
const all_diff = [];
for (let i = 1; i <= pg_size; i++)
{
all_diff.push(i);
}
rules[pool_cfg.failure_domain || 'host'] = all_diff;
const levels = this.config.placement_levels||{};
levels.host = levels.host || 100;
levels.osd = levels.osd || 101;
for (const k in pool_cfg.level_rules)
for (const k in rules)
{
if (!levels[k] || typeof pool_cfg.level_rules[k] !== 'string' &&
(!pool_cfg.level_rules[k] instanceof Array || pool_cfg.level_rules[k].filter(s => typeof s !== 'string').length > 0))
if (!levels[k] || typeof rules[k] !== 'string' &&
(!rules[k] instanceof Array ||
rules[k].filter(s => typeof s !== 'string' && typeof s !== 'number').length > 0))
{
if (warn)
console.log('Pool '+pool_id+' configuration is invalid: level_rules should be { [level]: string or array of strings }');
console.log('Pool '+pool_id+' configuration is invalid: level_placement should be { [level]: string | (string|number)[] }');
return null;
}
else if (rules[k].length != pg_size)
{
if (warn)
console.log('Pool '+pool_id+' configuration is invalid: values in level_placement should contain exactly pg_size ('+pg_size+') items');
return null;
}
}
return parse_level_indexes(pool_cfg.level_rules);
return parse_level_indexes(rules);
}
else if (typeof pool_cfg.pg_rules === 'string')
else if (typeof pool_cfg.raw_placement === 'string')
{
try
{
return parse_pg_dsl(pool_cfg.pg_rules);
return parse_pg_dsl(pool_cfg.raw_placement);
}
catch (e)
{
if (warn)
console.log('Pool '+pool_id+' configuration is invalid: invalid pg_rules: '+e.message);
console.log('Pool '+pool_id+' configuration is invalid: invalid raw_placement: '+e.message);
}
}
else
@ -1334,7 +1360,7 @@ class Mon
const old_pg_count = prev_pgs.length;
const optimize_cfg = {
osd_weights: Object.values(pool_tree).filter(item => item.level === 'osd').reduce((a, c) => { a[c.id] = c.size; return a; }, {}),
combinator: 1 || this.config.force_new_pg_combinator || !pool_cfg.level_rules && !pool_cfg.pg_rules
combinator: this.config.use_old_pg_combinator && !pool_cfg.level_placement && !pool_cfg.raw_placement
// new algorithm:
? new RuleCombinator(osd_tree, this.get_pg_rules(pool_id, pool_cfg), pool_cfg.max_osd_combinations)
// old algorithm:

View File

@ -101,4 +101,18 @@ check(
[ 'pg_1_2', 'pg_1_3', 'pg_2_3' ]
);
check(
Object.keys(random_custom_combinations(index_tree([
{ id: 'h1', level: 'host' },
{ id: 'h2', level: 'host' },
{ id: 'h3', level: 'host' },
{ id: '1', size: 1, level: 'osd', parent: 'h1' },
{ id: '2', size: 1, level: 'osd', parent: 'h2' },
{ id: '3', size: 1, level: 'osd', parent: 'h2' },
{ id: '4', size: 1, level: 'osd', parent: 'h3' },
{ id: '5', size: 1, level: 'osd', parent: 'h3' },
]), parse_level_indexes({ host: '1122', osd: '1234' }), 10000)).sort(),
[ 'pg_2_3_4_5' ]
);
console.log('OK');

View File

@ -86,6 +86,8 @@ void journal_flusher_t::loop()
cur_flusher_count--;
}
}
if (trim_wanted)
co[0].try_trim = true;
for (int i = 0; (active_flushers > 0 || dequeuing || trim_wanted > 0) && i < cur_flusher_count; i++)
co[i].loop();
}
@ -364,10 +366,10 @@ resume_0:
!flusher->flush_queue.size() || !flusher->dequeuing)
{
stop_flusher:
if (flusher->trim_wanted > 0 && cur.oid.inode != 0)
if (flusher->trim_wanted > 0 && try_trim)
{
// Attempt forced trim
cur.oid = {};
try_trim = false;
flusher->active_flushers++;
goto trim_journal;
}
@ -375,6 +377,7 @@ stop_flusher:
wait_state = 0;
return true;
}
try_trim = true;
cur.oid = flusher->flush_queue.front();
cur.version = flusher->flush_versions[cur.oid];
flusher->flush_queue.pop_front();

View File

@ -60,6 +60,7 @@ class journal_flusher_co
std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;
bool try_trim = false;
bool skip_copy, has_delete, has_writes;
std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it;

View File

@ -177,6 +177,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
ring_data_t *data = ((ring_data_t*)sqe->user_data);
journal.sector_info[cur_sector].written = true;
journal.sector_info[cur_sector].submit_id = ++journal.submit_id;
assert(journal.submit_id != 0); // check overflow
journal.submitting_sectors.push_back(cur_sector);
journal.sector_info[cur_sector].flush_count++;
data->iov = (struct iovec){
@ -192,8 +193,8 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
}
journal.sector_info[cur_sector].dirty = false;
// But always remember that this operation has to wait until this exact journal write is finished
journal.flushing_ops.insert((pending_journaling_t){
.flush_id = journal.sector_info[cur_sector].submit_id,
journal.flushing_ops.emplace(journal.sector_info[cur_sector].submit_id, (pending_journaling_t){
.pending = 1,
.sector = cur_sector,
.op = op,
});
@ -213,23 +214,43 @@ void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_i
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
disk_error_abort("journal write", data->res, data->iov.iov_len);
}
auto fl_it = journal.flushing_ops.upper_bound((pending_journaling_t){ .flush_id = flush_id });
if (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
auto fl_it = journal.flushing_ops.lower_bound(flush_id);
if (fl_it != journal.flushing_ops.end() && fl_it->first == flush_id && fl_it->second.sector >= 0)
{
journal.sector_info[fl_it->sector].flush_count--;
journal.sector_info[fl_it->second.sector].flush_count--;
}
while (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
auto is_first = fl_it == journal.flushing_ops.begin();
while (fl_it != journal.flushing_ops.end())
{
auto priv = PRIV(fl_it->op);
priv->pending_ops--;
assert(priv->pending_ops >= 0);
if (priv->pending_ops == 0)
bool del = false;
if (fl_it->first == flush_id)
{
release_journal_sectors(fl_it->op);
priv->op_state++;
ringloop->wakeup();
fl_it->second.pending = 0;
del = is_first;
}
else
{
del = !fl_it->second.pending;
}
if (del)
{
// Do not complete this operation if previous writes are unfinished
// Otherwise also complete following operations waiting for this one
auto priv = PRIV(fl_it->second.op);
priv->pending_ops--;
assert(priv->pending_ops >= 0);
if (priv->pending_ops == 0)
{
release_journal_sectors(fl_it->second.op);
priv->op_state++;
ringloop->wakeup();
}
journal.flushing_ops.erase(fl_it++);
}
else
{
fl_it++;
}
journal.flushing_ops.erase(fl_it++);
}
}

View File

@ -156,16 +156,11 @@ struct journal_sector_info_t
struct pending_journaling_t
{
uint64_t flush_id;
int pending;
int sector;
blockstore_op_t *op;
};
inline bool operator < (const pending_journaling_t & a, const pending_journaling_t & b)
{
return a.flush_id < b.flush_id || a.flush_id == b.flush_id && a.op < b.op;
}
struct journal_t
{
int fd;
@ -191,7 +186,7 @@ struct journal_t
int cur_sector = 0;
int in_sector_pos = 0;
std::vector<int> submitting_sectors;
std::set<pending_journaling_t> flushing_ops;
std::multimap<uint64_t, pending_journaling_t> flushing_ops;
uint64_t submit_id = 0;
// Used sector map

View File

@ -427,7 +427,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
);
write_iodepth++;
// Got SQEs. Prepare previous journal sector write if required
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
if (immediate_commit == IMMEDIATE_NONE &&
!journal.entry_fits(sizeof(journal_entry_small_write) + dyn_size))
{
@ -503,7 +502,15 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
}
BS_SUBMIT_GET_SQE(sqe2, data2);
data2->iov = (struct iovec){ op->buf, op->len };
data2->callback = cb;
++journal.submit_id;
assert(journal.submit_id != 0); // check overflow
// Make subsequent journal writes wait for our data write
journal.flushing_ops.emplace(journal.submit_id, (pending_journaling_t){
.pending = 1,
.sector = -1,
.op = op,
});
data2->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
my_uring_prep_writev(
sqe2, dsk.journal_fd, &data2->iov, 1, journal.offset + journal.next_free
);

View File

@ -129,6 +129,8 @@ static const char* help_text =
" --block_size 128k Put pool only on OSDs with this data block size\n"
" --bitmap_granularity 4k Put pool only on OSDs with this logical sector size\n"
" --immediate_commit none Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
" --level_placement <rules> Use additional failure domain rules (example: \"dc=112233\")\n"
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
" --used_for_fs <name> Mark pool as used for VitastorFS with metadata in image <name>\n"
@ -145,6 +147,7 @@ static const char* help_text =
" [-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]\n"
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_fs <name>]\n"
" [--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]\n"
" [--level_placement <rules>] [--raw_placement <rules>]\n"
" Non-modifiable parameters (changing them WILL lead to data loss):\n"
" [--block_size <size>] [--bitmap_granularity <size>]\n"
" [--immediate_commit <all|small|none>] [--pg_stripe_size <size>]\n"

View File

@ -82,9 +82,38 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
value = value.uint64_value();
}
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs")
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs" ||
key == "raw_placement")
{
// OK
if (!value.is_string())
{
return key+" must be a string";
}
}
else if (key == "level_placement")
{
// level=rule, level=rule, ...
if (!value.is_object())
{
json11::Json::object obj;
for (auto & item: explode(",", value.string_value(), true))
{
auto pair = explode("=", item, true);
if (pair.size() >= 2)
{
obj[pair[0]] = pair[1];
}
}
if (obj.size())
{
value = obj;
}
else
{
new_cfg.erase(kv_it++);
continue;
}
}
}
else if (key == "osd_tags" || key == "primary_affinity_tags")
{
@ -184,6 +213,38 @@ std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json ol
return "PG size can't be greater than 256";
}
// PG rules
if (!cfg["level_placement"].is_null())
{
for (auto & lr: cfg["level_placement"].object_items())
{
int len = 0;
if (lr.second.is_array())
{
for (auto & lri: lr.second.array_items())
{
if (!lri.is_string() && !lri.is_number())
{
return "--level_placement contains an array with non-scalar value: "+lri.dump();
}
}
len = lr.second.array_items().size();
}
else if (!lr.second.is_string())
{
return "--level_placement contains a non-array and non-string value: "+lr.second.dump();
}
else
{
len = lr.second.string_value().size();
}
if (len != pg_size)
{
return "values in --level_placement should be exactly pg_size ("+std::to_string(pg_size)+") long";
}
}
}
// parity_chunks
uint64_t parity_chunks = 1;
if (scheme == POOL_SCHEME_EC)

View File

@ -48,7 +48,7 @@ start_etcd()
--advertise-client-urls http://$ETCD_IP:$((ETCD_PORT+2*i-2)) --listen-client-urls http://$ETCD_IP:$((ETCD_PORT+2*i-2)) \
--initial-advertise-peer-urls http://$ETCD_IP:$((ETCD_PORT+2*i-1)) --listen-peer-urls http://$ETCD_IP:$((ETCD_PORT+2*i-1)) \
--initial-cluster-token vitastor-tests-etcd --initial-cluster-state new \
--initial-cluster "$ETCD_CLUSTER" \
--initial-cluster "$ETCD_CLUSTER" --max-request-bytes=104857600 \
--max-txn-ops=100000 --auto-compaction-retention=10 --auto-compaction-mode=revision &>./testdata/etcd$i.log &
eval ETCD${i}_PID=$!
}