Compare commits

...

138 Commits

Author SHA1 Message Date
Vitaliy Filippov 23a9aa93b5 Fix pool create/modify --block_size validation
2024-05-04 16:33:22 +03:00
Vitaliy Filippov 2412d9e239 Fix TTL comparison for lease/keepalive
2024-04-30 01:53:05 +03:00
Vitaliy Filippov 9301c857b1 Release 1.6.1
A bunch of monitor fixes

- Add noout flag for OSDs (/vitastor/config/osd/xx)
- Fix "effective" size of degraded PGs (and thus "used space") calculation in monitor
- Fix monitor not clearing PGs of deleted pools
- Fix incorrect PG generation for hosts with 0 OSDs
- Fix monitor crashing during primary OSD recheck when pool has no PGs
- Fix monitor crashing when node_placement included non-existing OSDs
- Fix possible data movement after removing OSDs reweighted to 0
- Remove extra empty keys from pool configurations created by vitastor-cli create-pool
- Fix 32-bit build
2024-04-22 02:01:29 +03:00
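A minimal sketch of using the new noout flag, following the key path from the note above. The etcd endpoint, OSD number and the exact JSON field format are assumptions; only the /vitastor/config/osd/<n> path comes from the commit message.

```
# Mark OSD 3 as "noout" so the monitor does not redistribute its data while it is down
# (field name "noout" per the note above; the boolean value format is assumed).
etcdctl --endpoints=http://10.0.0.1:2379 put /vitastor/config/osd/3 '{"noout":true}'

# Clear the flag again by rewriting the per-OSD key without it.
etcdctl --endpoints=http://10.0.0.1:2379 put /vitastor/config/osd/3 '{}'
```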
Vitaliy Filippov 3094358ec2 Fix autovivification leading to extra empty keys in pool-create
2024-04-20 02:04:09 +03:00
Vitaliy Filippov 87f666d2a2 Filter out OSDs reweighted to 0 2024-04-20 02:03:53 +03:00
Vitaliy Filippov bd7fe4ef8f Filter out non-existing OSDs added in node_placement 2024-04-20 02:03:36 +03:00
Vitaliy Filippov 1b3f9a1416 Do not set non-existing OSD weight to 0, we'll remove them instead 2024-04-20 02:03:11 +03:00
Vitaliy Filippov a7b7354f38 Do not recheck primary distribution when pool has no PGs 2024-04-20 02:02:47 +03:00
Vitaliy Filippov 765befa22f Remove empty nodes from tree because PG DSL expects that all leaf nodes are OSDs 2024-04-20 02:02:28 +03:00
Vitaliy Filippov 87b3ab94fe Do not disable require-atomic-updates and no-unused-vars 2024-04-20 02:02:13 +03:00
Vitaliy Filippov 2c0801f6e4 Configure ESLint and add it to CI
2024-04-16 02:39:31 +03:00
Vitaliy Filippov fd83fef1d9 Fix pool deletion
2024-04-16 02:20:26 +03:00
Vitaliy Filippov 8d1067971b Fix pg_effsize (and thus "used space") calculation in monitor 2024-04-16 02:20:18 +03:00
Vitaliy Filippov ae5af04fde Add noout flag for OSDs 2024-04-16 02:19:55 +03:00
Vitaliy Filippov 266d038b11 Fix 32-bit build warnings and one error again :-)
2024-04-11 22:49:33 +03:00
Vitaliy Filippov ff4414d37e Release 1.6.0
New features:

- Implement "hierarchical failure domains" and other complex distribution rules, for example
  EC 4+2 over 3 DC, with 2 chunks per each DC ([documentation](docs/config/pool.en.md#level_placement))
- Make OSDs handle ENOSPC - now cluster stays online even if some OSDs fill up
  to 100 %, only writes requiring free space hang
- Implement Stage/Unstage & volume locking for CSI to prevent parallel mounting
  and/or modifications of the same volume
- Warn about full and almost full OSDs in vitastor-cli status
- Add an experimental NBD netlink map mode as an option ([documentation](docs/usage/nbd.en.md))
- Add a --pg parameter to vitastor-cli describe, and also print object IDs with the 0x prefix in human-readable format
- Add [administration docs](docs/usage/admin.en.md)

Bug fixes:

- Fix client operation retry timeout - previously the timeout wasn't applied and writes were
  retried almost instantly
- Fix monitors crashing on invalid pool configurations
- Fix journaling - make each journal write wait for all previous journal writes
- Fix monitor thinking that OSD weight is 0 after deleting /osd/config/ key online
- Fix a write stall caused by flusher possibly not trimming journal on rollback
- Set 32k csum_block_size for HDD by default in vitastor-disk
2024-04-09 16:57:59 +03:00
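A hedged sketch of the "EC 4+2 over 3 DCs" placement rule from the notes above, assuming the pool description is stored as JSON under /vitastor/config/pools in etcd and that level_placement maps a placement-tree level to a string assigning each of the 6 PG positions to a group; check docs/config/pool.en.md#level_placement for the authoritative field format. All parameter values here are examples.

```
# Pool 1: EC 4+2, chunks spread over 3 DCs with 2 chunks per DC.
# "112233" assigns PG positions 1-2 to DC group 1, 3-4 to group 2, 5-6 to group 3.
# Writing /vitastor/config/pools replaces the whole pools object, so merge with
# any existing pool definitions in practice.
etcdctl put /vitastor/config/pools '{
  "1": {
    "name": "ecpool",
    "scheme": "ec",
    "pg_size": 6,
    "parity_chunks": 2,
    "pg_minsize": 5,
    "pg_count": 256,
    "failure_domain": "host",
    "level_placement": { "dc": "112233" }
  }
}'
```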
Vitaliy Filippov 0fa7ecc03f Add also a test for OSD tags 2024-04-09 16:57:59 +03:00
Vitaliy Filippov c29bfe12eb Oops - fix filter_by_root_node, add a test for it 2024-04-09 15:48:44 +03:00
Vitaliy Filippov 57bf84ddb2 Fix filtering in mon 2024-04-09 14:51:05 +03:00
Vitaliy Filippov dff4879c8c Check if NBD_ATTR_BACKEND_IDENTIFIER is defined 2024-04-09 13:16:58 +03:00
Vitaliy Filippov af9a853db6 Move NBD netlink map&unmap to separate commands, add "netlink-revive" command
2024-04-08 16:34:41 +03:00
Vitaliy Filippov b7a3275af3 Make netlink optional 2024-04-08 01:51:28 +03:00
Vitaliy Filippov 64c5c4ca26 Fix code style 2024-04-08 01:35:03 +03:00
idelson 442a9d838d nbd-proxy: add configuration via netlink to support various kinds of timeouts.
PR #58 - https://github.com/vitalif/vitastor/pull/58/commits

By MIND Software LLC

By submitting this pull request, I accept Vitastor CLA
2024-04-08 00:50:08 +03:00
Vitaliy Filippov 6366972fe8 Warn about full and almost full OSDs in status
2024-04-07 19:39:51 +03:00
Vitaliy Filippov 2b863fb715 Add ENOSPC handling tests 2024-04-07 19:39:33 +03:00
Vitaliy Filippov 3bf4dd5abd Fix client op retry timeout - do not retry immediately 2024-04-07 19:08:36 +03:00
Vitaliy Filippov 3b84dcaedd Handle ENOSPC during write - rollback partial EC writes, remember partial replica writes
2024-04-07 18:02:05 +03:00
Vitaliy Filippov 20fbc4a745 Add --pg parameter to vitastor-cli describe, print objects with 0x in human-readable format too
2024-04-07 12:39:46 +03:00
Vitaliy Filippov 02993ee1dd Implement Stage/Unstage & volume locking for CSI to prevent parallel modifications of the same volume 2024-04-07 11:48:19 +03:00
Vitaliy Filippov 3629dbc54d Plug the new PG combinator into monitor
2024-04-07 02:44:17 +03:00
Vitaliy Filippov 29284bef40 Implement new DSL/rule-based PG generation algorithm 2024-04-07 00:36:20 +03:00
Vitaliy Filippov 6a924d6066 Extract PG combinator into a separate module 2024-04-07 00:36:20 +03:00
Vitaliy Filippov 9fe779a691 Do not die on invalid pool configurations
2024-04-07 00:36:20 +03:00
Vitaliy Filippov 31c2751b9b Move NBD/VDUSE map/unmap functions to a separate file 2024-04-07 00:36:09 +03:00
Vitaliy Filippov c5195666cd Fix journal sequencing: make each journal write wait for all previous journal writes
2024-04-06 23:53:12 +03:00
Vitaliy Filippov f36d7eb76c Fix monitor thinking that OSD weight is 0 after deleting /osd/config/ key
2024-04-05 23:14:46 +03:00
Vitaliy Filippov dd7f651de1 Add --max-request-bytes=104857600 to etcd params in tests 2024-04-05 23:14:46 +03:00
Vitaliy Filippov a2994ecd0d Fix flusher possibly not trimming journal on rollback 2024-04-05 23:14:39 +03:00
Vitaliy Filippov 5d3aaf016b Add administration docs 2024-03-31 01:54:52 +03:00
Vitaliy Filippov 0b097ca3f2 Set 32k csum_block_size for HDD by default 2024-03-30 16:16:49 +03:00
Vitaliy Filippov 989675a780 s/etcd_ws_keepalive_timeout/etcd_ws_keepalive_interval/ in docs 2024-03-26 01:56:08 +03:00
Vitaliy Filippov f8c403ec9e Add newer benchmark results 2024-03-23 18:28:48 +03:00
Vitaliy Filippov bfbb85e653 Replace -Oanything with -O3, not just -O/-O1/-O2
2024-03-18 02:03:44 +03:00
Vitaliy Filippov 9ad6822353 Release 1.5.0
After half a year of hard work, VitastorFS is finally here! :-)

New features:
- VitastorFS, a full-featured clustered (read-write-many) file system.
  Documentation: [VitastorFS](docs/usage/nfs.en.md)
- Embedded key-value database implementation based on Parallel Optimistic B-Tree
  algorithm and used for the metadata of VitastorFS
- Pool management commands in vitastor-cli (create-pool, list-pools, rm-pool, modify-pool).
  Thanks MIND Software (https://mindsw.io) for their contribution!
  [Documentation](docs/usage/cli.en.md#create-pool)

Bug fixes:
- Fix a very rare "infinite loop" in the client library
- Fix a rare OSD hang during start when zeroing out bad metadata entries left from the previous run
2024-03-16 15:35:10 +03:00
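A hedged usage sketch for the 1.5.0 features above. The exact flag spellings and the vitastor-nfs invocation are assumptions and should be checked against docs/usage/cli.en.md#create-pool and docs/usage/nfs.en.md; pool, file system and mountpoint names are examples.

```
# Create a 2-replica pool (PG size and PG count values are examples).
vitastor-cli create-pool testpool -s 2 -n 256

# List existing pools and their parameters.
vitastor-cli list-pools

# Serve a VitastorFS file system from that pool and mount it locally
# (the --fs/--pool option names are assumptions).
vitastor-nfs mount --fs testfs --pool testpool /mnt/testfs
```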
Vitaliy Filippov 2043b4e374 Fix build errors for gcc 8 2024-03-16 15:35:10 +03:00
Vitaliy Filippov de840e6fe3 Reduce kv-cli loadjson load parallelism to 16 2024-03-16 15:35:10 +03:00
Vitaliy Filippov b5e04bf809 Fix build warning 2024-03-16 15:35:10 +03:00
Vitaliy Filippov 8807a1623b Fix markdown tables 2024-03-16 15:35:10 +03:00
Vitaliy Filippov f12855c31b Add vitastor-kv to packages 2024-03-16 15:35:10 +03:00
Vitaliy Filippov e75dcc9a71 Add documentation for VitastorFS 2024-03-16 15:16:43 +03:00
Vitaliy Filippov 88516ab4bd Remove extra log 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 6221126b4f Allow to print simple-offsets just given the device size 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 6783d4a13c Implement fool protection for FS pools 2024-03-16 13:24:36 +03:00
Vitaliy Filippov dcbe1afac3 Store pool ID in inode metadata 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 0bde28c24a Make nfs_do_rmw a library function 2024-03-16 13:24:36 +03:00
Vitaliy Filippov bb8ca6184e Support setattr guard 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 87310ef7bb Support ctime 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 4f4b2dab80 Log NFS liveness checks 2024-03-16 13:24:36 +03:00
Vitaliy Filippov f70da82317 Add loadjson command to vitastor-kv 2024-03-16 13:24:36 +03:00
Vitaliy Filippov e42148f347 Allow to specify KV commands on command line 2024-03-16 13:24:36 +03:00
Vitaliy Filippov c289584469 Add JSON dump format 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 018e89f867 Erase verf key left from creation from ientries on every modification 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 603dc68f11 Implement async mtime change 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 7b12342933 Allow to specify additional NFS mount options 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 44bf0f16ee Fix malloc/free in nfs_kv_read/write 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 8840c84572 Fix "bad key in etcd" in mon for FS pools 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 5b747c12ec Check if already mounted before mounting 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 05f5f46162 Fix zero used space, update mtime when moving/changing inode 2024-03-16 13:24:36 +03:00
Vitaliy Filippov b5604191c8 Ignore ECANCELED in nfs-proxy (happens in io_uring on fork) 2024-03-16 13:24:36 +03:00
Vitaliy Filippov e871de27de Support unaligned shared_offsets, align shared file data instead of header 2024-03-16 13:24:36 +03:00
Vitaliy Filippov f600ce98e2 Implement auto-unmount local NFS server mode for vitastor-nfs 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 57605a5c13 Return error on failed shrink 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 29bd4561bb Implement rename over an existing file/directory 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 7142460ec8 Support --logfile in nfs-proxy 2024-03-16 13:24:36 +03:00
Vitaliy Filippov d03f19ebe5 Fix shared file overlap, add FIXMEs 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 88f9d18be3 Create inode, then direntry, not direntry, then inode; retry ID collisions 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 6213fbd8c6 Fix NFS shared/aligned write FIXMEs 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 3aee37eadd Allow to disable per-inode stats for VitastorFS pools 2024-03-16 13:24:36 +03:00
Vitaliy Filippov ecfc753e93 Add basic NFS tests, fix bugs 2024-03-16 13:24:36 +03:00
Vitaliy Filippov a574f9ad71 Return block NFS implementation back as an option too 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 7c235c9103 Move KV FS header into a separate file 2024-03-16 13:24:36 +03:00
Vitaliy Filippov e5bb986164 Implement packing small files into shared inodes 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 181795d748 Split new NFS proxy implementation into multiple files 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 8cdc38805b WIP VitastorFS with metadata storage in VitastorKV 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 0cd455d17f First just recheck version without actually re-reading block in vitastor-kv 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 32ba653ba6 Fix vitastor-kv hang on reopen & unfinished closed listing 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 231d4b15fc Add loadable dump format to vitastor-kv (dump) 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 9dc4d5fd7b Fix freeing r/w buffers on errors in kv_db 2024-03-16 13:24:36 +03:00
Vitaliy Filippov e58538fa47 Fix eviction when random_pos selects the end 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 11ac9e7024 Implement min/max list_count to make listings during performance test reasonable 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 511bc3df1c Fix and improve parallel allocation
- Do not try to allocate more DB blocks in an inode block until it's "confirmed" and "locked" by the first write
- Do not recheck for new zero DB blocks on first write into an inode block - a CAS failure means someone else is already writing into it
- Throw new allocation blocks away regardless of whether the known_version is 0 on a CAS failure
2024-03-16 13:24:36 +03:00
Vitaliy Filippov a64f0d1f73 Implement key_prefix for K/V stress test 2024-03-16 13:24:36 +03:00
Vitaliy Filippov ec5f7c6b87 More fixes
- do not overwrite a block with older version if known version is newer
  (read may start before update and end after update)
- invalidated block versions can't be remembered and trusted
- right boundary for split blocks is right_half when diving down, not key_lt
- restart update also when block is "invalidated", not just on version mismatch
- copy callback in listings to avoid closure destruction bugs too
2024-03-16 13:24:36 +03:00
Vitaliy Filippov 3ebed9a749 Add logging and one more assert 2024-03-16 13:24:36 +03:00
Vitaliy Filippov eab67a6e8f Make get_block() wait for updating when unrelated block is found along the path 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 20993d9b7a Fix a race condition where changed blocks were parsed over existing cached blocks and getting a mix of data 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 5cf9b343c0 Simplify code by removing an unneeded "optimisation" 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 79ae0aadcd Add kv_log_level, print warnings on level 1, trace ops on level 10 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 605afc3583 Fix duplicate keys in listings on parallel updates -- do not rewind key "iterator position" 2024-03-16 13:24:36 +03:00
Vitaliy Filippov c0681d8242 Implement key suffix to avoid collisions of multiple test workers 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 763e77b4f4 Do not complain on empty first block 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 19426aa4c5 Add JSON output for stress-tester 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 08f586bcec Print total stats 2024-03-16 13:24:36 +03:00
Vitaliy Filippov f1cd87473a Do not send more than op_count operations (fix segfault on finish) 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 1bd8d2da56 Add some more resiliency to serialize() 2024-03-16 13:24:36 +03:00
Vitaliy Filippov a7396d2baf Invalidate blocks being updated too 2024-03-16 13:24:36 +03:00
Vitaliy Filippov e98a38810d Change new block allocation method: make each writer choose multiple empty PG blocks and place blocks in them 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 28c4324c36 Remove blocks from cache on unsuccessful updates 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 31ec3fa8f5 Allow to track multiple updates per block (it should never happen though) 2024-03-16 13:24:36 +03:00
Vitaliy Filippov e4fa26f60a Do not call stop_updating after failed write_new_block and after clear_block (both delete the item) 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 59ae27f9e5 Track versions of parent blocks and recheck if changed during update 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 2c6a301d9b Fix resume_split condition (key_lt can also be "") 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 01558349f8 Experiment: transform offsets for better sharding 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 36f4717d0d More post-stress-test fixes
- Prevent _split types of new blocks
- Stop updating new blocks only after the whole update, otherwise pointers
  may become invalid
- Use recheck_none for updates initially
- Use UINT64_MAX as initial block version when postponing ops, otherwise the
  check fails when the block is initially empty. This for example leads to
  writing both leaf items & block pointers (which is incorrect) into the root
  block when starting stress-test with --parallelism 32
- Fix -EINTR comparison
2024-03-16 13:24:36 +03:00
Vitaliy Filippov babaf2a0ce Print operation statistics 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 5773f1a375 K/V fixes after stress-test :-)
- track block versions correctly - per inode block (128kb) instead of tree block (4kb)
- prevent multiple parallel CAS writes of the same inode block
- add logging for EILSEQ which means invalid data in the tree
- fix get_block updated flag which was true for blocks already in cache and was leading to infinite loops on "unrelated block" errors
- apply changes to blocks in cache only after successful writes (using "virtual changes")
- do not replace cached block with an older version from disk
- recheck "unrelated blocks" (read/update collisions) until data stops changing
- track tree path correctly - do not treat split block as parent of its right half
- correctly move blocks when finding new empty place on disk
- restart updates from the beginning when one of blocks is changed by a parallel update
- fix delete using SET opcode and setting key to the empty value instead
- prevent changing the same key more than 1 time in parallel
- fix listing verification
- resume continue_updates in update_find (required because it uses continue_update itself)
- add allow_old_cached parameter to get()
2024-03-16 13:24:36 +03:00
Vitaliy Filippov 57222a9f79 Implement K/V DB stress tester 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 61ef000c6e Evict blocks based on memory limit & block usage 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 7d5e1cc393 Track blocks per level 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 5e7f27a02d Track block level 2024-03-16 13:24:36 +03:00
Vitaliy Filippov fd1d8a8520 Experimental B-Tree Vitastor embedded K/V database implementation! 2024-03-16 13:24:36 +03:00
Vitaliy Filippov c364e14c40 Stop then retry, not retry then stop 2024-03-16 13:24:36 +03:00
Vitaliy Filippov 3ebbfa0428 Fix another rare OSD hang on zeroing out entries on start 2024-03-16 13:24:36 +03:00
Vitaliy Filippov aa79d1db1c Fix incorrect "changing scheme" message in modify-pool
2024-03-06 00:41:35 +03:00
Vitaliy Filippov a1fecb7eff Move callback away when calling it in cluster_client 2024-03-06 00:41:35 +03:00
Vitaliy Filippov ff74b19423 Fix rare OSD hang on zeroing out bad entries on start 2024-03-06 00:41:35 +03:00
Vitaliy Filippov 4cf6dceed7 Merge branch 'rel-1.4'
2024-02-29 09:59:01 +03:00
Vitaliy Filippov 38b8963330 Release 1.4.8
- Do not use \r if output is not a terminal (should fix unexpected job output in proxmox)
- Fix rm/rm-data error return code, add --down-ok option to bypass the error
- Add an EIO retry timeout, allow disabling these retries, and rename up_wait_retry_interval to client_retry_interval
- Add ubuntu jammy build
- Wait for blockstore initialisation before starting OSD (prevent timeouts when init takes time)
- Fix a rare use-after-free in automatic sync after delete in blockstore
2024-02-29 09:58:34 +03:00
Vitaliy Filippov 77167e2920 Do not use \r if output is not a terminal 2024-02-29 00:21:17 +03:00
Vitaliy Filippov 5af23672d0 Fix rm/rm-data error return code, add --down-ok option to bypass the error 2024-02-29 00:20:10 +03:00
Vitaliy Filippov 6bf1f539a6 Add EIO retry timeout and allow to disable these retries, rename up_wait_retry_interval to client_retry_interval 2024-02-28 13:10:02 +03:00
Vitaliy Filippov 4eab26f968 Add documentation and a very basic test for pool management commands
Test / test_snapshot_ec (push) Successful in 31s Details
Test / test_rm (push) Successful in 17s Details
Test / test_move_reappear (push) Successful in 24s Details
Test / test_snapshot_down (push) Successful in 27s Details
Test / test_snapshot_down_ec (push) Successful in 33s Details
Test / test_splitbrain (push) Successful in 20s Details
Test / test_snapshot_chain (push) Successful in 2m15s Details
Test / test_snapshot_chain_ec (push) Successful in 2m58s Details
Test / test_rebalance_verify_imm (push) Successful in 5m3s Details
Test / test_rebalance_verify (push) Successful in 5m36s Details
Test / test_switch_primary (push) Successful in 37s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 4m3s Details
Test / test_write_no_same (push) Successful in 21s Details
Test / test_write (push) Successful in 58s Details
Test / test_write_xor (push) Successful in 1m31s Details
Test / test_rebalance_verify_ec (push) Successful in 6m20s Details
Test / test_heal_pg_size_2 (push) Successful in 4m7s Details
Test / test_heal_ec (push) Successful in 4m33s Details
Test / test_heal_csum_32k_dmj (push) Successful in 5m53s Details
Test / test_heal_csum_32k_dj (push) Successful in 6m17s Details
Test / test_heal_csum_32k (push) Successful in 7m23s Details
Test / test_heal_csum_4k_dmj (push) Successful in 6m56s Details
Test / test_scrub_zero_osd_2 (push) Successful in 1m26s Details
Test / test_scrub (push) Successful in 1m29s Details
Test / test_heal_csum_4k_dj (push) Successful in 7m1s Details
Test / test_scrub_xor (push) Successful in 1m1s Details
Test / test_heal_csum_4k (push) Successful in 6m34s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 32s Details
Test / test_scrub_pg_size_3 (push) Successful in 1m19s Details
Test / test_scrub_ec (push) Successful in 24s Details
2024-02-28 13:08:04 +03:00
Vitaliy Filippov 86243b7101 Rework & fix pool-create / pool-modify / pool-ls 2024-02-28 13:08:04 +03:00
idelson dc92851322 vitastor-cli: add commands to control pools: pool-create, pool-ls, pool-modify, pool-rm
PR #59 - https://github.com/vitalif/vitastor/pull/58/commits

By MIND Software LLC

By submitting this pull request, I accept Vitastor CLA
2024-02-28 13:08:04 +03:00
Zibort Cloud 02d1f16bbd Add ubuntu jammy build
PR #62 #62

I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md
2024-02-28 11:43:54 +03:00
Vitaliy Filippov fc413038d1 Wait for blockstore initialisation before starting OSD
Test / test_cas (push) Has been cancelled Details
Test / test_change_pg_count (push) Has been cancelled Details
Test / test_change_pg_count_ec (push) Has been cancelled Details
Test / test_change_pg_size (push) Has been cancelled Details
Test / test_create_nomaxid (push) Has been cancelled Details
Test / test_etcd_fail (push) Has been cancelled Details
Test / test_interrupted_rebalance (push) Has been cancelled Details
Test / test_interrupted_rebalance_imm (push) Has been cancelled Details
Test / test_interrupted_rebalance_ec (push) Has been cancelled Details
Test / test_interrupted_rebalance_ec_imm (push) Has been cancelled Details
Test / test_failure_domain (push) Has been cancelled Details
Test / test_snapshot (push) Has been cancelled Details
Test / test_snapshot_ec (push) Has been cancelled Details
Test / test_minsize_1 (push) Has been cancelled Details
Test / test_move_reappear (push) Has been cancelled Details
Test / test_rm (push) Has been cancelled Details
Test / test_snapshot_chain (push) Has been cancelled Details
Test / test_snapshot_chain_ec (push) Has been cancelled Details
Test / test_snapshot_down (push) Has been cancelled Details
Test / test_snapshot_down_ec (push) Has been cancelled Details
Test / test_splitbrain (push) Has been cancelled Details
Test / test_rebalance_verify (push) Has been cancelled Details
Test / test_rebalance_verify_imm (push) Has been cancelled Details
Test / test_rebalance_verify_ec (push) Has been cancelled Details
Test / test_rebalance_verify_ec_imm (push) Has been cancelled Details
Test / test_switch_primary (push) Has been cancelled Details
Test / test_write (push) Has been cancelled Details
Test / test_write_xor (push) Has been cancelled Details
Test / test_write_no_same (push) Has been cancelled Details
Test / test_heal_pg_size_2 (push) Has been cancelled Details
2024-02-27 02:20:04 +03:00
Vitaliy Filippov 1bc0b5aab3 Fix a rare use-after-free in automatic sync after delete in blockstore
Test / test_interrupted_rebalance_ec (push) Successful in 2m49s Details
Test / test_rm (push) Successful in 14s Details
Test / test_move_reappear (push) Successful in 21s Details
Test / test_snapshot_down (push) Successful in 31s Details
Test / test_snapshot_down_ec (push) Successful in 30s Details
Test / test_splitbrain (push) Successful in 23s Details
Test / test_snapshot_chain (push) Successful in 2m29s Details
Test / test_snapshot_chain_ec (push) Successful in 2m48s Details
Test / test_rebalance_verify_imm (push) Successful in 4m9s Details
Test / test_rebalance_verify (push) Successful in 4m42s Details
Test / test_switch_primary (push) Successful in 41s Details
Test / test_write (push) Successful in 43s Details
Test / test_write_no_same (push) Successful in 21s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 3m37s Details
Test / test_write_xor (push) Successful in 1m11s Details
Test / test_rebalance_verify_ec (push) Successful in 7m14s Details
Test / test_heal_pg_size_2 (push) Successful in 4m3s Details
Test / test_heal_ec (push) Successful in 4m18s Details
Test / test_heal_csum_32k_dmj (push) Successful in 5m5s Details
Test / test_heal_csum_32k_dj (push) Successful in 6m52s Details
Test / test_heal_csum_32k (push) Successful in 6m23s Details
Test / test_heal_csum_4k_dmj (push) Successful in 6m23s Details
Test / test_scrub (push) Successful in 1m30s Details
Test / test_scrub_zero_osd_2 (push) Successful in 1m18s Details
Test / test_heal_csum_4k_dj (push) Successful in 7m9s Details
Test / test_scrub_xor (push) Successful in 57s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m5s Details
Test / test_scrub_ec (push) Successful in 1m6s Details
Test / test_scrub_pg_size_3 (push) Successful in 2m3s Details
Test / test_heal_csum_4k (push) Successful in 4m54s Details
ASan report: [0] READ of size 16 at operator() /root/vitastor/src/blockstore_write.cpp:100
...[5] blockstore_impl_t::ack_sync(blockstore_op_t*) /root/vitastor/src/blockstore_sync.cpp:232
2024-02-24 00:06:36 +03:00
159 changed files with 16522 additions and 1989 deletions

View File

@ -22,7 +22,7 @@ RUN apt-get update
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN apt-get -y install jq lp-solve sudo
RUN apt-get -y install jq lp-solve sudo nfs-common
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN set -ex; \

View File

@ -64,6 +64,13 @@ jobs:
# leak sanitizer sometimes crashes
- run: cd /root/vitastor/build && ASAN_OPTIONS=detect_leaks=0 make -j16 test
npm_lint:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- run: cd /root/vitastor/mon && npm run lint
test_add_osd:
runs-on: ubuntu-latest
needs: build
@ -532,6 +539,24 @@ jobs:
echo ""
done
test_root_node:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_root_node.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_switch_primary:
runs-on: ubuntu-latest
needs: build
@ -748,6 +773,96 @@ jobs:
echo ""
done
test_osd_tags:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_osd_tags.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_enospc:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_enospc.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_enospc_xor:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: SCHEME=xor /root/vitastor/tests/test_enospc.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_enospc_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_enospc.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_enospc_imm_xor:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: IMMEDIATE_COMMIT=1 SCHEME=xor /root/vitastor/tests/test_enospc.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_scrub:
runs-on: ubuntu-latest
needs: build
@ -856,3 +971,21 @@ jobs:
echo ""
done
test_nfs:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_nfs.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done

View File

@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "1.4.7")
set(VERSION "1.6.1")
add_subdirectory(src)

View File

@ -6,8 +6,8 @@
Make clustered block storage fast again!
Vitastor is a distributed block SDS (software-defined storage), a direct analogue of Ceph RBD and
the internal SDS of popular cloud providers. However, unlike them, Vitastor
Vitastor is a distributed block and file SDS (software-defined storage), a direct analogue of Ceph RBD and CephFS,
as well as the internal SDS of popular cloud providers. However, unlike them, Vitastor
is fast and simple at the same time. It's just still a bit young :-).
Vitastor is architecturally similar to Ceph, which means atomicity and strong consistency,
@ -63,11 +63,13 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
- [fio](docs/usage/fio.ru.md) for performance tests
- [NBD](docs/usage/nbd.ru.md) for kernel mounts
- [QEMU and qemu-img](docs/usage/qemu.ru.md)
- [NFS](docs/usage/nfs.ru.md) proxy for VMWare and similar
- [NFS](docs/usage/nfs.ru.md) clustered file system and pseudo-FS proxy
- [Administration](docs/usage/admin.ru.md)
- Performance
- [Understanding storage performance](docs/performance/understanding.ru.md)
- [Theoretical maximum performance](docs/performance/theoretical.ru.md)
- [Example comparison with Ceph](docs/performance/comparison1.ru.md)
- [Newer benchmark of Vitastor 1.3.1](docs/performance/bench2.ru.md)
## Author and License

View File

@ -6,9 +6,9 @@
Make Clustered Block Storage Fast Again.
Vitastor is a distributed block SDS, direct replacement of Ceph RBD and internal SDS's
of public clouds. However, in contrast to them, Vitastor is fast and simple at the same time.
The only thing is it's slightly young :-).
Vitastor is a distributed block and file SDS, direct replacement of Ceph RBD and CephFS,
and also internal SDS's of public clouds. However, in contrast to them, Vitastor is fast
and simple at the same time. The only thing is it's slightly young :-).
Vitastor is architecturally similar to Ceph which means strong consistency,
primary-replication, symmetric clustering and automatic data distribution over any
@ -63,11 +63,13 @@ Read more details below in the documentation.
- [fio](docs/usage/fio.en.md) for benchmarks
- [NBD](docs/usage/nbd.en.md) for kernel mounts
- [QEMU and qemu-img](docs/usage/qemu.en.md)
- [NFS](docs/usage/nfs.en.md) emulator for VMWare and similar
- [NFS](docs/usage/nfs.en.md) clustered file system and pseudo-FS proxy
- [Administration](docs/usage/admin.en.md)
- Performance
- [Understanding storage performance](docs/performance/understanding.en.md)
- [Theoretical performance](docs/performance/theoretical.en.md)
- [Example comparison with Ceph](docs/performance/comparison1.en.md)
- [Newer benchmark of Vitastor 1.3.1](docs/performance/bench2.en.md)
## Author and License

View File

@ -1,4 +1,4 @@
VERSION ?= v1.4.7
VERSION ?= v1.6.1
all: build push

View File

@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.4.7
image: vitalif/vitastor-csi:v1.6.1
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.4.7
image: vitalif/vitastor-csi:v1.6.1
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.4.7"
vitastorCSIDriverVersion = "1.6.1"
)
// Config struct fills the parameters of request or user input

View File

@ -5,14 +5,13 @@ package vitastor
import (
"context"
"errors"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"syscall"
"time"
@ -34,6 +33,9 @@ type NodeServer struct
stateDir string
mounter mount.Interface
restartInterval time.Duration
mu sync.Mutex
cond *sync.Cond
volumeLocks map[string]bool
}
type DeviceState struct
@ -63,7 +65,9 @@ func NewNodeServer(driver *Driver) *NodeServer
useVduse: checkVduseSupport(),
stateDir: stateDir,
mounter: mount.New(""),
volumeLocks: make(map[string]bool),
}
ns.cond = sync.NewCond(&ns.mu)
if (ns.useVduse)
{
ns.restoreVduseDaemons()
@ -81,299 +85,24 @@ func NewNodeServer(driver *Driver) *NodeServer
return ns
}
func checkVduseSupport() bool
func (ns *NodeServer) lockVolume(lockId string)
{
// Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
vduse := true
for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
ns.mu.Lock()
defer ns.mu.Unlock()
for (ns.volumeLocks[lockId])
{
_, err := os.Stat("/sys/module/"+mod)
if (err != nil)
{
if (!errors.Is(err, os.ErrNotExist))
{
klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
}
c := exec.Command("/sbin/modprobe", mod)
c.Stdout = os.Stderr
c.Stderr = os.Stderr
err := c.Run()
if (err != nil)
{
klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
vduse = false
break
}
}
ns.cond.Wait()
}
// Check that vdpa tool functions
if (vduse)
{
c := exec.Command("/sbin/vdpa", "-j", "dev")
c.Stderr = os.Stderr
err := c.Run()
if (err != nil)
{
klog.Errorf("/sbin/vdpa -j dev failed: %v", err)
vduse = false
}
}
if (!vduse)
{
klog.Errorf(
"Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
" For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
)
}
return vduse
ns.volumeLocks[lockId] = true
ns.cond.Broadcast()
}
// NodeStageVolume mounts the volume to a staging path on the node.
func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error)
func (ns *NodeServer) unlockVolume(lockId string)
{
return &csi.NodeStageVolumeResponse{}, nil
}
// NodeUnstageVolume unstages the volume from the staging path
func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error)
{
return &csi.NodeUnstageVolumeResponse{}, nil
}
func Contains(list []string, s string) bool
{
for i := 0; i < len(list); i++
{
if (list[i] == s)
{
return true
}
}
return false
}
func (ns *NodeServer) mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
{
// Map NBD device
// FIXME: Check if already mapped
args := []string{
"map", "--image", volName,
}
if (ctxVars["configPath"] != "")
{
args = append(args, "--config_path", ctxVars["configPath"])
}
if (readonly)
{
args = append(args, "--readonly", "1")
}
stdout, stderr, err := system("/usr/bin/vitastor-nbd", args...)
dev := strings.TrimSpace(string(stdout))
if (dev == "")
{
return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
}
return dev, err
}
func (ns *NodeServer) unmapNbd(devicePath string)
{
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
}
func findByPidFile(pidFile string) (*os.Process, error)
{
pidBuf, err := os.ReadFile(pidFile)
if (err != nil)
{
return nil, err
}
pid, err := strconv.ParseInt(strings.TrimSpace(string(pidBuf)), 0, 64)
if (err != nil)
{
return nil, err
}
proc, err := os.FindProcess(int(pid))
if (err != nil)
{
return nil, err
}
return proc, nil
}
func killByPidFile(pidFile string) error
{
klog.Infof("killing process with PID from file %s", pidFile)
proc, err := findByPidFile(pidFile)
if (err != nil)
{
return err
}
return proc.Signal(syscall.SIGTERM)
}
func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
{
// Start qemu-storage-daemon
blockSpec := map[string]interface{}{
"node-name": "disk1",
"driver": "vitastor",
"image": volName,
"cache": map[string]bool{
"direct": true,
"no-flush": false,
},
"discard": "unmap",
}
if (configPath != "")
{
blockSpec["config-path"] = configPath
}
blockSpecJson, _ := json.Marshal(blockSpec)
writable := "true"
if (readonly)
{
writable = "false"
}
_, _, err := system(
"/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile, "--blockdev", string(blockSpecJson),
"--export", "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable,
)
return err
}
func (ns *NodeServer) mapVduse(volName string, ctxVars map[string]string, readonly bool) (string, string, error)
{
// Generate state file
stateFd, err := os.CreateTemp(ns.stateDir, "vitastor-vduse-*.json")
if (err != nil)
{
return "", "", err
}
stateFile := stateFd.Name()
stateFd.Close()
vdpaId := filepath.Base(stateFile)
vdpaId = vdpaId[0:len(vdpaId)-5] // remove ".json"
pidFile := ns.stateDir + vdpaId + ".pid"
// Map VDUSE device via qemu-storage-daemon
err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
if (err == nil)
{
// Add device to VDPA bus
_, _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
if (err == nil)
{
// Find block device name
var matches []string
matches, err = filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
if (err == nil && len(matches) == 0)
{
err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
}
if (err == nil)
{
blockdev := "/dev/"+filepath.Base(matches[0])
_, err = os.Stat(blockdev)
if (err == nil)
{
// Generate state file
stateJSON, _ := json.Marshal(&DeviceState{
ConfigPath: ctxVars["configPath"],
VdpaId: vdpaId,
Image: volName,
Blockdev: blockdev,
Readonly: readonly,
PidFile: pidFile,
})
err = os.WriteFile(stateFile, stateJSON, 0600)
if (err == nil)
{
return blockdev, vdpaId, nil
}
}
}
}
killErr := killByPidFile(pidFile)
if (killErr != nil)
{
klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
}
os.Remove(stateFile)
os.Remove(pidFile)
}
return "", "", err
}
func (ns *NodeServer) unmapVduse(devicePath string)
{
if (len(devicePath) < 6 || devicePath[0:6] != "/dev/v")
{
klog.Errorf("%s does not start with /dev/v", devicePath)
return
}
vduseDev, err := os.Readlink("/sys/block/"+devicePath[5:])
if (err != nil)
{
klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
return
}
vdpaId := ""
p := strings.Index(vduseDev, "/vduse/")
if (p >= 0)
{
vduseDev = vduseDev[p+7:]
p = strings.Index(vduseDev, "/")
if (p >= 0)
{
vdpaId = vduseDev[0:p]
}
}
if (vdpaId == "")
{
klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, vduseDev)
return
}
ns.unmapVduseById(vdpaId)
}
func (ns *NodeServer) unmapVduseById(vdpaId string)
{
_, err := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
if (err != nil)
{
klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", err)
}
else
{
_, _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
}
stateFile := ns.stateDir + vdpaId + ".json"
os.Remove(stateFile)
pidFile := ns.stateDir + vdpaId + ".pid"
_, err = os.Stat(pidFile)
if (os.IsNotExist(err))
{
// ok, already killed
}
else if (err != nil)
{
klog.Errorf("Failed to stat %v: %v", pidFile, err)
return
}
else
{
err = killByPidFile(pidFile)
if (err != nil)
{
klog.Errorf("Failed to kill started qemu-storage-daemon: %v", err)
}
os.Remove(pidFile)
}
ns.mu.Lock()
defer ns.mu.Unlock()
delete(ns.volumeLocks, lockId)
ns.cond.Broadcast()
}
func (ns *NodeServer) restarter()
@ -422,58 +151,83 @@ func (ns *NodeServer) restoreVduseDaemons()
vdpaId := filepath.Base(stateFile)
vdpaId = vdpaId[0:len(vdpaId)-5]
// Check if VDPA device is still added to the bus
if (devs[vdpaId] != nil)
{
// Check if the storage daemon is still active
pidFile := ns.stateDir + vdpaId + ".pid"
exists := false
proc, err := findByPidFile(pidFile)
if (err == nil)
{
exists = proc.Signal(syscall.Signal(0)) == nil
}
if (!exists)
{
// Restart daemon
stateJSON, err := os.ReadFile(stateFile)
if (err != nil)
{
klog.Warningf("error reading state file %v: %v", stateFile, err)
}
else
{
var state DeviceState
err := json.Unmarshal(stateJSON, &state)
if (err != nil)
{
klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
}
else
{
klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
_ = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
}
}
}
}
else
if (devs[vdpaId] == nil)
{
// Unused, clean it up
ns.unmapVduseById(vdpaId)
unmapVduseById(ns.stateDir, vdpaId)
continue
}
stateJSON, err := os.ReadFile(stateFile)
if (err != nil)
{
klog.Warningf("error reading state file %v: %v", stateFile, err)
continue
}
var state DeviceState
err = json.Unmarshal(stateJSON, &state)
if (err != nil)
{
klog.Warningf("state file %v contains invalid JSON (error %v): %v", stateFile, err, string(stateJSON))
continue
}
ns.lockVolume(state.ConfigPath+":"+state.Image)
// Recheck state file after locking
_, err = os.ReadFile(stateFile)
if (err != nil)
{
klog.Warningf("state file %v disappeared, skipping volume", stateFile)
ns.unlockVolume(state.ConfigPath+":"+state.Image)
continue
}
// Check if the storage daemon is still active
pidFile := ns.stateDir + vdpaId + ".pid"
exists := false
proc, err := findByPidFile(pidFile)
if (err == nil)
{
exists = proc.Signal(syscall.Signal(0)) == nil
}
if (!exists)
{
// Restart daemon
klog.Warningf("restarting storage daemon for volume %v (VDPA ID %v)", state.Image, vdpaId)
_ = startStorageDaemon(vdpaId, state.Image, pidFile, state.ConfigPath, state.Readonly)
}
ns.unlockVolume(state.ConfigPath+":"+state.Image)
}
}
// NodePublishVolume mounts the volume mounted to the staging path to the target path
func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
// NodeStageVolume mounts the volume to a staging path on the node.
func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRequest) (*csi.NodeStageVolumeResponse, error)
{
klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))
klog.Infof("received node stage volume request %+v", protosanitizer.StripSecrets(req))
targetPath := req.GetTargetPath()
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
_, err = GetConnectionParams(ctxVars)
if (err != nil)
{
return nil, err
}
volName := ctxVars["name"]
ns.lockVolume(ctxVars["configPath"]+":"+volName)
defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
targetPath := req.GetStagingTargetPath()
isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that it's not already mounted
_, err := mount.IsNotMountPoint(ns.mounter, targetPath)
_, err = mount.IsNotMountPoint(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(err))
@ -509,28 +263,14 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
}
}
ctxVars := make(map[string]string)
err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := ctxVars["name"]
_, err = GetConnectionParams(ctxVars)
if (err != nil)
{
return nil, err
}
var devicePath, vdpaId string
if (!ns.useVduse)
{
devicePath, err = ns.mapNbd(volName, ctxVars, req.GetReadonly())
devicePath, err = mapNbd(volName, ctxVars, false)
}
else
{
devicePath, vdpaId, err = ns.mapVduse(volName, ctxVars, req.GetReadonly())
devicePath, vdpaId, err = mapVduse(ns.stateDir, volName, ctxVars, false)
}
if (err != nil)
{
@ -614,26 +354,182 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
)
goto unmap
}
return &csi.NodePublishVolumeResponse{}, nil
return &csi.NodeStageVolumeResponse{}, nil
unmap:
if (!ns.useVduse || len(devicePath) >= 8 && devicePath[0:8] == "/dev/nbd")
{
ns.unmapNbd(devicePath)
unmapNbd(devicePath)
}
else
{
ns.unmapVduseById(vdpaId)
unmapVduseById(ns.stateDir, vdpaId)
}
return nil, err
}
// NodeUnstageVolume unstages the volume from the staging path
func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolumeRequest) (*csi.NodeUnstageVolumeResponse, error)
{
klog.Infof("received node unstage volume request %+v", protosanitizer.StripSecrets(req))
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := ctxVars["name"]
ns.lockVolume(ctxVars["configPath"]+":"+volName)
defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
targetPath := req.GetStagingTargetPath()
devicePath, refCount, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(err))
{
return nil, status.Error(codes.NotFound, "Target path not found")
}
return nil, err
}
if (devicePath == "")
{
// volume not mounted
klog.Warningf("%s is not a mountpoint, deleting", targetPath)
os.Remove(targetPath)
return &csi.NodeUnstageVolumeResponse{}, nil
}
// unmount
err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
if (err != nil)
{
return nil, err
}
// unmap device
if (refCount == 1)
{
if (!ns.useVduse)
{
unmapNbd(devicePath)
}
else
{
unmapVduse(ns.stateDir, devicePath)
}
}
return &csi.NodeUnstageVolumeResponse{}, nil
}
// NodePublishVolume mounts the volume mounted to the staging path to the target path
func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error)
{
klog.Infof("received node publish volume request %+v", protosanitizer.StripSecrets(req))
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
_, err = GetConnectionParams(ctxVars)
if (err != nil)
{
return nil, err
}
volName := ctxVars["name"]
ns.lockVolume(ctxVars["configPath"]+":"+volName)
defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
stagingTargetPath := req.GetStagingTargetPath()
targetPath := req.GetTargetPath()
isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that stagingTargetPath is mounted
_, err = mount.IsNotMountPoint(ns.mounter, stagingTargetPath)
if (err != nil)
{
klog.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
return nil, fmt.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
}
// Check that targetPath is not already mounted
_, err = mount.IsNotMountPoint(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(err))
{
if (isBlock)
{
pathFile, err := os.OpenFile(targetPath, os.O_CREATE|os.O_RDWR, 0o600)
if (err != nil)
{
klog.Errorf("failed to create block device mount target %s with error: %v", targetPath, err)
return nil, err
}
err = pathFile.Close()
if (err != nil)
{
klog.Errorf("failed to close %s with error: %v", targetPath, err)
return nil, err
}
}
else
{
err := os.MkdirAll(targetPath, 0777)
if (err != nil)
{
klog.Errorf("failed to create fs mount target %s with error: %v", targetPath, err)
return nil, err
}
}
}
else
{
return nil, err
}
}
execArgs := []string{"--bind", stagingTargetPath, targetPath}
if (req.GetReadonly())
{
execArgs = append(execArgs, "-o", "ro")
}
cmd := exec.Command("mount", execArgs...)
cmd.Stderr = os.Stderr
klog.Infof("binding volume %v (%v) from %v to %v", volName, ctxVars["configPath"], stagingTargetPath, targetPath)
out, err := cmd.Output()
if (err != nil)
{
return nil, fmt.Errorf("Error running mount %v: %s", strings.Join(execArgs, " "), out)
}
return &csi.NodePublishVolumeResponse{}, nil
}
// NodeUnpublishVolume unmounts the volume from the target path
func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublishVolumeRequest) (*csi.NodeUnpublishVolumeResponse, error)
{
klog.Infof("received node unpublish volume request %+v", protosanitizer.StripSecrets(req))
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := ctxVars["name"]
ns.lockVolume(ctxVars["configPath"]+":"+volName)
defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
targetPath := req.GetTargetPath()
devicePath, refCount, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
devicePath, _, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(err))
@ -649,24 +545,14 @@ func (ns *NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpu
os.Remove(targetPath)
return &csi.NodeUnpublishVolumeResponse{}, nil
}
// unmount
err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
if (err != nil)
{
return nil, err
}
// unmap NBD device
if (refCount == 1)
{
if (!ns.useVduse)
{
ns.unmapNbd(devicePath)
}
else
{
ns.unmapVduse(devicePath)
}
}
return &csi.NodeUnpublishVolumeResponse{}, nil
}
@ -685,7 +571,17 @@ func (ns *NodeServer) NodeExpandVolume(ctx context.Context, req *csi.NodeExpandV
// NodeGetCapabilities returns the supported capabilities of the node server
func (ns *NodeServer) NodeGetCapabilities(ctx context.Context, req *csi.NodeGetCapabilitiesRequest) (*csi.NodeGetCapabilitiesResponse, error)
{
return &csi.NodeGetCapabilitiesResponse{}, nil
return &csi.NodeGetCapabilitiesResponse{
Capabilities: []*csi.NodeServiceCapability{
&csi.NodeServiceCapability{
Type: &csi.NodeServiceCapability_Rpc{
Rpc: &csi.NodeServiceCapability_RPC{
Type: csi.NodeServiceCapability_RPC_STAGE_UNSTAGE_VOLUME,
},
},
},
},
}, nil
}
// NodeGetInfo returns NodeGetInfoResponse for CO.

301
csi/src/utils.go Normal file
View File

@ -0,0 +1,301 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
package vitastor
import (
"errors"
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"syscall"
"k8s.io/klog"
)
func Contains(list []string, s string) bool
{
for i := 0; i < len(list); i++
{
if (list[i] == s)
{
return true
}
}
return false
}
func checkVduseSupport() bool
{
// Check VDUSE support (vdpa, vduse, virtio-vdpa kernel modules)
vduse := true
for _, mod := range []string{"vdpa", "vduse", "virtio-vdpa"}
{
_, err := os.Stat("/sys/module/"+mod)
if (err != nil)
{
if (!errors.Is(err, os.ErrNotExist))
{
klog.Errorf("failed to check /sys/module/%s: %v", mod, err)
}
c := exec.Command("/sbin/modprobe", mod)
c.Stdout = os.Stderr
c.Stderr = os.Stderr
err := c.Run()
if (err != nil)
{
klog.Errorf("/sbin/modprobe %s failed: %v", mod, err)
vduse = false
break
}
}
}
// Check that vdpa tool functions
if (vduse)
{
c := exec.Command("/sbin/vdpa", "-j", "dev")
c.Stderr = os.Stderr
err := c.Run()
if (err != nil)
{
klog.Errorf("/sbin/vdpa -j dev failed: %v", err)
vduse = false
}
}
if (!vduse)
{
klog.Errorf(
"Your host apparently has no VDUSE support. VDUSE support disabled, NBD will be used to map devices."+
" For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
)
}
return vduse
}
func mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, error)
{
// Map NBD device
// FIXME: Check if already mapped
args := []string{
"map", "--image", volName,
}
if (ctxVars["configPath"] != "")
{
args = append(args, "--config_path", ctxVars["configPath"])
}
if (readonly)
{
args = append(args, "--readonly", "1")
}
stdout, stderr, err := system("/usr/bin/vitastor-nbd", args...)
dev := strings.TrimSpace(string(stdout))
if (dev == "")
{
return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
}
return dev, err
}
func unmapNbd(devicePath string)
{
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
}
func findByPidFile(pidFile string) (*os.Process, error)
{
pidBuf, err := os.ReadFile(pidFile)
if (err != nil)
{
return nil, err
}
pid, err := strconv.ParseInt(strings.TrimSpace(string(pidBuf)), 0, 64)
if (err != nil)
{
return nil, err
}
proc, err := os.FindProcess(int(pid))
if (err != nil)
{
return nil, err
}
return proc, nil
}
func killByPidFile(pidFile string) error
{
klog.Infof("killing process with PID from file %s", pidFile)
proc, err := findByPidFile(pidFile)
if (err != nil)
{
return err
}
return proc.Signal(syscall.SIGTERM)
}
func startStorageDaemon(vdpaId, volName, pidFile, configPath string, readonly bool) error
{
// Start qemu-storage-daemon
blockSpec := map[string]interface{}{
"node-name": "disk1",
"driver": "vitastor",
"image": volName,
"cache": map[string]bool{
"direct": true,
"no-flush": false,
},
"discard": "unmap",
}
if (configPath != "")
{
blockSpec["config-path"] = configPath
}
blockSpecJson, _ := json.Marshal(blockSpec)
writable := "true"
if (readonly)
{
writable = "false"
}
_, _, err := system(
"/usr/bin/qemu-storage-daemon", "--daemonize", "--pidfile", pidFile, "--blockdev", string(blockSpecJson),
"--export", "vduse-blk,id="+vdpaId+",node-name=disk1,name="+vdpaId+",num-queues=16,queue-size=128,writable="+writable,
)
return err
}
func mapVduse(stateDir string, volName string, ctxVars map[string]string, readonly bool) (string, string, error)
{
// Generate state file
stateFd, err := os.CreateTemp(stateDir, "vitastor-vduse-*.json")
if (err != nil)
{
return "", "", err
}
stateFile := stateFd.Name()
stateFd.Close()
vdpaId := filepath.Base(stateFile)
vdpaId = vdpaId[0:len(vdpaId)-5] // remove ".json"
pidFile := stateDir + vdpaId + ".pid"
// Map VDUSE device via qemu-storage-daemon
err = startStorageDaemon(vdpaId, volName, pidFile, ctxVars["configPath"], readonly)
if (err == nil)
{
// Add device to VDPA bus
_, _, err = system("/sbin/vdpa", "-j", "dev", "add", "name", vdpaId, "mgmtdev", "vduse")
if (err == nil)
{
// Find block device name
var matches []string
matches, err = filepath.Glob("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/*")
if (err == nil && len(matches) == 0)
{
err = errors.New("/sys/bus/vdpa/devices/"+vdpaId+"/virtio*/block/* is not found")
}
if (err == nil)
{
blockdev := "/dev/"+filepath.Base(matches[0])
_, err = os.Stat(blockdev)
if (err == nil)
{
// Generate state file
stateJSON, _ := json.Marshal(&DeviceState{
ConfigPath: ctxVars["configPath"],
VdpaId: vdpaId,
Image: volName,
Blockdev: blockdev,
Readonly: readonly,
PidFile: pidFile,
})
err = os.WriteFile(stateFile, stateJSON, 0600)
if (err == nil)
{
return blockdev, vdpaId, nil
}
}
}
}
killErr := killByPidFile(pidFile)
if (killErr != nil)
{
klog.Errorf("Failed to kill started qemu-storage-daemon: %v", killErr)
}
os.Remove(stateFile)
os.Remove(pidFile)
}
return "", "", err
}
func unmapVduse(stateDir, devicePath string)
{
if (len(devicePath) < 6 || devicePath[0:6] != "/dev/v")
{
klog.Errorf("%s does not start with /dev/v", devicePath)
return
}
vduseDev, err := os.Readlink("/sys/block/"+devicePath[5:])
if (err != nil)
{
klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx): %v", devicePath, err)
return
}
vdpaId := ""
p := strings.Index(vduseDev, "/vduse/")
if (p >= 0)
{
vduseDev = vduseDev[p+7:]
p = strings.Index(vduseDev, "/")
if (p >= 0)
{
vdpaId = vduseDev[0:p]
}
}
if (vdpaId == "")
{
klog.Errorf("%s is not a symbolic link to VDUSE device (../devices/virtual/vduse/xxx), but is %v", devicePath, vduseDev)
return
}
unmapVduseById(stateDir, vdpaId)
}
func unmapVduseById(stateDir, vdpaId string)
{
_, err := os.Stat("/sys/bus/vdpa/devices/"+vdpaId)
if (err != nil)
{
klog.Errorf("failed to stat /sys/bus/vdpa/devices/"+vdpaId+": %v", err)
}
else
{
_, _, _ = system("/sbin/vdpa", "-j", "dev", "del", vdpaId)
}
stateFile := stateDir + vdpaId + ".json"
os.Remove(stateFile)
pidFile := stateDir + vdpaId + ".pid"
_, err = os.Stat(pidFile)
if (os.IsNotExist(err))
{
// ok, already killed
}
else if (err != nil)
{
klog.Errorf("Failed to stat %v: %v", pidFile, err)
return
}
else
{
err = killByPidFile(pidFile)
if (err != nil)
{
klog.Errorf("Failed to kill started qemu-storage-daemon: %v", err)
}
os.Remove(pidFile)
}
}

View File

@ -3,5 +3,5 @@
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
sudo podman build --build-arg DISTRO=debian --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile

View File

@ -3,5 +3,5 @@
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f Dockerfile .
sudo podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile

View File

@ -3,5 +3,5 @@
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg REL=buster -v `pwd`/packages:/root/packages -f Dockerfile .
sudo podman build --build-arg DISTRO=debian --build-arg REL=buster -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile

7
debian/build-vitastor-ubuntu-jammy.sh vendored Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg DISTRO=ubuntu --build-arg REL=jammy -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile

2
debian/changelog vendored
View File

@ -1,4 +1,4 @@
vitastor (1.4.7-1) unstable; urgency=medium
vitastor (1.6.1-1) unstable; urgency=medium
* Bugfixes

2
debian/control vendored
View File

@ -2,7 +2,7 @@ Source: vitastor
Section: admin
Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev
Standards-Version: 4.5.0
Homepage: https://vitastor.io/
Rules-Requires-Root: no

View File

@ -1,13 +1,14 @@
# Build patched libvirt for Debian Buster or Bullseye/Sid inside a container
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/libvirt.Dockerfile .
# cd ..; podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/libvirt.Dockerfile .
ARG DISTRO=
ARG REL=
FROM debian:$REL
FROM $DISTRO:$REL
ARG REL=
WORKDIR /root
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
RUN if ([ "${DISTRO}" = "debian" ]) && ( [ "${REL}" = "buster" -o "${REL}" = "bullseye" ] ); then \
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
@ -23,7 +24,7 @@ RUN apt-get -y build-dep libvirt0
RUN apt-get -y install libglusterfs-dev
RUN apt-get --download-only source libvirt
ADD patches/libvirt-5.0-vitastor.diff patches/libvirt-7.0-vitastor.diff patches/libvirt-7.5-vitastor.diff patches/libvirt-7.6-vitastor.diff /root
ADD patches/libvirt-5.0-vitastor.diff patches/libvirt-7.0-vitastor.diff patches/libvirt-7.5-vitastor.diff patches/libvirt-7.6-vitastor.diff patches/libvirt-8.0-vitastor.diff /root
RUN set -e; \
mkdir -p /root/packages/libvirt-$REL; \
rm -rf /root/packages/libvirt-$REL/*; \

View File

@ -3,4 +3,6 @@ usr/bin/vitastor-cli
usr/bin/vitastor-rm
usr/bin/vitastor-nbd
usr/bin/vitastor-nfs
usr/bin/vitastor-kv
usr/bin/vitastor-kv-stress
usr/lib/*/libvitastor*.so*

View File

@ -1,8 +1,10 @@
# Build Vitastor packages for Debian inside a container
# cd ..; podman build --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
# cd ..; podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
ARG DISTRO=debian
ARG REL=
FROM debian:$REL
FROM $DISTRO:$REL
ARG DISTRO=debian
ARG REL=
WORKDIR /root
@ -23,7 +25,7 @@ RUN apt-get update
RUN apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
RUN apt-get -y build-dep fio
RUN apt-get --download-only source fio
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get update && apt-get -y install libjerasure-dev cmake libibverbs-dev libisal-dev libnl-3-dev libnl-genl-3-dev
ADD . /root/vitastor
RUN set -e -x; \
@ -35,8 +37,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-1.4.7; \
cd vitastor-1.4.7; \
cp -r /root/vitastor vitastor-1.6.1; \
cd vitastor-1.6.1; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@ -49,8 +51,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.7.orig.tar.xz vitastor-1.4.7; \
cd vitastor-1.4.7; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.6.1.orig.tar.xz vitastor-1.6.1; \
cd vitastor-1.6.1; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@ -9,6 +9,9 @@
These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
affect their interaction with the cluster.
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
@ -18,6 +21,36 @@ affect their interaction with the cluster.
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
## client_retry_interval
- Type: milliseconds
- Default: 50
- Minimum: 10
- Can be changed online: yes
Retry time for I/O requests failed due to inactive PGs or network
connectivity errors.
## client_eio_retry_interval
- Type: milliseconds
- Default: 1000
- Can be changed online: yes
Retry time for I/O requests failed due to data corruption or unfinished
EC object deletions (the has_incomplete PG state). 0 disables such retries;
clients are then not blocked and simply receive the EIO error code instead.
## client_retry_enospc
- Type: boolean
- Default: true
- Can be changed online: yes
Retry writes on out of space errors to wait until some space is freed on
OSDs.
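
As a sketch (not part of the diff above), the three retry-related options could be combined
in the client JSON configuration like this; the file path /etc/vitastor/vitastor.conf and the
chosen values are assumptions for illustration only:

```
{
  "client_retry_interval": 50,
  "client_eio_retry_interval": 0,
  "client_retry_enospc": true
}
```

Here EIO retries are disabled entirely while out-of-space retries stay enabled.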
## client_max_dirty_bytes
@ -135,3 +168,18 @@ Maximum number of NBD devices in the system. This value is passed as
Maximum number of partitions per NBD device. This value is passed as
`max_part` parameter for the nbd kernel module when vitastor-nbd autoloads it.
Note that (nbds_max)*(1+max_part) usually can't exceed 256.
## osd_nearfull_ratio
- Type: number
- Default: 0.95
- Can be changed online: yes
Ratio of used space on OSD to treat it as "almost full" in vitastor-cli status output.
Remember that some client writes may hang or complete with an error if even
just one OSD becomes 100 % full!
However, unlike in Ceph, 100 % full Vitastor OSDs don't crash (in Ceph they're
unable to start at all), so you'll be able to recover from "out of space" errors
without destroying and recreating OSDs.

View File

@ -9,6 +9,9 @@
These parameters apply only to Vitastor clients (QEMU, fio, NBD and so on) and
affect the logic of their interaction with the cluster.
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
@ -18,6 +21,37 @@
- [nbd_timeout](#nbd_timeout)
- [nbd_max_devices](#nbd_max_devices)
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
## client_retry_interval
- Type: milliseconds
- Default: 50
- Minimum: 10
- Can be changed online: yes
Retry time for I/O requests that failed due to inactive PGs or
network errors.
## client_eio_retry_interval
- Type: milliseconds
- Default: 1000
- Can be changed online: yes
Retry time for I/O requests that failed due to data corruption
or unfinished EC object deletions (the has_incomplete PG state).
0 disables such retries; clients are then not blocked and simply
receive the EIO error code instead.
## client_retry_enospc
- Type: boolean (yes/no)
- Default: true
- Can be changed online: yes
Retry write requests that failed with out-of-space errors, i.e.
wait until some space is freed on the OSDs.
## client_max_dirty_bytes
@ -135,3 +169,20 @@
Maximum number of partitions per NBD device. This value is passed as the
`max_part` parameter to the nbd kernel module when vitastor-nbd loads it.
Note that (nbds_max)*(1+max_part) usually can't exceed 256.
## osd_nearfull_ratio
- Type: number
- Default: 0.95
- Can be changed online: yes
Ratio of used space on an OSD at which it is treated as "nearly full" in
the vitastor-cli status output.
Remember that some client writes may hang or complete with an error if even
just 1 OSD becomes 100% full!
However, unlike in Ceph, 100% full Vitastor OSDs don't crash (in Ceph they
can't even start), so you will be able to restore cluster operation after
out-of-space errors without destroying and recreating OSDs.

View File

@ -15,6 +15,7 @@ These parameters only apply to Monitors.
- [mon_stats_timeout](#mon_stats_timeout)
- [osd_out_time](#osd_out_time)
- [placement_levels](#placement_levels)
- [use_old_pg_combinator](#use_old_pg_combinator)
## etcd_mon_ttl
@ -77,3 +78,11 @@ values. Smaller priority means higher level in tree. For example,
levels are always predefined and can't be removed. If one of them is not
present in the configuration, then it is defined with the default priority
(100 for "host", 101 for "osd").
## use_old_pg_combinator
- Type: boolean
- Default: false
Use the old PG combination generator which doesn't support [level_placement](pool.en.md#level_placement)
and [raw_placement](pool.en.md#raw_placement) for pools which don't use these features.

View File

@ -15,6 +15,7 @@
- [mon_stats_timeout](#mon_stats_timeout)
- [osd_out_time](#osd_out_time)
- [placement_levels](#placement_levels)
- [use_old_pg_combinator](#use_old_pg_combinator)
## etcd_mon_ttl
@ -78,3 +79,11 @@ OSD перед обновлением агрегированной статис
"host" и "osd" являются предопределёнными и не могут быть удалены. Если
один из них отсутствует в конфигурации, он доопределяется с приоритетом по
умолчанию (100 для уровня "host", 101 для "osd").
## use_old_pg_combinator
- Type: boolean (yes/no)
- Default: false
Use the old PG combination generator which doesn't support [level_placement](pool.ru.md#level_placement)
and [raw_placement](pool.ru.md#raw_placement) for pools which don't use these features.

View File

@ -25,12 +25,11 @@ between clients, OSDs and etcd.
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
- [osd_ping_timeout](#osd_ping_timeout)
- [up_wait_retry_interval](#up_wait_retry_interval)
- [max_etcd_attempts](#max_etcd_attempts)
- [etcd_quick_timeout](#etcd_quick_timeout)
- [etcd_slow_timeout](#etcd_slow_timeout)
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
## tcp_header_buffer_size
@ -212,17 +211,6 @@ Maximum time to wait for OSD keepalive responses. If an OSD doesn't respond
within this time, the connection to it is dropped and a reconnection attempt
is scheduled.
## up_wait_retry_interval
- Type: milliseconds
- Default: 50
- Minimum: 10
- Can be changed online: yes
OSDs respond to clients with a special error code when they receive I/O
requests for a PG that's not synchronized and started. This parameter sets
the time for the clients to wait before re-attempting such I/O requests.
## max_etcd_attempts
- Type: integer
@ -257,7 +245,7 @@ Timeout for etcd requests which are allowed to wait for some time.
Timeout for etcd connection HTTP Keep-Alive. Should be higher than
etcd_report_interval to guarantee that keepalive actually works.
## etcd_ws_keepalive_timeout
## etcd_ws_keepalive_interval
- Type: seconds
- Default: 30

View File

@ -25,12 +25,11 @@
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
- [osd_ping_timeout](#osd_ping_timeout)
- [up_wait_retry_interval](#up_wait_retry_interval)
- [max_etcd_attempts](#max_etcd_attempts)
- [etcd_quick_timeout](#etcd_quick_timeout)
- [etcd_slow_timeout](#etcd_slow_timeout)
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [etcd_ws_keepalive_interval](#etcd_ws_keepalive_interval)
## tcp_header_buffer_size
@ -221,19 +220,6 @@ OSD в любом случае согласовывают реальное зн
If an OSD doesn't respond within this time, the connection is dropped and
a reconnection attempt is made.
## up_wait_retry_interval
- Type: milliseconds
- Default: 50
- Minimum: 10
- Can be changed online: yes
When OSDs receive I/O requests from clients for PGs that are not currently
up on them or are still synchronizing, they respond with a special error
code which means that the client should wait for some time before retrying
the request. This parameter sets that wait time.
## max_etcd_attempts
- Type: integer
@ -270,7 +256,7 @@ OSD в любом случае согласовывают реальное зн
Timeout for HTTP Keep-Alive in etcd connections. Should be higher than
etcd_report_interval to guarantee that keepalive actually works.
## etcd_ws_keepalive_timeout
## etcd_ws_keepalive_interval
- Type: seconds
- Default: 30

View File

@ -32,6 +32,8 @@ Parameters:
- [pg_minsize](#pg_minsize)
- [pg_count](#pg_count)
- [failure_domain](#failure_domain)
- [level_placement](#level_placement)
- [raw_placement](#raw_placement)
- [max_osd_combinations](#max_osd_combinations)
- [block_size](#block_size)
- [bitmap_granularity](#bitmap_granularity)
@ -41,6 +43,7 @@ Parameters:
- [osd_tags](#osd_tags)
- [primary_affinity_tags](#primary_affinity_tags)
- [scrub_interval](#scrub_interval)
- [used_for_fs](#used_for_fs)
Examples:
@ -83,7 +86,11 @@ Parent node reference is required for intermediate tree nodes.
Separate OSD settings are set in etcd keys `/vitastor/config/osd/<number>`
in JSON format `{"<key>":<value>}`.
As of now, two settings are supported:
As of now, the following settings are supported:
- [reweight](#reweight)
- [tags](#tags)
- [noout](#noout)
## reweight
@ -106,6 +113,14 @@ subsets and then use a specific subset for pool instead of all OSDs.
For example you can mark SSD OSDs with tag "ssd" and HDD OSDs with "hdd" and
such tags will work as device classes.
## noout
- Type: boolean
- Default: false
If set to true, [osd_out_time](monitor.en.md#osd_out_time) is ignored for this
OSD and it's never removed from data distribution by the monitor.
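
For illustration only (a sketch; the OSD number and values are hypothetical), the JSON value
stored at such an etcd key, e.g. `/vitastor/config/osd/7`, could combine all three supported
settings like this:

```
{
  "reweight": 0.5,
  "tags": ["ssd"],
  "noout": true
}
```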
# Pool parameters
## name
@ -154,8 +169,25 @@ That is, if it becomes impossible to place PG data on at least (pg_minsize)
OSDs, PG is deactivated for both read and write. So you know that a fresh
write always goes to at least (pg_minsize) OSDs (disks).
That is, pg_size minus pg_minsize sets the number of disk failures to tolerate
without temporary downtime (for [osd_out_time](monitor.en.md#osd_out_time)).
For example, the difference between pg_minsize 2 and 1 in a 3-way replicated
pool (pg_size=3) is:
- If 2 hosts go down with pg_minsize=2, the pool becomes inactive and remains
inactive for [osd_out_time](monitor.en.md#osd_out_time) (10 minutes). After
this timeout, the monitor selects replacement hosts/OSDs and the pool comes
up and starts to heal. Therefore, if you don't have replacement OSDs, i.e.
if you only have 3 hosts with OSDs and 2 of them are down, the pool remains
inactive until you add or return at least 1 host (or change failure_domain
to "osd").
- If 2 hosts go down with pg_minsize=1, the pool only experiences a short
I/O pause until the monitor notices that OSDs are down (5-10 seconds with
the default [etcd_report_interval](osd.en.md#etcd_report_interval)). After
this pause, I/O resumes, but new data is temporarily written in only 1 copy.
Then, after osd_out_time, the monitor also selects replacement OSDs and the
pool starts to heal.
So, pg_minsize regulates the number of failures that a pool can tolerate
without temporary downtime for [osd_out_time](monitor.en.md#osd_out_time),
but at a cost of slightly reduced storage reliability.
FIXME: pg_minsize behaviour may be changed in the future to only make PGs
read-only instead of deactivating them.
@ -168,8 +200,8 @@ read-only instead of deactivating them.
Number of PGs for this pool. The value should be big enough for the monitor /
LP solver to be able to optimize data placement.
"Enough" is usually around 64-128 PGs per OSD, i.e. you set pg_count for pool
to (total OSD count * 100 / pg_size). You can round it to the closest power of 2,
"Enough" is usually around 10-100 PGs per OSD, i.e. you set pg_count for pool
to (total OSD count * 10 / pg_size). You can round it to the closest power of 2,
because it makes it easier to reduce or increase PG count later by dividing or
multiplying it by 2.
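
For example, with 24 OSDs and pg_size=3 the rule gives 24*10/3 = 80, rounded to the
nearest power of 2. A sketch (pool name is illustrative):

```
# 24 OSDs, pg_size 3: 24*10/3 = 80 -> round to 64
vitastor-cli create-pool testpool -s 3 -n 64
```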
@ -191,6 +223,69 @@ never put on OSDs in the same failure domain (for example, on the same host).
So failure domain specifies the unit which failure you are protecting yourself
from.
## level_placement
- Type: string
Additional failure domain rules, applied in conjunction with failure_domain.
Must be specified in the following form:
`<placement level>=<sequence of characters>, <level2>=<sequence2>, ...`
The sequence should be exactly [pg_size](#pg_size) characters long. Each character
corresponds to an OSD in the PG of this pool. Equal characters mean that
corresponding items of the PG should be placed into the same placement tree
item at this level. Different characters mean that items should be placed into
different items.
For example, if you want an EC 4+2 pool and you want every 2 chunks to be stored
in its own datacenter and you also want each chunk to be stored on a different
host, you should set `level_placement` to `dc=112233 host=123456`.
Or you can set `level_placement` to `dc=112233` and leave `failure_domain` empty,
because `host` is the default `failure_domain` and it will be applied anyway.
Without this rule, it may happen that 3 chunks of one PG are stored on OSDs in the
same datacenter, in which case the data will become inaccessible if that datacenter
goes down.
Of course, you should group your hosts into datacenters before applying the rule
by setting [placement_levels](monitor.en.md#placement_levels) to something like
`{"dc":90,"host":100,"osd":110}` and add DCs to [node_placement](#placement-tree),
like `{"dc1":{"level":"dc"},"host1":{"parent":"dc1"},...}`.
## raw_placement
- Type: string
Raw PG placement rules, specified in the form of a DSL (domain-specific language).
Use only if you really know what you're doing :)
DSL specification:
```
dsl := item | item ("\n" | ",") items
item := "any" | rules
rules := rule | rule rules
rule := level operator arg
level := /\w+/
operator := "!=" | "=" | ">" | "?="
arg := value | "(" values ")"
values := value | value "," values
value := item_ref | constant_id
item_ref := /\d+/
constant_id := /"([^"]+)"/
```
"?=" operator means "preferred". I.e. `dc ?= "meow"` means "prefer datacenter meow
for this chunk, but put into another dc if it's unavailable".
Examples:
- Simple 3 replicas with failure_domain=host: `any, host!=1, host!=(1,2)`
- EC 4+2 in 3 DC: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
- 1 replica in fixed DC + 2 in random DCs: `dc?=meow, dc!=1, dc!=(1,2)`
## max_osd_combinations
- Type: integer
@ -282,6 +377,25 @@ of the OSDs containing a data chunk for a PG.
Automatic scrubbing interval for this pool. Overrides
[global scrub_interval setting](osd.en.md#scrub_interval).
## used_for_fs
- Type: string
If non-empty, the pool is marked as used for VitastorFS, with metadata stored
in a block image (regular Vitastor volume) whose name is the value of this parameter.
When a pool is marked as used for VitastorFS, regular block volume creation in it
is disabled (vitastor-cli refuses to create images without --force) to protect
the user from block volume and FS file ID collisions and data loss.
[vitastor-nfs](../usage/nfs.en.md), in turn, refuses to use pools not marked
for the corresponding FS when starting. This also implies that you can use one
pool only for one VitastorFS.
Per-inode space usage statistics in etcd are also disabled for VitastorFS pools,
because an FS pool may store a very large number of files and statistics for all
of them would take a lot of space in etcd.
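
For example, a sketch of creating a pool reserved for VitastorFS with metadata in the
image "fsmeta" (names are illustrative):

```
vitastor-cli create-pool fspool -s 3 -n 128 --used_for_fs fsmeta
```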
# Examples
## Replicated pool

View File

@ -31,6 +31,8 @@
- [pg_minsize](#pg_minsize)
- [pg_count](#pg_count)
- [failure_domain](#failure_domain)
- [level_placement](#level_placement)
- [raw_placement](#raw_placement)
- [max_osd_combinations](#max_osd_combinations)
- [block_size](#block_size)
- [bitmap_granularity](#bitmap_granularity)
@ -40,6 +42,7 @@
- [osd_tags](#osd_tags)
- [primary_affinity_tags](#primary_affinity_tags)
- [scrub_interval](#scrub_interval)
- [used_for_fs](#used_for_fs)
Примеры:
@ -82,10 +85,11 @@
Настройки отдельных OSD задаются в ключах etcd `/vitastor/config/osd/<number>`
в JSON-формате `{"<key>":<value>}`.
На данный момент поддерживаются две настройки:
На данный момент поддерживаются следующие настройки:
- [reweight](#reweight)
- [tags](#tags)
- [noout](#noout)
## reweight
@ -109,6 +113,14 @@
всех. Можно, например, пометить SSD OSD тегом "ssd", а HDD тегом "hdd", в
этом смысле теги работают аналогично классам устройств.
## noout
- Тип: булево (да/нет)
- Значение по умолчанию: false
Если установлено в true, то [osd_out_time](monitor.ru.md#osd_out_time) для этого
OSD игнорируется и OSD не удаляется из распределения данных монитором.
# Параметры
## name
@ -157,9 +169,25 @@
OSD, PG деактивируется на чтение и запись. Иными словами, всегда известно,
что новые блоки данных всегда записываются как минимум на pg_minsize дисков.
По сути, разница pg_size и pg_minsize задаёт число отказов дисков, которые пул
может пережить без временной (на [osd_out_time](monitor.ru.md#osd_out_time))
остановки обслуживания.
Для примера, разница между pg_minsize 2 и 1 в реплицированном пуле с 3 копиями
данных (pg_size=3), проявляется следующим образом:
- Если 2 сервера отключаются при pg_minsize=2, пул становится неактивным и
остаётся неактивным в течение [osd_out_time](monitor.ru.md#osd_out_time)
(10 минут), после чего монитор назначает другие OSD/серверы на замену, пул
поднимается и начинает восстанавливать недостающие копии данных. Соответственно,
если OSD на замену нет - то есть, если у вас всего 3 сервера с OSD и 2 из них
недоступны - пул так и остаётся недоступным до тех пор, пока вы не вернёте
или не добавите хотя бы 1 сервер (или не переключите failure_domain на "osd").
- Если 2 сервера отключаются при pg_minsize=1, ввод-вывод лишь приостанавливается
на короткое время, до тех пор, пока монитор не поймёт, что OSD отключены
(что занимает 5-10 секунд при стандартном [etcd_report_interval](osd.ru.md#etcd_report_interval)).
После этого ввод-вывод восстанавливается, но новые данные временно пишутся
всего в 1 копии. Когда же проходит osd_out_time, монитор точно так же назначает
другие OSD на замену выбывшим и пул начинает восстанавливать копии данных.
То есть, pg_minsize регулирует число отказов, которые пул может пережить без
временной остановки обслуживания на [osd_out_time](monitor.ru.md#osd_out_time),
но ценой немного пониженных гарантий надёжности.
FIXME: Поведение pg_minsize может быть изменено в будущем с полной деактивации
PG на перевод их в режим только для чтения.
@ -172,8 +200,8 @@ PG на перевод их в режим только для чтения.
Число PG для данного пула. Число должно быть достаточно большим, чтобы монитор
мог равномерно распределить по ним данные.
Обычно это означает примерно 64-128 PG на 1 OSD, т.е. pg_count можно устанавливать
равным (общему числу OSD * 100 / pg_size). Значение можно округлить до ближайшей
Обычно это означает примерно 10-100 PG на 1 OSD, т.е. pg_count можно устанавливать
равным (общему числу OSD * 10 / pg_size). Значение можно округлить до ближайшей
степени 2, чтобы потом было легче уменьшать или увеличивать число PG, умножая
или деля его на 2.
@ -194,6 +222,71 @@ PG в Vitastor эферемерны, то есть вы можете менят
Иными словами, домен отказа - это то, от отказа чего вы защищаете себя избыточным
хранением.
## level_placement
- Тип: строка
Правила дополнительных доменов отказа, применяемые вместе с failure_domain.
Должны задаваться в следующем виде:
`<уровень>=<последовательность символов>, <уровень2>=<последовательность2>, ...`
Каждая `<последовательность>` должна состоять ровно из [pg_size](#pg_size) символов.
Каждый символ соответствует одному OSD (размещению одной части PG) этого пула.
Одинаковые символы означают, что соответствующие части размещаются в один и тот же
узел дерева OSD на заданном `<уровне>`. Разные символы означают, что части
размещаются в разные узлы.
Например, если вы хотите сделать пул EC 4+2 и хотите поместить каждые 2 части
данных в свой датацентр, и также вы хотите, чтобы каждая часть размещалась на
другом хосте, то вы должны задать `level_placement` равным `dc=112233 host=123456`.
Либо вы просто можете задать `level_placement` равным `dc=112233` и оставить
`failure_domain` пустым, т.к. `host` это его значение по умолчанию и оно также
применится автоматически.
Без этого правила может получиться так, что в одном из датацентров окажется
3 части данных одной PG и данные окажутся недоступными при временном отключении
этого датацентра.
Естественно, перед установкой правила вам нужно сгруппировать ваши хосты в
датацентры, установив [placement_levels](monitor.ru.md#placement_levels) во что-то
типа `{"dc":90,"host":100,"osd":110}` и добавив датацентры в [node_placement](#дерево-размещения),
примерно так: `{"dc1":{"level":"dc"},"host1":{"parent":"dc1"},...}`.
## raw_placement
- Тип: строка
Низкоуровневые правила генерации PG в форме DSL (доменно-специфичного языка).
Используйте, только если действительно знаете, зачем вам это надо :)
Спецификация DSL:
```
dsl := item | item ("\n" | ",") items
item := "any" | rules
rules := rule | rule rules
rule := level operator arg
level := /\w+/
operator := "!=" | "=" | ">" | "?="
arg := value | "(" values ")"
values := value | value "," values
value := item_ref | constant_id
item_ref := /\d+/
constant_id := /"([^"]+)"/
```
Оператор "?=" означает "предпочитаемый". Т.е. `dc ?= "meow"` означает "предпочитать
датацентр meow для этой части данных, но разместить её в другом датацентре, если
meow недоступен".
Примеры:
- Простые 3 реплики с failure_domain=host: `any, host!=1, host!=(1,2)`
- EC 4+2 в 3 датацентрах: `any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5`
- 1 копия в фиксированном ДЦ + 2 в других ДЦ: `dc?=meow, dc!=1, dc!=(1,2)`
## max_osd_combinations
- Тип: целое число
@ -290,6 +383,27 @@ OSD с "all".
Интервал скраба, то есть, автоматической фоновой проверки данных для данного пула.
Переопределяет [глобальную настройку scrub_interval](osd.ru.md#scrub_interval).
## used_for_fs
- Тип: строка
Если непусто, пул помечается как используемый для файловой системы VitastorFS с
метаданными, хранимыми в блочном образе Vitastor с именем, равным значению
этого параметра.
Когда пул помечается как используемый для VitastorFS, создание обычных блочных
образов в нём отключается (vitastor-cli отказывается создавать образы без --force),
чтобы защитить пользователя от коллизий ID файлов и блочных образов и, таким
образом, от потери данных.
[vitastor-nfs](../usage/nfs.ru.md), в свою очередь, при запуске отказывается
использовать для ФС пулы, не выделенные для неё. Это также означает, что один
пул может использоваться только для одной VitastorFS.
Также для ФС-пулов отключается передача статистики в etcd по отдельным инодам,
так как ФС-пул может содержать очень много файлов и статистика по ним всем
заняла бы очень много места в etcd.
# Примеры
## Реплицированный пул

View File

@ -1,3 +1,37 @@
- name: client_retry_interval
type: ms
min: 10
default: 50
online: true
info: |
Retry time for I/O requests failed due to inactive PGs or network
connectivity errors.
info_ru: |
Время повтора запросов ввода-вывода, неудачных из-за неактивных PG или
ошибок сети.
- name: client_eio_retry_interval
type: ms
default: 1000
online: true
info: |
Retry time for I/O requests failed due to data corruption or unfinished
EC object deletions (has_incomplete PG state). 0 disables such retries;
clients are not blocked and just get the EIO error code instead.
info_ru: |
Время повтора запросов ввода-вывода, неудачных из-за повреждения данных
или незавершённых удалений EC-объектов (состояния PG has_incomplete).
0 отключает повторы таких запросов и клиенты не блокируются, а вместо
этого просто получают код ошибки EIO.
- name: client_retry_enospc
type: bool
default: true
online: true
info: |
Retry writes on out of space errors to wait until some space is freed on
OSDs.
info_ru: |
Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
ожидать, пока на OSD не освободится место.
- name: client_max_dirty_bytes
type: int
default: 33554432
@ -166,3 +200,27 @@
Максимальное число разделов на одном NBD-устройстве. Данное значение передаётся
модулю ядра nbd как параметр `max_part`, когда его загружает vitastor-nbd.
Имейте в виду, что (nbds_max)*(1+max_part) обычно не может превышать 256.
- name: osd_nearfull_ratio
type: float
default: 0.95
online: true
info: |
Ratio of used space on OSD to treat it as "almost full" in vitastor-cli status output.
Remember that some client writes may hang or complete with an error if even
just one OSD becomes 100 % full!
However, unlike in Ceph, 100 % full Vitastor OSDs don't crash (in Ceph they're
unable to start at all), so you'll be able to recover from "out of space" errors
without destroying and recreating OSDs.
info_ru: |
Доля занятого места на OSD, начиная с которой он считается "почти заполненным" в
выводе vitastor-cli status.
Помните, что часть клиентских запросов может зависнуть или завершиться с ошибкой,
если на 100 % заполнится хотя бы 1 OSD!
Однако, в отличие от Ceph, заполненные на 100 % OSD Vitastor не падают (в Ceph
заполненные на 100% OSD вообще не могут стартовать), так что вы сможете
восстановить работу кластера после ошибок отсутствия свободного места
без уничтожения и пересоздания OSD.

View File

@ -56,6 +56,8 @@
{{../../usage/nfs.en.md}}
{{../../usage/admin.en.md}}
## Performance
{{../../performance/understanding.en.md}}
@ -64,4 +66,6 @@
{{../../performance/comparison1.en.md}}
{{../../performance/bench2.en.md}}
{{../../intro/author.en.md|indent=1}}

View File

@ -56,6 +56,8 @@
{{../../usage/nfs.ru.md}}
{{../../usage/admin.ru.md}}
## Производительность
{{../../performance/understanding.ru.md}}
@ -64,4 +66,6 @@
{{../../performance/comparison1.ru.md}}
{{../../performance/bench2.ru.md}}
{{../../intro/author.ru.md|indent=1}}

View File

@ -63,3 +63,12 @@
"host" и "osd" являются предопределёнными и не могут быть удалены. Если
один из них отсутствует в конфигурации, он доопределяется с приоритетом по
умолчанию (100 для уровня "host", 101 для "osd").
- name: use_old_pg_combinator
type: bool
default: false
info: |
Use the old PG combination generator, which doesn't support [level_placement](pool.en.md#level_placement)
and [raw_placement](pool.en.md#raw_placement), for pools which don't use these features.
info_ru: |
Использовать старый генератор комбинаций PG, не поддерживающий [level_placement](pool.ru.md#level_placement)
и [raw_placement](pool.ru.md#raw_placement) для пулов, которые не используют данные функции.

View File

@ -243,21 +243,6 @@
Максимальное время ожидания ответа на запрос проверки состояния соединения.
Если OSD не отвечает за это время, соединение отключается и производится
повторная попытка соединения.
- name: up_wait_retry_interval
type: ms
min: 10
default: 50
online: true
info: |
OSDs respond to clients with a special error code when they receive I/O
requests for a PG that's not synchronized and started. This parameter sets
the time for the clients to wait before re-attempting such I/O requests.
info_ru: |
Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не
поднятым на данный момент на них PG, либо к PG в процессе синхронизации,
они отвечают клиентам специальным кодом ошибки, означающим, что клиент
должен некоторое время подождать перед повторением запроса. Именно это время
ожидания задаёт данный параметр.
- name: max_etcd_attempts
type: int
default: 5
@ -295,7 +280,7 @@
info_ru: |
Таймаут для HTTP Keep-Alive в соединениях к etcd. Должен быть больше, чем
etcd_report_interval, чтобы keepalive гарантированно работал.
- name: etcd_ws_keepalive_timeout
- name: etcd_ws_keepalive_interval
type: sec
default: 30
online: true

View File

@ -13,7 +13,7 @@
## Server-side features
- Basic part: highly-available block storage with symmetric clustering and no SPOF
- [Performance](../performance/comparison1.en.md) ;-D
- [Performance](../performance/bench2.en.md) ;-D
- [Multiple redundancy schemes](../config/pool.en.md#scheme): Replication, XOR n+1, Reed-Solomon erasure codes
based on jerasure and ISA-L libraries with any number of data and parity drives in a group
- Configuration via simple JSON data structures in etcd (parameters, pools and images)
@ -33,6 +33,7 @@
- [Checksums](../config/layout-osd.en.md#data_csum_type)
- [Client write-back cache](../config/client.en.md#client_enable_writeback)
- [Intelligent recovery auto-tuning](../config/osd.en.md#recovery_tune_interval)
- [Clustered file system](../usage/nfs.en.md#vitastorfs)
## Plugins and tools
@ -46,13 +47,12 @@
- [CSI plugin for Kubernetes](../installation/kubernetes.en.md)
- [OpenStack support: Cinder driver, Nova and libvirt patches](../installation/openstack.en.md)
- [Proxmox storage plugin and packages](../installation/proxmox.en.md)
- [Simplified NFS proxy for file-based image access emulation (suitable for VMWare)](../usage/nfs.en.md)
- [Simplified NFS proxy for file-based image access emulation (suitable for VMWare)](../usage/nfs.en.md#pseudo-fs)
## Roadmap
The following features are planned for the future:
- File system
- Control plane optimisation
- Other administrative tools
- Web GUI

View File

@ -13,7 +13,7 @@
## Серверные функции
- Базовая часть - надёжное кластерное блочное хранилище без единой точки отказа
- [Производительность](../performance/comparison1.ru.md) ;-D
- [Производительность](../performance/bench2.ru.md) ;-D
- [Несколько схем отказоустойчивости](../config/pool.ru.md#scheme): репликация, XOR n+1 (1 диск чётности), коды коррекции ошибок
Рида-Соломона на основе библиотек jerasure и ISA-L с любым числом дисков данных и чётности в группе
- Конфигурация через простые человекочитаемые JSON-структуры в etcd
@ -35,6 +35,7 @@
- [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
- [Интеллектуальная автоподстройка скорости восстановления](../config/osd.ru.md#recovery_tune_interval)
- [Кластерная файловая система](../usage/nfs.ru.md#vitastorfs)
## Драйверы и инструменты
@ -48,11 +49,10 @@
- [CSI-плагин для Kubernetes](../installation/kubernetes.ru.md)
- [Базовая поддержка OpenStack: драйвер Cinder, патчи для Nova и libvirt](../installation/openstack.ru.md)
- [Плагин для Proxmox](../installation/proxmox.ru.md)
- [Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)](../usage/nfs.ru.md)
- [Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)](../usage/nfs.ru.md#псевдо-фс)
## Планы развития
- Файловая система
- Оптимизация слоя управления
- Другие инструменты администрирования
- Web-интерфейс

View File

@ -14,6 +14,7 @@
- [Check cluster status](#check-cluster-status)
- [Create an image](#create-an-image)
- [Install plugins](#install-plugins)
- [Create VitastorFS](#create-vitastorfs)
## Preparation
@ -75,18 +76,16 @@ On the monitor hosts:
## Create a pool
Create pool configuration in etcd:
Create a pool using vitastor-cli:
```
etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
"scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'
vitastor-cli create-pool testpool --pg_size 2 --pg_count 256
```
For EC pools the configuration should look like the following:
```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
vitastor-cli create-pool testpool --ec 2+2 --pg_count 256
```
After you do this, one of the monitors will configure PGs and OSDs will start them.
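
To check that the pool has actually come up, you can look at the cluster state, for example:

```
vitastor-cli status
vitastor-cli ls-pools
```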
@ -116,3 +115,9 @@ After that, you can [run benchmarks](../usage/fio.en.md) or [start QEMU manually
- [Proxmox](../installation/proxmox.en.md)
- [OpenStack](../installation/openstack.en.md)
- [Kubernetes CSI](../installation/kubernetes.en.md)
## Create VitastorFS
If you want to use a clustered file system in addition to VM or container images:
- [Follow the instructions here](../usage/nfs.en.md#vitastorfs)

View File

@ -14,6 +14,7 @@
- [Проверьте состояние кластера](#проверьте-состояние-кластера)
- [Создайте образ](#создайте-образ)
- [Установите плагины](#установите-плагины)
- [Создайте VitastorFS](#создайте-vitastorfs)
## Подготовка
@ -77,18 +78,16 @@
## Создайте пул
Создайте конфигурацию пула с помощью etcdctl:
Создайте пул с помощью vitastor-cli:
```
etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool",
"scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}'
vitastor-cli create-pool testpool --pg_size 2 --pg_count 256
```
Для пулов с кодами коррекции ошибок конфигурация должна выглядеть примерно так:
```
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"ec","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
vitastor-cli create-pool testpool --ec 2+2 --pg_count 256
```
После этого один из мониторов должен сконфигурировать PG, а OSD должны запустить их.
@ -118,3 +117,10 @@ vitastor-cli create -s 10G testimg
- [Proxmox](../installation/proxmox.ru.md)
- [OpenStack](../installation/openstack.ru.md)
- [Kubernetes CSI](../installation/kubernetes.ru.md)
## Создайте VitastorFS
Если вы хотите использовать не только блочные образы виртуальных машин или контейнеров,
а также кластерную файловую систему, то:
- [Следуйте инструкциям](../usage/nfs.ru.md#vitastorfs)

View File

@ -0,0 +1,154 @@
[Documentation](../../README.md#documentation) → Performance → Newer benchmark of Vitastor 1.3.1
-----
[Читать на русском](bench2.ru.md)
# Newer benchmark of Vitastor 1.3.1
- [Test environment](#test-environment)
- [Notes](#notes)
- [Raw drive performance](#raw-drive-performance)
- [2 replicas](#2-replicas)
- [3 replicas](#3-replicas)
- [EC 2+1](#ec-2-1)
## Test environment
Hardware configuration: 3 nodes, each with:
- 8x NVMe Samsung PM9A3 1.92 TB
- 2x Xeon Gold 6342 (24 cores @ 2.8 GHz)
- 256 GB RAM
- Dual-port 25 GbE Mellanox ConnectX-4 LX network card with RoCEv2
- Connected to 2 Mellanox SN2010 switches with MLAG
## Notes
Vitastor version was 1.3.1.
Tests were run from the storage nodes, with 4 fio clients on each of the 3 nodes.
The same large 3 TB image was tested from all hosts because Vitastor has no
performance penalties related to running multiple clients over a single inode.
CPU power saving was disabled. 4 OSDs were created per each NVMe.
Checksums were not enabled. Tests with checksums will be conducted later,
along with the newer version of Vitastor, and results will be updated.
CPU configuration was not optimal because of NUMA. It's better to avoid 2-socket
platforms. It was especially noticeable in RDMA tests - in the form of ksoftirqd
processes (usually 1 per server) eating 100 % of one CPU core and the actual bandwidth
of one network port being reduced to 3-5 Gbit/s instead of 25 Gbit/s, probably because
of RFS (Receive Flow Steering) misses. Many network configurations were tried during
tests, but nothing helped to solve this problem, so final tests were conducted with
the default settings.
## Raw drive performance
- Linear write ~1000-2000 MB/s, depending on current state of the drive's garbage collector
- Linear read ~3300 MB/s
- T1Q1 random write ~60000 iops (latency ~0.015ms)
- T1Q1 random read ~14700 iops (latency ~0.066ms)
- T1Q16 random write ~180000 iops
- T1Q16 random read ~120000 iops
- T1Q32 random write ~180000 iops
- T1Q32 random read ~195000 iops
- T1Q128 random write ~180000 iops
- T1Q128 random read ~195000 iops
- T4Q128 random write ~525000 iops
- T4Q128 random read ~750000 iops
These numbers make it obvious that results could be much better if a faster network
was available, because the NVMe drives clearly weren't the bottleneck. For example,
the theoretical maximum linear read performance of 24 such drives is 79.2 GByte/s,
which is 633 Gbit/s. Real Vitastor read speed (both linear and random) was around
16 GByte/s, which is 130 Gbit/s. It's important to note that it was still much
larger than the network bandwidth of one server (50 Gbit/s). This is expected,
because tests were conducted from all 3 nodes.
## 2 replicas
| | TCP | RDMA |
|------------------------------|--------------|--------------|
| Linear read (4M T6 Q16) | 13.13 GB/s | 16.25 GB/s |
| Linear write (4M T6 Q16) | 8.16 GB/s | 7.88 GB/s |
| Read 4k T1 Q1 | 8745 iops | 10252 iops |
| Write 4k T1 Q1 | 8097 iops | 11488 iops |
| Read 4k T12 Q128 | 1305936 iops | 4265861 iops |
| Write 4k T12 Q128 | 660490 iops | 1384033 iops |
CPU consumption OSD per 1 disk:
| | TCP | RDMA |
|------------------------------|---------|---------|
| Linear read (4M T6 Q16) | 29.7 % | 29.8 % |
| Linear write (4M T6 Q16) | 84.4 % | 33.2 % |
| Read 4k T12 Q128 | 98.4 % | 119.1 % |
| Write 4k T12 Q128 | 173.4 % | 175.9 % |
CPU consumption per 1 client (fio):
| | TCP | RDMA |
|------------------------------|--------|--------|
| Linear read (4M T6 Q16) | 100 % | 85.2 % |
| Linear write (4M T6 Q16) | 55.8 % | 48.8 % |
| Read 4k T12 Q128 | 99.9 % | 96 % |
| Write 4k T12 Q128 | 71.6 % | 48.5 % |
## 3 replicas
| | TCP | RDMA |
|------------------------------|--------------|--------------|
| Linear read (4M T6 Q16) | 13.98 GB/s | 16.54 GB/s |
| Linear write (4M T6 Q16) | 5.38 GB/s | 5.7 GB/s |
| Read 4k T1 Q1 | 8969 iops | 9980 iops |
| Write 4k T1 Q1 | 8126 iops | 11672 iops |
| Read 4k T12 Q128 | 1358818 iops | 4279088 iops |
| Write 4k T12 Q128 | 433890 iops | 993506 iops |
CPU consumption OSD per 1 disk:
| | TCP | RDMA |
|------------------------------|--------|---------|
| Linear read (4M T6 Q16) | 24.9 % | 25.4 % |
| Linear write (4M T6 Q16) | 99.3 % | 38.4 % |
| Read 4k T12 Q128 | 95.3 % | 111.7 % |
| Write 4k T12 Q128 | 173 % | 194 % |
CPU consumption per 1 client (fio):
| | TCP | RDMA |
|------------------------------|--------|--------|
| Linear read (4M T6 Q16) | 99.9 % | 85.8 % |
| Linear write (4M T6 Q16) | 38.9 % | 38.1 % |
| Read 4k T12 Q128 | 100 % | 96.1 % |
| Write 4k T12 Q128 | 51.6 % | 41.9 % |
## EC 2+1
| | TCP | RDMA |
|------------------------------|--------------|--------------|
| Linear read (4M T6 Q16) | 10.07 GB/s | 11.43 GB/s |
| Linear write (4M T6 Q16) | 7.74 GB/s | 8.32 GB/s |
| Read 4k T1 Q1 | 7408 iops | 8891 iops |
| Write 4k T1 Q1 | 3525 iops | 4903 iops |
| Read 4k T12 Q128 | 1216496 iops | 2552765 iops |
| Write 4k T12 Q128 | 278110 iops | 821261 iops |
CPU consumption OSD per 1 disk:
| | TCP | RDMA |
|------------------------------|---------|---------|
| Linear read (4M T6 Q16) | 68.6 % | 33.6 % |
| Linear write (4M T6 Q16) | 108.3 % | 50.2 % |
| Read 4k T12 Q128 | 138.1 % | 97.9 % |
| Write 4k T12 Q128 | 168.7 % | 188.5 % |
CPU consumption per 1 client (fio):
| | TCP | RDMA |
|------------------------------|--------|--------|
| Linear read (4M T6 Q16) | 88.2 % | 52.4 % |
| Linear write (4M T6 Q16) | 51.8 % | 46.8 % |
| Read 4k T12 Q128 | 99.7 % | 61.3 % |
| Write 4k T12 Q128 | 35.1 % | 31.3 % |

View File

@ -0,0 +1,157 @@
[Документация](../../README-ru.md#документация) → Производительность → Более новый тест Vitastor 1.3.1
-----
[Read in English](bench2.en.md)
# Более новый тест Vitastor 1.3.1
- [Описание стенда](#описание-стенда)
- [Примечания](#примечания)
- [Производительность голых дисков](#производительность-голых-дисков)
- [2 реплики](#2-реплики)
- [3 реплики](#3-реплики)
- [EC 2+1](#ec-2-1)
## Описание стенда
Железо: 3 сервера, в каждом:
- 8x NVMe Samsung PM9A3 1.92 TB
- 2x Xeon Gold 6342 (24 ядра @ 2.8 GHz)
- 256 GB RAM
- Двухпортовая 25 GbE сетевая карта Mellanox ConnectX-4 LX с поддержкой RoCEv2
- Подключение к 2 коммутаторам Mellanox SN2010 в MLAG
## Примечания
Версия Vitastor 1.3.1.
Тесты проводились с самих серверов хранения - по 4 клиента fio с каждого из 3 серверов.
Тестировался один большой образ размером 3 ТБ со всех хостов - создавать отдельные образы
для тестов в Vitastor необязательно, т.к. в Vitastor нет замедления при записи в один
узел несколькими клиентами.
Экономия энергии CPU отключена. На каждый NVMe создавалось 4 OSD.
Контрольные суммы не включались. Тесты с контрольными суммами будут проведены
позднее. Тогда же будет протестирована более новая версия Vitastor, и результаты
будут обновлены.
Конфигурация CPU стенда неоптимальна из-за NUMA - двухпроцессорных серверов лучше
избегать. Особенно это проявлялось во время тестов с RDMA - выражалось это в потреблении
100% одного ядра CPU одним процессом ksoftirqd и работой одного из двух сетевых портов
на скорости 3-5 ГБит/с вместо 25 ГБит/с - предположительно, из-за "непопаданий" RFS
(Receive Flow Steering) на нужные ядра. Решить эту проблему во время проведения тестов
не получилось. Было перепробовано множество различных настроек, но в итоге тесты проведены
с настройками по умолчанию, т.к. улучшения добиться не удалось.
## Производительность голых дисков
- Линейная запись ~1000-2000 МБ/с, в зависимости от состояния сборщика мусора диска
- Линейное чтение ~3300 МБ/с
- T1Q1 запись ~60000 iops (задержка ~0.015ms)
- T1Q1 чтение ~14700 iops (задержка ~0.066ms)
- T1Q16 запись ~180000 iops
- T1Q16 чтение ~120000 iops
- T1Q32 запись ~180000 iops
- T1Q32 чтение ~195000 iops
- T1Q128 запись ~180000 iops
- T1Q128 чтение ~195000 iops
- T4Q128 запись ~525000 iops
- T4Q128 чтение ~750000 iops
Из данных цифр очевидно, что при наличии более быстрой сети результаты были бы
значительно лучше, так как в диски тест, очевидно, не упирался. Например, теоретический предел по
линейному чтению для 24 таких дисков был бы около 79.2 ГБайт/с, то есть,
633 гигабита в секунду. Реальная скорость чтения (и случайного, и линейного)
Vitastor составила примерно 16 ГБайт/с, то есть 130 гигабит в секунду. При этом
следует заметить, что этот результат всё равно значительно лучше пропускной способности
сети отдельно взятого узла - что тоже вполне логично, так как тест выполнялся со
всех трёх узлов.
## 2 реплики
| | TCP | RDMA |
|------------------------------|--------------|--------------|
| Линейное чтение (4M T6 Q16) | 13.13 ГБ/с | 16.25 ГБ/с |
| Линейная запись (4M T6 Q16) | 8.16 ГБ/с | 7.88 ГБ/с |
| Чтение 4k T1 Q1 | 8745 iops | 10252 iops |
| Запись 4k T1 Q1 | 8097 iops | 11488 iops |
| Чтение 4k T12 Q128 | 1305936 iops | 4265861 iops |
| Запись 4k T12 Q128 | 660490 iops | 1384033 iops |
Потребление CPU OSD на 1 диск:
| | TCP | RDMA |
|------------------------------|---------|---------|
| Линейное чтение (4M T6 Q16) | 29.7 % | 29.8 % |
| Линейная запись (4M T6 Q16) | 84.4 % | 33.2 % |
| Чтение 4k T12 Q128 | 98.4 % | 119.1 % |
| Запись 4k T12 Q128 | 173.4 % | 175.9 % |
Потребление CPU на 1 клиента (fio):
| | TCP | RDMA |
|------------------------------|--------|--------|
| Линейное чтение (4M T6 Q16) | 100 % | 85.2 % |
| Линейная запись (4M T6 Q16) | 55.8 % | 48.8 % |
| Чтение 4k T12 Q128 | 99.9 % | 96 % |
| Запись 4k T12 Q128 | 71.6 % | 48.5 % |
## 3 реплики
| | TCP | RDMA |
|------------------------------|--------------|--------------|
| Линейное чтение (4M T6 Q16) | 13.98 ГБ/с | 16.54 ГБ/с |
| Линейная запись (4M T6 Q16) | 5.38 ГБ/с | 5.7 ГБ/с |
| Чтение 4k T1 Q1 | 8969 iops | 9980 iops |
| Запись 4k T1 Q1 | 8126 iops | 11672 iops |
| Чтение 4k T12 Q128 | 1358818 iops | 4279088 iops |
| Запись 4k T12 Q128 | 433890 iops | 993506 iops |
Потребление CPU OSD на 1 диск:
| | TCP | RDMA |
|------------------------------|--------|---------|
| Линейное чтение (4M T6 Q16) | 24.9 % | 25.4 % |
| Линейная запись (4M T6 Q16) | 99.3 % | 38.4 % |
| Чтение 4k T12 Q128 | 95.3 % | 111.7 % |
| Запись 4k T12 Q128 | 173 % | 194 % |
Потребление CPU на 1 клиента (fio):
| | TCP | RDMA |
|------------------------------|--------|--------|
| Линейное чтение (4M T6 Q16) | 99.9 % | 85.8 % |
| Линейная запись (4M T6 Q16) | 38.9 % | 38.1 % |
| Чтение 4k T12 Q128 | 100 % | 96.1 % |
| Запись 4k T12 Q128 | 51.6 % | 41.9 % |
## EC 2+1
| | TCP | RDMA |
|------------------------------|--------------|--------------|
| Линейное чтение (4M T6 Q16) | 10.07 ГБ/с | 11.43 ГБ/с |
| Линейная запись (4M T6 Q16) | 7.74 ГБ/с | 8.32 ГБ/с |
| Чтение 4k T1 Q1 | 7408 iops | 8891 iops |
| Запись 4k T1 Q1 | 3525 iops | 4903 iops |
| Чтение 4k T12 Q128 | 1216496 iops | 2552765 iops |
| Запись 4k T12 Q128 | 278110 iops | 821261 iops |
Потребление CPU OSD на 1 диск:
| | TCP | RDMA |
|------------------------------|---------|---------|
| Линейное чтение (4M T6 Q16) | 68.6 % | 33.6 % |
| Линейная запись (4M T6 Q16) | 108.3 % | 50.2 % |
| Чтение 4k T12 Q128 | 138.1 % | 97.9 % |
| Запись 4k T12 Q128 | 168.7 % | 188.5 % |
Потребление CPU на 1 клиента (fio):
| | TCP | RDMA |
|------------------------------|--------|--------|
| Линейное чтение (4M T6 Q16) | 88.2 % | 52.4 % |
| Линейная запись (4M T6 Q16) | 51.8 % | 46.8 % |
| Чтение 4k T12 Q128 | 99.7 % | 61.3 % |
| Запись 4k T12 Q128 | 35.1 % | 31.3 % |

docs/usage/admin.en.md Normal file
View File

@ -0,0 +1,215 @@
[Documentation](../../README.md#documentation) → Usage → Administration
-----
[Читать на русском](admin.ru.md)
# Administration
- [Pool states](#pool-states)
- [PG states](#pg-states)
- [Base PG states](#base-pg-states)
- [Additional PG states](#additional-pg-states)
- [Removing a healthy disk](#removing-a-healthy-disk)
- [Removing a failed disk](#removing-a-failed-disk)
- [Adding a disk](#adding-a-disk)
- [Restoring from lost pool configuration](#restoring-from-lost-pool-configuration)
- [Upgrading Vitastor](#upgrading-vitastor)
- [OSD memory usage](#osd-memory-usage)
## Pool states
A pool is active, that is, fully available for client input/output, when all of its PGs
are 'active' (possibly with some additional state flags).
If at least 1 PG is inactive, the pool is also inactive, and all clients suspend their I/O
and wait until you fix the cluster. :-)
## PG states
PG states may be seen in [vitastor-cli status](cli.en.md#status) output.
PG state consists of exactly 1 base state and an arbitrary number of additional states.
### Base PG states
PG state always includes exactly 1 of the following base states:
- **active** — PG is active and handles user I/O.
- **incomplete** — Not enough OSDs are available to activate this PG. That is, more disks
are lost than allowed by the pool's redundancy scheme. For example, if the pool has
pg_size=3 and pg_minsize=1, part of the data may be written to only 1 OSD. If that exact
OSD is lost, the PG becomes **incomplete**.
- **offline** — PG isn't activated by any OSD at all. Either primary OSD isn't set for
this PG at all (if the pool is just created), or an unavailable OSD is set as primary,
or the primary OSD refuses to start this PG (for example, because of wrong block_size),
or the PG is stopped by the monitor using `pause: true` flag in `/vitastor/config/pgs` in etcd.
- **starting** — primary OSD has acquired PG lock in etcd, PG is starting.
- **peering** — primary OSD requests PG object listings from secondary OSDs and calculates
the PG state.
- **repeering** — PG is waiting for current I/O operations to complete and will
then transition to **peering**.
- **stopping** — PG is waiting for current I/O operations to complete and will
then transition to **offline** or be activated by another OSD.
All states except **active** mean that the PG is inactive and client I/O is suspended.
**peering** state is normally visible only for a short period of time during OSD restarts
and during switching primary OSD of PGs.
**starting**, **repeering** and **stopping** states are normally almost never visible at all.
If you notice them for any significant time, chances are that some operations on some OSDs are hung.
Search for "slow op" in OSD logs to find them — operations hung for more than
[slow_log_interval](../config/osd.en.md#slow_log_interval) are logged as "slow ops".
State transition diagram:
![PG state transitions](pg_states.svg "PG state transitions")
### Additional PG states
If a PG is active it can also have any number of the following additional states:
- **degraded** — PG is running on reduced number of drives (OSDs), redundancy of all
objects in this PG is reduced.
- **has_incomplete** — some objects in this PG are incomplete (unrecoverable), that is,
they have too many lost EC parts (more than pool's [parity_chunks](../config/pool.en.md#parity_chunks)).
- **has_degraded** — some objects in this PG have reduced redundancy
compared to the rest of the PG (so PG can be degraded+has_degraded at the same time).
These objects should be healed automatically by recovery process, unless
it's disabled by [no_recovery](../config/osd.en.md#no_recovery).
- **has_misplaced** — some objects in this PG are stored on an OSD set different from
the target set of the PG. These objects should be moved automatically, unless
rebalance is disabled by [no_rebalance](../config/osd.en.md#no_rebalance). Objects
that are degraded and misplaced at the same time are treated as just degraded.
- **has_unclean** — one more state normally noticeable only for a very short time during
PG activation. It's used only with EC pools and means that some objects of this PG
have started but not finished modifications. All such objects are either quickly
committed or rolled back by the primary OSD when starting the PG, that is why the
state shouldn't be noticeable. If you notice it, it probably means that commit or
rollback operations are hung.
- **has_invalid** — PG contains objects with incorrect part ID. Never occurs normally.
It can only occur if you delete a non-empty EC pool and then recreate it as a replica
pool or with smaller data part count.
- **has_corrupted** — PG has corrupted objects, discovered by checking checksums during
read or during scrub. When possible, such objects should be recovered automatically.
If objects remain corrupted, use [vitastor-cli describe](cli.en.md#describe) to find
out details and/or look into the log of the primary OSD of the PG.
- **has_inconsistent** — PG has objects with non-matching parts or copies on different OSDs,
and it's impossible to determine which copy is correct automatically. It may happen
if you use a pool with 2 replicas and don't enable checksums, and data on one
of the replicas becomes corrupted. You should also use vitastor-cli [describe](cli.en.md#describe)
and [fix](cli.en.md#fix) commands to remove the incorrect version in this case.
- **left_on_dead** — part of the data of this PG is left on an unavailable OSD that isn't
fully removed from the cluster. You should either start the corresponding OSD back and
let it remove the unneeded data or remove it from cluster using vitastor-cli
[rm-osd](cli.en.md#rm-osd) if you know that it's gone forever (for example, if the disk died).
- **scrubbing** — data [scrub](../config/osd.en.md#auto_scrub) is running for this PG.
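
For example, problem objects can be located with [vitastor-cli describe](cli.en.md#describe)
(a sketch; pool name and PG number are illustrative):

```
# list degraded objects of pool "testpool"
vitastor-cli describe --pool testpool --object-state degraded
# inspect objects of one specific PG of that pool
vitastor-cli describe --pool testpool --pg 5
```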
## Removing a healthy disk
Before removing a healthy disk from the cluster, set its OSD weight(s) to 0 to
move data away. To do that, add `"reweight":0` to etcd key `/vitastor/config/osd/<OSD_NUMBER>`.
For example:
```
etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
```
Then wait until rebalance finishes and remove OSD by running `vitastor-disk purge /dev/vitastor/osdN-data`.
## Removing a failed disk
If a disk is already dead, its OSD(s) are likely already stopped.
In this case just remove OSD(s) from the cluster by running `vitastor-cli rm-osd OSD_NUMBER`.
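
For example (the OSD number is illustrative; the command refuses to remove OSDs that still
hold data unless forced, see [rm-osd](cli.en.md#rm-osd)):

```
vitastor-cli rm-osd 5
```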
## Adding a disk
If you're adding a server, first install Vitastor packages and copy the
`/etc/vitastor/vitastor.conf` configuration file to it.
After that you can just run `vitastor-disk prepare /dev/nvmeXXX`, of course with
the same parameters which you used for other OSDs in your cluster before.
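
For example, a sketch assuming the new drive is /dev/nvme2n1 and default OSD parameters
are used:

```
vitastor-disk prepare /dev/nvme2n1
```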
## Restoring from lost pool configuration
If you remove or corrupt `/vitastor/config/pools` key in etcd all pools will
be deleted. Don't worry, the data won't be lost, but you'll need to perform
a specific recovery procedure.
First you need to restore the previous configuration of the pool with the same ID
and EC/replica parameters and wait until the pool's PGs appear in `vitastor-cli status`.
Then add all OSDs into the history records of all PGs. You can do it by running
the following script (just don't forget to use your own PG_COUNT and POOL_ID):
```
PG_COUNT=32
POOL_ID=1
ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
for i in $(seq 1 $PG_COUNT); do
    etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
done
```
After that all PGs should peer and find all previous data.
## Upgrading Vitastor
Every new Vitastor version is usually compatible with previous versions, both forward
and backward, with regard to the network protocol and etcd data structures.
So, by default, if this page doesn't contain explicit different instructions, you
can upgrade your Vitastor cluster by simply upgrading packages and restarting all
OSDs and monitors in any order.
Upgrading is performed without stopping clients (VMs/containers); you just need to
upgrade and restart servers one by one. However, ideally you should restart VMs too
to make them use the new version of the client library.
Exceptions (specific upgrade instructions):
- Upgrading <= 1.1.x to 1.2.0 or later, if you use EC n+k with k>=2, is recommended
to be performed with full downtime: first you should stop all clients, then all OSDs,
then upgrade and start everything back — because versions before 1.2.0 have several
bugs leading to invalid data being read in EC n+k, k>=2 configurations in degraded pools.
- Versions <= 0.8.7 are incompatible with versions >= 0.9.0, so you should first
upgrade from <= 0.8.7 to 0.8.8 or 0.8.9, and only then to >= 0.9.x. If you upgrade
without this intermediate step, client I/O will hang until the end of upgrade process.
- Upgrading from <= 0.5.x to >= 0.6.x is not supported.
Rollback:
- Version 1.0.0 has a new disk format, so OSDs initialized on 1.0.0 can't be rolled
back to 0.9.x or previous versions.
- Versions before 0.8.0 don't have vitastor-disk, so OSDs initialized by it won't
start with 0.7.x or 0.6.x. :-)
## OSD memory usage
OSD uses RAM mainly for:
- Metadata index: `data_size`/[`block_size`](../config/layout-cluster.en.md#block_size) * `approximately 1.1` * `32` bytes.
Consumed always.
- Copy of the on-disk metadata area: `data_size`/[`block_size`](../config/layout-cluster.en.md#block_size) * `28` bytes.
Consumed if [inmemory_metadata](../config/osd.en.md#inmemory_metadata) isn't disabled.
- Bitmaps: `data_size`/[`bitmap_granularity`](../config/layout-cluster.en.md#bitmap_granularity)/`8` * `2` bytes.
Consumed always.
- Journal index: between 0 and, approximately, journal size. Consumed always.
- Copy of the on-disk journal area: exactly journal size. Consumed if
[inmemory_journal](../config/osd.en.md#inmemory_journal) isn't disabled.
- Checksums: `data_size`/[`csum_block_size`](../config/osd.en.md#csum_block_size) * 4 bytes.
Consumed if checksums are enabled and [inmemory_metadata](../config/osd.en.md#inmemory_metadata) isn't disabled.
bitmap_granularity is almost always 4 KB.
So with default SSD settings (block_size=128k, journal_size=32M, csum_block_size=4k) memory usage is:
- Metadata and bitmaps: ~600 MB per 1 TB of data.
- Journal: up to 64 MB per 1 OSD.
- Checksums: 1 GB per 1 TB of data.
With default HDD settings (block_size=1M, journal_size=128M, csum_block_size=32k):
- Metadata and bitmaps: ~128 MB per 1 TB of data.
- Journal: up to 256 MB per 1 OSD.
- Checksums: 128 MB per 1 TB of data.

docs/usage/admin.ru.md Normal file
View File

@ -0,0 +1,211 @@
[Документация](../../README-ru.md#документация) → Использование → Администрирование
-----
[Read in English](admin.en.md)
# Администрирование
- [Состояния пулов](#состояния-пулов)
- [Состояния PG](#состояния-pg)
- [Базовые состояния PG](#базовые-состояния-pg)
- [Дополнительные состояния PG](#дополнительные-состояния-pg)
- [Удаление исправного диска](#удаление-исправного-диска)
- [Удаление неисправного диска](#удаление-неисправного-диска)
- [Добавление диска](#добавление-диска)
- [Восстановление потерянной конфигурации пулов](#восстановление-потерянной-конфигурации-пулов)
- [Обновление Vitastor](#обновление-vitastor)
- [Потребление памяти OSD](#потребление-памяти-osd)
## Состояния пулов
Пул активен — то есть, полностью доступен для клиентского ввода-вывода — когда все его PG
активны, то есть, имеют статус active, возможно, с любым набором дополнительных флагов.
Если хотя бы 1 PG неактивна, пул неактивен и все клиенты зависают и ждут, пока вы почините
кластер. :-)
## Состояния PG
Вы можете видеть состояния PG в выводе команды [vitastor-cli status](cli.ru.md#status).
Состояние PG состоит из ровно 1 базового флага состояния, плюс любого числа дополнительных.
### Базовые состояния PG
Состояние PG включает в себя ровно 1 флаг из следующих:
- **active** — PG активна и обрабатывает запросы ввода-вывода от пользователей.
- **incomplete** — Недостаточно живых OSD, чтобы включить эту PG.
То есть, дисков потеряно больше, чем разрешено схемой отказоустойчивости пула и pg_minsize.
Например, если у пула pg_size=3 и pg_minsize=1, то часть данных может записаться всего на 1 OSD.
Если потом конкретно этот OSD упадёт, PG окажется **incomplete**.
- **offline** — PG вообще не активирована ни одним OSD. Либо первичный OSD не назначен вообще
(если пул только создан), либо в качестве первичного назначен недоступный OSD, либо
назначенный OSD отказывается запускать эту PG (например, из-за несовпадения block_size),
либо PG остановлена монитором через флаг `pause: true` в `/vitastor/config/pgs` в etcd.
- **starting** — первичный OSD захватил блокировку PG в etcd, PG запускается.
- **peering** — первичный OSD опрашивает вторичные OSD на предмет списков объектов данной PG и рассчитывает её состояние.
- **repeering** — PG ожидает завершения текущих операций ввода-вывода, после чего перейдёт в состояние **peering**.
- **stopping** — PG ожидает завершения текущих операций ввода-вывода, после чего перейдёт в состояние **offline** или поднимется на другом OSD.
Все состояния, кроме **active**, означают, что PG неактивна и ввод-вывод приостановлен.
Состояние **peering** в норме заметно только при перезапуске OSD или переключении первичных
OSD, на протяжении небольшого периода времени.
Состояния **starting**, **repeering**, **stopping** в норме практически не заметны вообще,
PG должны очень быстро переходить из них в другие. Если эти состояния заметны
хоть сколько-то значительное время — вероятно, какие-то операции на каких-то OSD зависли.
Чтобы найти их, ищите "slow op" в журналах OSD — операции, зависшие дольше,
чем на [slow_log_interval](../config/osd.ru.md#slow_log_interval), записываются в
журналы OSD как "slow op".
Диаграмма переходов:
![Диаграмма переходов](pg_states.svg "Диаграмма переходов")
### Дополнительные состояния PG
Если PG активна, она также может иметь любое число дополнительных флагов состояний:
- **degraded** — PG поднята на неполном числе дисков (OSD), избыточность хранения всех объектов снижена.
- **has_incomplete** — часть объектов в PG неполные (невосстановимые), то есть, у них потеряно
слишком много EC-частей (больше, чем [parity_chunks](../config/pool.ru.md#parity_chunks) пула).
- **has_degraded** — часть объектов в PG деградированы, избыточность их хранения снижена по сравнению
с остальным содержимым данной PG (то есть, PG может одновременно быть degraded+has_degraded).
Данные объекты должны восстановиться автоматически, если только восстановление не отключено
через [no_recovery](../config/osd.ru.md#no_recovery).
- **has_misplaced** — часть объектов в PG сейчас расположена не на целевом наборе OSD этой PG.
Данные объекты должны переместиться автоматически, если только перебалансировка не отключена
через [no_rebalance](../config/osd.ru.md#no_rebalance). Объекты, являющиеся одновременно
degraded и misplaced, считаются просто degraded.
- **has_unclean** — ещё одно состояние, в норме заметное только очень короткое время при поднятии PG.
Применяется только к EC и означает, что на каких-то OSD этой PG есть EC-части объектов, для которых
был начат, но не завершён процесс записи. Все такие объекты первичный OSD либо завершает, либо
откатывает при поднятии PG первым делом, поэтому состояние и не должно быть заметно. Опять-таки,
если оно заметно — значит, скорее всего, операции отката или завершения записи на каких-то OSD зависли.
- **has_invalid** — в PG найдены объекты с некорректными ID части. В норме не проявляется вообще
никогда, проявляется только если, не удалив данные, создать на месте EC-пула либо реплика-пул,
либо EC-пул с меньшим числом частей данных.
- **has_corrupted** — в PG есть повреждённые объекты, обнаруженные с помощью контрольных сумм или
скраба (сверки копий). Если объекты можно восстановить, они восстановятся автоматически. Если
не восстанавливаются, используйте команду [vitastor-cli describe](cli.ru.md#describe) для
выяснения деталей и/или смотрите в журнал первичного OSD данной PG.
- **has_inconsistent** — в PG есть объекты, у которых не совпадают копии/части данных на разных OSD,
и при этом автоматически определить, какая копия верная, а какая нет, невозможно. Такое может
произойти, если вы используете 2 реплики, не включали контрольные суммы, и на одной из реплик
данные повредились. В этом случае тоже надо использовать команды vitastor-cli [describe](cli.ru.md#describe)
и [fix](cli.ru.md#fix) для удаления некорректной версии.
- **left_on_dead** — часть данных PG осталась на отключённом, но не удалённом из кластера окончательно,
OSD. Вам нужно либо вернуть соответствующий OSD в строй и дать ему очистить лишние данные, либо
удалить его из кластера окончательно с помощью vitastor-cli [rm-osd](cli.ru.md#rm-osd), если
известно, что он уже не вернётся (например, если умер диск).
- **scrubbing** — идёт фоновая проверка данных PG ([скраб](../config/osd.ru.md#auto_scrub)).
## Удаление исправного диска
Перед удалением исправного диска из кластера установите его OSD вес в 0, чтобы убрать с него данные.
Для этого добавьте в ключ `/vitastor/config/osd/<НОМЕР_OSD>` в etcd значение `"reweight":0`, например:
```
etcdctl --endpoints=http://1.1.1.1:2379/v3 put /vitastor/config/osd/1 '{"reweight":0}'
```
Дождитесь завершения ребаланса, после чего удалите OSD командой `vitastor-disk purge /dev/vitastor/osdN-data`.
## Удаление неисправного диска
Если диск уже умер, его OSD, скорее всего, уже будет/будут остановлен(ы).
В этом случае просто удалите OSD из etcd командой `vitastor-cli rm-osd НОМЕР_OSD`.
## Добавление диска
Если сервер новый, установите на него пакеты Vitastor и скопируйте файл конфигурации
`/etc/vitastor/vitastor.conf`.
После этого достаточно выполнить команду `vitastor-disk prepare /dev/nvmeXXX`, разумеется,
с параметрами, аналогичными другим OSD в вашем кластере.
## Восстановление потерянной конфигурации пулов
Если удалить или повредить ключ `/vitastor/config/pools` в etcd, все пулы будут удалены.
Не волнуйтесь, данные потеряны не будут, но вам нужно будет провести специальную
процедуру восстановления.
Сначала нужно будет восстановить конфигурацию пулов, создав пул с таким же ID и
с такими же параметрами EC/реплик, и подождать, пока PG пула появятся в `vitastor-cli status`.
Далее нужно будет добавить все OSD в исторические записи всех PG. Примерно так
(только подставьте свои PG_COUNT и POOL_ID):
```
PG_COUNT=32
POOL_ID=1
ALL_OSDS=$(etcdctl --endpoints=your_etcd_address:2379 get --keys-only --prefix /vitastor/osd/stats/ | \
perl -e '$/ = undef; $a = <>; $a =~ s/\s*$//; $a =~ s!/vitastor/osd/stats/!!g; $a =~ s/\s+/,/g; print $a')
for i in $(seq 1 $PG_COUNT); do
    etcdctl --endpoints=your_etcd_address:2379 put /vitastor/pg/history/$POOL_ID/$i '{"all_peers":['$ALL_OSDS']}'
done
```
После этого все PG должны пройти peering и найти все предыдущие данные.
## Обновление Vitastor
Обычно каждая следующая версия Vitastor совместима с предыдущими и "вперёд", и "назад"
с точки зрения сетевого протокола и структур данных в etcd.
Так что по умолчанию, если на данной странице не указано обратное, считается, что для
обновления достаточно обновить пакеты и перезапустить все OSD и мониторы Vitastor в
произвольном порядке.
Обновление производится без остановки клиентов (виртуальных машин/контейнеров), для этого
достаточно обновлять серверы по одному. Однако, конечно, чтобы запущенные виртуальные машины
начали использовать новую версию клиентской библиотеки, их тоже нужно перезапустить.
Исключения (особые указания при обновлении):
- Обновляться с версий <= 1.1.x до версий >= 1.2.0, если вы используете EC n+k и k>=2,
рекомендуется с временной остановкой кластера — сначала нужно остановить всех клиентов,
потом все OSD, потом обновить и запустить всё обратно — из-за нескольких багов, которые
могли приводить к некорректному чтению данных в деградированных EC-пулах.
- Версии <= 0.8.7 несовместимы с версиями >= 0.9.0, поэтому при обновлении с <= 0.8.7
нужно сначала обновиться до 0.8.8 или 0.8.9, а уже потом до любых версий >= 0.9.x.
Иначе клиентский ввод-вывод зависнет до завершения обновления.
- Обновление с версий 0.5.x и более ранних до 0.6.x и более поздних не поддерживается.
Откат:
- В версии 1.0.0 поменялся дисковый формат, поэтому OSD, созданные на версии >= 1.0.0,
нельзя откатить до версии 0.9.x и более ранних.
- В версиях ранее 0.8.0 нет vitastor-disk, значит, созданные им OSD нельзя откатить
до 0.7.x или 0.6.x. :-)
## Потребление памяти OSD
Основное потребление памяти складывается из:
- Индекс метаданных: `размер данных`/[`block_size`](../config/layout-cluster.ru.md#block_size) * `примерно 1.1` * `32` байт.
Потребляется всегда.
- Копия дисковой области метаданных: `размер данных`/[`block_size`](../config/layout-cluster.ru.md#block_size) * `28` байт.
Потребляется, если не отключена настройка [inmemory_metadata](../config/osd.ru.md#inmemory_metadata).
- Битмапы: `размер данных`/[`bitmap_granularity`](../config/layout-cluster.ru.md#bitmap_granularity)/`8` * `2` байт.
Потребляется всегда.
- Индекс журнала: от 0 до, приблизительно, размера журнала. Потребляется всегда.
- Копия дисковой области журнала: в точности размер журнала. Потребляется,
если не отключена настройка [inmemory_journal](../config/osd.ru.md#inmemory_journal).
- Контрольные суммы: `размер данных`/[`csum_block_size`](../config/osd.ru.md#csum_block_size) * `4` байт.
Потребляется, если включены контрольные суммы и не отключена настройка [inmemory_metadata](../config/osd.ru.md#inmemory_metadata).
bitmap_granularity, как правило, никогда не меняется и равен 4 килобайтам.
Таким образом, при SSD-настройках по умолчанию (block_size=128k, journal_size=32M, csum_block_size=4k) потребляется:
- Метаданные и битмапы: ~600 МБ на 1 ТБ данных
- Журнал: до 64 МБ на 1 OSD
- Контрольные суммы: 1 ГБ на 1 ТБ данных
При HDD-настройках по умолчанию (block_size=1M, journal_size=128M, csum_block_size=32k):
- Метаданные и битмапы: ~128 МБ на 1 ТБ данных
- Журнал: до 256 МБ на 1 OSD
- Контрольные суммы: 128 МБ на 1 ТБ данных

View File

@ -24,6 +24,10 @@ It supports the following commands:
- [fix](#fix)
- [alloc-osd](#alloc-osd)
- [rm-osd](#rm-osd)
- [create-pool](#create-pool)
- [modify-pool](#modify-pool)
- [ls-pools](#ls-pools)
- [rm-pool](#rm-pool)
Global options:
@ -131,19 +135,18 @@ See also about [how to export snapshots](qemu.en.md#exporting-snapshots).
## modify
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force] [--down-ok]`
Rename or resize an image, or change its readonly status. Images with children can't be made read-write.
If the new size is smaller than the old size, extra data will be purged.
You should resize the file system in the image, if present, before shrinking it.
```
-f|--force Proceed with shrinking or setting readwrite flag even if the image has children.
```
* `-f|--force` - Proceed with shrinking or setting readwrite flag even if the image has children.
* `--down-ok` - Proceed with shrinking even if some data will be left on unavailable OSDs.
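
For example, a hypothetical shrink of an image while some OSDs holding its data are
unavailable (image name and size are illustrative):

```
vitastor-cli modify testimg --resize 5G --down-ok
```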
## rm
`vitastor-cli rm <from> [<to>] [--writers-stopped]`
`vitastor-cli rm <from> [<to>] [--writers-stopped] [--down-ok]`
Remove `<from>` or all layers between `<from>` and `<to>` (`<to>` must be a child of `<from>`),
rebasing all their children accordingly. --writers-stopped allows merging to be a bit
@ -151,6 +154,10 @@ more effective in case of a single 'slim' read-write child and 'fat' removed par
the child is merged into parent and parent is renamed to child in that case.
In other cases parent layers are always merged into children.
Other options:
* `--down-ok` - Continue deletion/merging even if some data will be left on unavailable OSDs.
## flatten
`vitastor-cli flatten <layer>`
@ -179,11 +186,9 @@ Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`
## describe
`vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>]
[--inode <ino>] [--min-inode <ino>] [--max-inode <ino>]
[--min-offset <offset>] [--max-offset <offset>]`
`vitastor-cli describe [OPTIONS]`
Describe unclean object locations in the cluster.
Describe unclean object locations in the cluster. Options:
```
--osds <osds>
@ -193,6 +198,8 @@ Describe unclean object locations in the cluster.
degraded, misplaced, incomplete, corrupted, inconsistent.
--pool <pool name or number>
Only list objects in the given pool.
--pg <pg number>
Only list objects in the given PG of the pool.
--inode, --min-inode, --max-inode
Restrict listing to specific inode numbers.
--min-offset, --max-offset
@ -238,3 +245,93 @@ Refuses to remove OSDs with data without `--force` and `--allow-data-loss`.
With `--dry-run` only checks if deletion is possible without data loss and
redundancy degradation.
## create-pool
`vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]`
Create a pool. Required parameters:
| <!-- --> | <!-- --> |
|--------------------------|---------------------------------------------------------------------------------------|
| `-s R` or `--pg_size R` | Number of replicas for replicated pools |
| `--ec N+K` | Number of data (N) and parity (K) chunks for erasure-coded pools |
| `-n N` or `--pg_count N` | PG count for the new pool (start with 10*<OSD count>/pg_size rounded to a power of 2) |
Optional parameters:
| <!-- --> | <!-- --> |
|--------------------------------|----------------------------------------------------------------------------|
| `--pg_minsize <number>` | R or N+K minus number of failures to tolerate without downtime ([details](../config/pool.en.md#pg_minsize)) |
| `--failure_domain host` | Failure domain: host, osd or a level from placement_levels. Default: host |
| `--root_node <node>` | Put pool only on child OSDs of this placement tree node |
| `--osd_tags <tag>[,<tag>]...` | Put pool only on OSDs tagged with all specified tags |
| `--block_size 128k` | Put pool only on OSDs with this data block size |
| `--bitmap_granularity 4k` | Put pool only on OSDs with this logical sector size |
| `--immediate_commit none` | Put pool only on OSDs with this or larger immediate_commit (none < small < all) |
| `--level_placement <rules>` | Use additional failure domain rules (example: "dc=112233") |
| `--raw_placement <rules>` | Specify raw PG generation rules ([details](../config/pool.en.md#raw_placement)) |
| `--primary_affinity_tags tags` | Prefer to put primary copies on OSDs with all specified tags |
| `--scrub_interval <time>` | Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y |
| `--used_for_fs <name>` | Mark pool as used for VitastorFS with metadata in image <name> |
| `--pg_stripe_size <number>` | Increase object grouping stripe |
| `--max_osd_combinations 10000` | Maximum number of random combinations for LP solver input |
| `--wait` | Wait for the new pool to come online |
| `-f` or `--force` | Do not check that cluster has enough OSDs to create the pool |
See also [Pool configuration](../config/pool.en.md) for detailed parameter descriptions.
Examples:
`vitastor-cli create-pool test_x4 -s 4 -n 32`
`vitastor-cli create-pool test_ec42 --ec 4+2 -n 32`
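A more involved, purely illustrative example: an EC 4+2 pool placed only on OSDs tagged `ssd`, with `--level_placement dc=112233` asking for the 6 chunks of each PG to be spread as 2 chunks per each of 3 datacenters (equal digits mean the same node at the `dc` placement level, which is assumed to be configured in placement_levels):
`vitastor-cli create-pool ecdc --ec 4+2 -n 256 --osd_tags ssd --level_placement dc=112233`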
## modify-pool
`vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]`
Modify an existing pool. Modifiable parameters:
```
[-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]
[--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--no_inode_stats 0|1]
[--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]
```
Non-modifiable parameters (changing them WILL lead to data loss):
```
[--block_size <size>] [--bitmap_granularity <size>]
[--immediate_commit <all|small|none>] [--pg_stripe_size <size>]
```
These, however, can still be modified with -f|--force.
See [create-pool](#create-pool) for parameter descriptions.
Examples:
`vitastor-cli modify-pool pool_A --name pool_B`
`vitastor-cli modify-pool 2 --pg_size 4 -n 128`
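Another example - enable periodic scrubbing for an existing pool (the pool name is hypothetical):
`vitastor-cli modify-pool pool_A --scrub_interval 30d`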
## rm-pool
`vitastor-cli rm-pool|pool-rm [--force] <id|name>`
Remove a pool. Refuses to remove pools with images without `--force`.
## ls-pools
`vitastor-cli ls-pools|pool-ls|ls-pool|pools [-l] [--detail] [--sort FIELD] [-r] [-n N] [--stats] [<glob> ...]`
List pools (only matching <glob> patterns if passed).
| <!-- --> | <!-- --> |
|----------------------|-------------------------------------------------------|
| `-l` or `--long` | Also report I/O statistics |
| `--detail` | Use list format (not table), show all details |
| `--sort FIELD` | Sort by specified field (see fields in --json output) |
| `-r` or `--reverse` | Sort in descending order |
| `-n` or `--count N` | Only list first N items |
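Example - list the first 5 pools matching a pattern, with I/O statistics, in descending order (assuming pools named `test*` exist):
`vitastor-cli ls-pools -l -r -n 5 'test*'`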

View File

@ -23,6 +23,10 @@ vitastor-cli - интерфейс командной строки для адм
- [merge-data](#merge-data)
- [alloc-osd](#alloc-osd)
- [rm-osd](#rm-osd)
- [create-pool](#create-pool)
- [modify-pool](#modify-pool)
- [ls-pools](#ls-pools)
- [rm-pool](#rm-pool)
Глобальные опции:
@ -85,8 +89,8 @@ kaveri 2/1 32 0 B 10 G 0 B 100% 0%
`vitastor-cli ls [-l] [-p POOL] [--sort FIELD] [-r] [-n N] [<glob> ...]`
Показать список образов, если переданы шаблоны `<glob>`, то только с именами,
соответствующими этим шаблонам (стандартные ФС-шаблоны с * и ?).
Показать список образов, если передан(ы) шаблон(ы) `<glob>`, то только с именами,
соответствующими одному из шаблонов (стандартные ФС-шаблоны с * и ?).
Опции:
@ -132,7 +136,7 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
## modify
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]`
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force] [--down-ok]`
Изменить размер, имя образа или флаг "только для чтения". Снимать флаг "только для чтения"
и уменьшать размер образов, у которых есть дочерние клоны, без `--force` нельзя.
@ -140,13 +144,12 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
Если новый размер меньше старого, "лишние" данные будут удалены, поэтому перед уменьшением
образа сначала уменьшите файловую систему в нём.
```
-f|--force Разрешить уменьшение или перевод в чтение-запись образа, у которого есть клоны.
```
* `-f|--force` - Разрешить уменьшение или перевод в чтение-запись образа, у которого есть клоны.
* `--down-ok` - Разрешить уменьшение, даже если часть данных останется неудалённой на недоступных OSD.
## rm
`vitastor-cli rm <from> [<to>] [--writers-stopped]`
`vitastor-cli rm <from> [<to>] [--writers-stopped] [--down-ok]`
Удалить образ `<from>` или все слои от `<from>` до `<to>` (`<to>` должен быть дочерним
образом `<from>`), одновременно меняя родительские образы их клонов (если таковые есть).
@ -158,6 +161,10 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
В других случаях родительские слои вливаются в дочерние.
Другие опции:
* `--down-ok` - Продолжать удаление/слияние, даже если часть данных останется неудалённой на недоступных OSD.
## flatten
`vitastor-cli flatten <layer>`
@ -187,12 +194,10 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
## describe
`vitastor-cli describe [--osds <osds>] [--object-state <состояния>] [--pool <пул>]
[--inode <номер>] [--min-inode <номер>] [--max-inode <номер>]
[--min-offset <смещение>] [--max-offset <смещение>]`
`vitastor-cli describe [ОПЦИИ]`
Описать состояние "грязных" объектов в кластере, то есть таких объектов, копии
или части которых хранятся на наборе OSD, не равном целевому.
или части которых хранятся на наборе OSD, не равном целевому. Опции:
```
--osds <osds>
@ -207,6 +212,8 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
- inconsistent - неконсистентный, с неоднозначным расхождением копий/частей
--pool <имя или ID пула>
Перечислять только объекты из заданного пула.
--pg <номер PG>
Перечислять только объекты из заданной PG пула.
--inode, --min-inode, --max-inode
Перечислять только объекты из указанных номеров инодов (образов).
--min-offset, --max-offset
@ -255,3 +262,93 @@ vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>
С опцией `--dry-run` только проверяет, возможно ли удаление без потери данных и деградации
избыточности.
## create-pool
`vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]`
Создать пул. Обязательные параметры:
| <!-- --> | <!-- --> |
|---------------------------|---------------------------------------------------------------------------------------------|
| `-s R` или `--pg_size R` | Число копий данных для реплицированных пулов |
| `--ec N+K` | Число частей данных (N) и чётности (K) для пулов с кодами коррекции ошибок |
| `-n N` или `--pg_count N` | Число PG для нового пула (начните с 10*<число OSD>/pg_size, округлённого до степени двойки) |
Необязательные параметры:
| <!-- --> | <!-- --> |
|--------------------------------|----------------------------------------------------------------------------|
| `--pg_minsize <number>` | (R или N+K) минус число разрешённых отказов без остановки пула ([подробнее](../config/pool.ru.md#pg_minsize)) |
| `--failure_domain host` | Домен отказа: host, osd или другой из placement_levels. По умолчанию: host |
| `--root_node <node>` | Использовать для пула только дочерние OSD этого узла дерева размещения |
| `--osd_tags <tag>[,<tag>]...` | ...только OSD со всеми заданными тегами |
| `--block_size 128k` | ...только OSD с данным размером блока |
| `--bitmap_granularity 4k` | ...только OSD с данным размером логического сектора |
| `--immediate_commit none` | ...только OSD с этим или большим immediate_commit (none < small < all) |
| `--level_placement <rules>` | Задать правила дополнительных доменов отказа (пример: "dc=112233") |
| `--raw_placement <rules>` | Задать низкоуровневые правила генерации PG ([детали](../config/pool.ru.md#raw_placement)) |
| `--primary_affinity_tags tags` | Предпочитать OSD со всеми данными тегами для роли первичных |
| `--scrub_interval <time>` | Включить скрабы с заданным интервалом времени (число + единица s/m/h/d/M/y) |
| `--pg_stripe_size <number>` | Увеличить блок группировки объектов по PG |
| `--max_osd_combinations 10000` | Максимальное число случайных комбинаций OSD для ЛП-солвера |
| `--wait` | Подождать, пока новый пул будет активирован |
| `-f` или `--force` | Не проверять, что в кластере достаточно доменов отказа для создания пула |
Подробно о параметрах см. [Конфигурация пулов](../config/pool.ru.md).
Примеры:
`vitastor-cli create-pool test_x4 -s 4 -n 32`
`vitastor-cli create-pool test_ec42 --ec 4+2 -n 32`
## modify-pool
`vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]`
Изменить настройки существующего пула. Изменяемые параметры:
```
[-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]
[--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>]
[--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]
```
Неизменяемые параметры (их изменение ПРИВЕДЁТ к потере данных):
```
[--block_size <size>] [--bitmap_granularity <size>]
[--immediate_commit <all|small|none>] [--pg_stripe_size <size>]
```
Эти параметры можно изменить, только если явно передать опцию -f или --force.
Описания параметров смотрите в [create-pool](#create-pool).
Примеры:
`vitastor-cli modify-pool pool_A --name pool_B`
`vitastor-cli modify-pool 2 --pg_size 4 -n 128`
## rm-pool
`vitastor-cli rm-pool|pool-rm [--force] <id|name>`
Удалить пул. Отказывается удалять пул, в котором ещё есть образы, без `--force`.
## ls-pools
`vitastor-cli ls-pools|pool-ls|ls-pool|pools [-l] [--detail] [--sort FIELD] [-r] [-n N] [--stats] [<glob> ...]`
Показать список пулов. Если передан(ы) шаблон(ы) `<glob>`, то только с именами,
соответствующими одному из шаблонов (стандартные ФС-шаблоны с * и ?).
| <!-- --> | <!-- --> |
|-----------------------|------------------------------------------------------------|
| `-l` или `--long` | Вывести также статистику ввода-вывода |
| `--detail` | Максимально подробный вывод в виде списка (а не таблицы) |
| `--sort FIELD` | Сортировать по заданному полю (поля см. в выводе с --json) |
| `-r` или `--reverse` | Сортировать в обратном порядке |
| `-n` или `--count N` | Выводить только первые N записей |

View File

@ -88,7 +88,7 @@ Options (both modes):
--block_size 1M/128k Set blockstore object size
--bitmap_granularity 4k Set bitmap granularity
--data_csum_type none Set data checksum type (crc32c or none)
--csum_block_size 4k Set data checksum block size
--csum_block_size 4k/32k Set data checksum block size (SSD/HDD default)
--data_device_block 4k Override data device block size
--meta_device_block 4k Override metadata device block size
--journal_device_block 4k Override journal device block size

View File

@ -89,7 +89,7 @@ vitastor-disk - инструмент командной строки для уп
--block_size 1M/128k Задать размер объекта хранилища
--bitmap_granularity 4k Задать гранулярность битовых карт
--data_csum_type none Задать тип контрольных сумм (crc32c или none)
--csum_block_size 4k Задать размер блока расчёта контрольных сумм
--csum_block_size 4k/32k Задать размер блока расчёта контрольных сумм (дефолт SSD/HDD)
--data_device_block 4k Задать размер блока устройства данных
--meta_device_block 4k Задать размер блока метаданных
--journal_device_block 4k Задать размер блока журнала

View File

@ -15,12 +15,21 @@ See also [VDUSE](qemu.en.md#vduse) as a better alternative to NBD.
Vitastor Kubernetes CSI driver uses NBD when VDUSE is unavailable.
## Map image
Supports the following commands:
- [map](#map)
- [unmap](#unmap)
- [ls](#ls)
- [netlink-map](#netlink-map)
- [netlink-unmap](#netlink-unmap)
- [netlink-revive](#netlink-revive)
## map
To create a local block device for a Vitastor image run:
```
vitastor-nbd map --image testimg
vitastor-nbd map [/dev/nbdN] --image testimg
```
It will output a block device name like /dev/nbd0 which you can then use as a normal disk.
@ -29,25 +38,25 @@ You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--ima
vitastor-nbd supports all usual Vitastor configuration options like `--config_file <path_to_config>` plus NBD-specific:
* `--nbd_timeout 300` \
Timeout for I/O operations in seconds after exceeding which the kernel stops
the device. You can set it to 0 to disable the timeout, but beware that you
won't be able to stop the device at all if vitastor-nbd process dies.
* `--nbd_timeout 0` \
Timeout for I/O operations in seconds after exceeding which the kernel stops the device.
Before Linux 5.19, if nbd_timeout is 0, a dead NBD device can't be removed from
the system at all without rebooting.
* `--nbd_max_devices 64 --nbd_max_part 3` \
Options for the `nbd` kernel module when modprobing it (`nbds_max` and `max_part`).
Note that the maximum allowed (nbds_max)*(1+max_part) is 256.
* `--logfile /path/to/log/file.txt` \
Write log messages to the specified file instead of dropping them (in background mode)
or printing them to the standard output (in foreground mode).
* `--dev_num N` \
Use the specified device /dev/nbdN instead of automatic selection.
Use the specified device /dev/nbdN instead of automatic selection (an alternative syntax
to the /dev/nbdN positional parameter).
* `--foreground 1` \
Stay in foreground, do not daemonize.
Note that `nbd_timeout`, `nbd_max_devices` and `nbd_max_part` options may also be specified
in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_file`.
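For example, a mapping with an explicitly chosen device, a 300-second timeout and logging to a file might look like this (the image name and log path are illustrative):

```
vitastor-nbd map /dev/nbd1 --image testimg --nbd_timeout 300 --logfile /var/log/vitastor-nbd.log
```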
## Unmap image
## unmap
To unmap the device run:
@ -55,12 +64,14 @@ To unmap the device run:
vitastor-nbd unmap /dev/nbd0
```
## List mapped images
## ls
```
vitastor-nbd ls [--json]
```
List mapped images.
Example output (normal format):
```
@ -78,3 +89,45 @@ Example output (JSON format):
```
{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
```
## netlink-map
```
vitastor-nbd netlink-map [/dev/nbdN] (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)
```
On recent kernel versions it's also possible to map NBD devices using the netlink interface.
This is an experimental feature because it doesn't solve all of NBD's issues. Differences from the regular ioctl-based 'map':
1. netlink-map can create new `/dev/nbdN` devices (those not present in /dev/).
2. netlink-mapped devices can be unmapped only using `netlink-unmap` command.
3. netlink-mapped devices don't show up in `ls` output (yet).
4. Dead netlink-mapped devices can be 'revived' using `netlink-revive`.
However, old I/O requests will hang forever if `nbd_timeout` is not specified.
5. netlink-map supports additional options:
* `--nbd_conn_timeout 0` \
Disconnect a dead device automatically after this number of seconds.
* `--nbd_destroy_on_disconnect 1` \
Delete the nbd device on disconnect.
* `--nbd_disconnect_on_close 1` \
Disconnect the nbd device on close by last opener.
* `--nbd_ro 1` \
Set device into read only mode.
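For example (the image name is illustrative), to netlink-map a device that disconnects automatically 10 seconds after dying and is removed from the system on disconnect:

```
vitastor-nbd netlink-map /dev/nbd5 --image testimg --nbd_conn_timeout 10 --nbd_destroy_on_disconnect 1
```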
## netlink-unmap
```
vitastor-nbd netlink-unmap /dev/nbdN
```
Unmap a device using the netlink interface. Works with both netlink- and ioctl-mapped devices.
## netlink-revive
```
vitastor-nbd netlink-revive /dev/nbdX (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)
```
Restart a dead NBD netlink-mapped device without removing it. Supports the same options as `netlink-map`.

View File

@ -18,12 +18,21 @@ NBD немного снижает производительность из-за
CSI-драйвер Kubernetes Vitastor использует NBD, когда VDUSE недоступен.
## Подключить устройство
Поддерживаются следующие команды:
- [map](#map)
- [unmap](#unmap)
- [ls](#ls)
- [netlink-map](#netlink-map)
- [netlink-unmap](#netlink-unmap)
- [netlink-revive](#netlink-revive)
## map
Чтобы создать локальное блочное устройство для образа, выполните команду:
```
vitastor-nbd map --image testimg
vitastor-nbd map [/dev/nbdN] --image testimg
```
Команда напечатает название блочного устройства вида /dev/nbd0, которое потом можно
@ -35,16 +44,13 @@ vitastor-nbd map --image testimg
vitastor-nbd поддерживает все обычные опции Vitastor, например, `--config_file <path_to_config>`,
плюс специфичные для NBD:
* `--nbd_timeout 30` \
* `--nbd_timeout 0` \
Максимальное время выполнения любой операции чтения/записи в секундах, при
превышении которого ядро остановит NBD-устройство. Вы можете установить опцию
в 0, чтобы отключить ограничение времени, но имейте в виду, что в этом случае
вы вообще не сможете отключить NBD-устройство при нештатном завершении процесса
vitastor-nbd.
превышении которого ядро остановит NBD-устройство. На ядрах Linux старее 5.19,
если таймаут установлен в 0, NBD-устройство вообще невозможно отключить из системы
при нештатном завершении процесса.
* `--nbd_max_devices 64 --nbd_max_part 3` \
Опции, передаваемые модулю ядра nbd, если его загружает vitastor-nbd
(`nbds_max` и `max_part`). Имейте в виду, что (nbds_max)*(1+max_part)
обычно не должно превышать 256.
Опции, передаваемые модулю ядра nbd, если его загружает vitastor-nbd (`nbds_max` и `max_part`).
* `--logfile /path/to/log/file.txt` \
Писать сообщения о процессе работы в заданный файл, вместо пропуска их
при фоновом режиме запуска или печати на стандартный вывод при запуске
@ -58,7 +64,7 @@ vitastor-nbd поддерживает все обычные опции Vitastor,
также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
заданном опцией `--config_file`.
## Отключить устройство
## unmap
Для отключения устройства выполните:
@ -66,12 +72,14 @@ vitastor-nbd поддерживает все обычные опции Vitastor,
vitastor-nbd unmap /dev/nbd0
```
## Вывести подключённые устройства
## ls
```
vitastor-nbd ls [--json]
```
Вывести подключённые устройства.
Пример вывода в обычном формате:
```
@ -89,3 +97,46 @@ pid: 584546
```
{"/dev/nbd0": {"image": "bench", "pid": 584536}, "/dev/nbd1": {"image": "bench1", "pid": 584546}}
```
## netlink-map
```
vitastor-nbd netlink-map [/dev/nbdN] (--image <image> | --pool <POOL> --inode <INODE> --size <SIZE>)
```
На свежих версиях ядра Linux также возможно подключать NBD-устройства через интерфейс netlink.
Это экспериментальная функция, так как она не решает всех проблем NBD. Отличия от обычного 'map':
1. Можно создавать новые `/dev/nbdN` устройства (отсутствующие в /dev/).
2. Отключать netlink-устройства можно только командой `netlink-unmap`.
3. netlink-устройства не видно в выводе `ls` (пока что).
4. Мёртвые netlink-устройства можно "оживить" командой `netlink-revive`. Правда, предыдущие
запросы ввода-вывода всё равно зависнут навсегда, если `nbd_timeout` не задан.
5. Поддерживаются дополнительные опции:
* `--nbd_conn_timeout 0` \
Отключать мёртвое устройство автоматически через данное число секунд.
* `--nbd_destroy_on_disconnect 1` \
Удалять NBD-устройство при отключении.
* `--nbd_disconnect_on_close 1` \
Отключать NBD-устройство автоматически, когда его все закроют.
* `--nbd_ro 1` \
Установить для NBD-устройства режим "только для чтения".
## netlink-unmap
```
vitastor-nbd netlink-unmap /dev/nbdN
```
Отключить устройство через интерфейс netlink. Работает и с обычными, и с netlink-устройствами.
## netlink-revive
```
vitastor-nbd netlink-revive /dev/nbdX (--image <image> | --pool <pool> --inode <inode> --size <size in bytes>)
```
Оживить мёртвое NBD-устройство, ранее подключённое через netlink, без удаления. Поддерживает
те же опции, что и `netlink-map`.

View File

@ -1,45 +1,153 @@
[Documentation](../../README.md#documentation) → Usage → NFS
[Documentation](../../README.md#documentation) → Usage → VitastorFS and pseudo-FS
-----
[Читать на русском](nfs.ru.md)
# NFS
# VitastorFS and pseudo-FS
Vitastor has a simplified NFS 3.0 proxy for file-based image access emulation. It's not
suitable as a full-featured file system, at least because all file/image metadata is stored
in etcd and kept in memory all the time - thus you can't put a lot of files in it.
Vitastor has two file system implementations. Both can be used via `vitastor-nfs`.
However, NFS proxy is totally fine as a method to provide VM image access and allows to
plug Vitastor into, for example, VMWare. It's important to note that for VMWare it's a much
better access method than iSCSI, because with iSCSI we'd have to put all VM images into one
Vitastor image exported as a LUN to VMWare and formatted with VMFS. VMWare doesn't use VMFS
over NFS.
Commands:
- [mount](#mount)
- [start](#start)
NFS proxy is stateless if you use immediate_commit=all mode (for SSD with capacitors or
HDDs with disabled cache), so you can run multiple NFS proxies and use a network load
balancer or any failover method you want to in that case.
## Pseudo-FS
vitastor-nfs usage:
The simplified pseudo-FS proxy is used for file-based image access emulation. It's not
suitable as a full-featured file system: it lacks many FS features and stores
all file/image metadata in memory and in etcd, so it's fine for hundreds or thousands
of large files/images, but not for millions.
The pseudo-FS proxy is intended for environments where other block volume access methods
can't be used or impose additional restrictions - for example, VMWare. NFS is better
for VMWare than, for example, iSCSI, because with iSCSI VMWare puts all VM images
into one large shared block image formatted with its own VMFS file system, while with NFS
it doesn't use VMFS and stores each VM disk as a regular file, which corresponds to exactly one
Vitastor block image, just as originally intended.
To use Vitastor pseudo-FS locally, run `vitastor-nfs mount --block /mnt/vita`.
Also you can start the network server:
```
vitastor-nfs [STANDARD OPTIONS] [OTHER OPTIONS]
--subdir <DIR> export images prefixed <DIR>/ (default empty - export all images)
--portmap 0 do not listen on port 111 (portmap/rpcbind, requires root)
--bind <IP> bind service to <IP> address (default 0.0.0.0)
--nfspath <PATH> set NFS export path to <PATH> (default is /)
--port <PORT> use port <PORT> for NFS services (default is 2049)
--pool <POOL> use <POOL> as default pool for new files (images)
--foreground 1 stay in foreground, do not daemonize
vitastor-nfs start --block --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
```
Example start and mount commands (etcd_address is optional):
To mount the FS exported by this server, run:
```
vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
mount server:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
```
```
mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
```
## VitastorFS
VitastorFS is a full-featured clustered (Read-Write-Many) file system. It supports most POSIX
features like hierarchical organization, symbolic links, hard links, quick renames and so on.
VitastorFS metadata is stored in a Parallel Optimistic B-Tree key-value database,
implemented over a regular Vitastor block volume. Directory entries and inodes
are stored in a simple human-readable JSON format in the B-Tree. The `vitastor-kv` tool
can be used to inspect the database.
To use VitastorFS:
1. Create a pool or choose an existing empty pool for FS data
2. Create an image for FS metadata, preferably in a faster (SSD or replica-HDD) pool,
but you can create it in the data pool too if you want (image size doesn't matter):
`vitastor-cli create -s 10G -p fastpool testfs`
3. Mark data pool as an FS pool: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
4. Either mount the FS: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
5. Or start the NFS server: `vitastor-nfs start --fs testfs --pool data-pool`
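Put together, the steps above might look like the following sequence (pool and image names follow the steps above, `fastpool` is assumed to already exist, and the EC scheme and PG count are arbitrary illustration values):

```
vitastor-cli create-pool data-pool --ec 2+2 -n 128
vitastor-cli create -s 10G -p fastpool testfs
vitastor-cli modify-pool --used-for-fs testfs data-pool
vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita
```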
### Supported POSIX features
- Read-after-write semantics (read returns new data immediately after write)
- Linear and random read and write
- Writing outside current file size
- Hierarchical structure, immediate rename of files and directories
- File size change support (truncate)
- Permissions (chmod/chown)
- Flushing data to stable storage (if required) (fsync)
- Symbolic links
- Hard links
- Special files (devices, sockets, named pipes)
- File modification and attribute change time tracking (mtime and ctime)
- Modification time (mtime) and last access time (atime) change support (utimes)
- Correct handling of directory listing during file creation/deletion
### Limitations
POSIX features currently not implemented in VitastorFS:
- File locking is not supported
- Actually used space is not counted, so `du` always reports apparent file sizes
instead of actually allocated space
- Access times (`atime`) are not tracked (like `-o noatime`)
- Modification time (`mtime`) is updated lazily every second (like `-o lazytime`)
Other notable missing features which should be addressed in the future:
- Defragmentation of "shared" inodes. Files smaller than pool object size (block_size
multiplied by data part count if pool is EC) are internally stored in large block
volumes sequentially, one after another, and leave garbage after deleting or resizing.
Defragmentator will be implemented to collect this garbage.
- Inode ID reuse. Currently inode IDs always grow, the limit is 2^48 inodes, so
in theory you may hit it if you create and delete a very large number of files
- Compaction of the key-value B-Tree. Current implementation never merges or deletes
B-Tree blocks, so B-Tree may become bloated over time. Currently you can
use `vitastor-kv dumpjson` & `loadjson` commands to recreate the index in such
situations.
- Filesystem check tool. VitastorFS doesn't have journal because it would impose a
severe performance hit, optimistic CAS-based transactions are used instead of it.
So, again, in theory an abnormal shutdown of the FS server may leave some garbage
in the DB. The FS is implemented is such way that this garbage doesn't affect its
function, but having a tool to clean it up still seems a right thing to do.
## Horizontal scaling
Linux NFS 3.0 client doesn't support built-in scaling or failover, i.e. you can't
specify multiple server addresses when mounting the FS.
However, you can use any regular TCP load balancing over multiple NFS servers.
It's absolutely safe with `immediate_commit=all` and `client_enable_writeback=false`
settings, because Vitastor NFS proxy doesn't keep uncommitted data in memory
with these settings. But it may even work without `immediate_commit=all` because
the Linux NFS client repeats all uncommitted writes if it loses the connection.
## Commands
### mount
`vitastor-nfs (--fs <NAME> | --block) [-o <OPT>] mount <MOUNTPOINT>`
Start a local file system server and mount the file system at <MOUNTPOINT>.
Use regular `umount <MOUNTPOINT>` to unmount the FS.
The server will be automatically stopped when the FS is unmounted.
- `-o|--options <OPT>` - Pass additional NFS mount options (ex.: -o async).
### start
`vitastor-nfs (--fs <NAME> | --block) start`
Start network NFS server. Options:
| <!-- --> | <!-- --> |
|-----------------|------------------------------------------------------------|
| `--bind <IP>` | bind service to \<IP> address (default 0.0.0.0) |
| `--port <PORT>` | use port \<PORT> for NFS services (default is 2049) |
| `--portmap 0` | do not listen on port 111 (portmap/rpcbind, requires root) |
## Common options
| <!-- --> | <!-- --> |
|--------------------|----------------------------------------------------------|
| `--fs <NAME>` | use VitastorFS with metadata in image \<NAME> |
| `--block` | use pseudo-FS presenting images as files |
| `--pool <POOL>` | use \<POOL> as default pool for new files |
| `--subdir <DIR>` | export \<DIR> instead of root directory (pseudo-FS only) |
| `--nfspath <PATH>` | set NFS export path to \<PATH> (default is /) |
| `--pidfile <FILE>` | write process ID to the specified file |
| `--logfile <FILE>` | log to the specified file |
| `--foreground 1` | stay in foreground, do not daemonize |
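For example, a VitastorFS server bound to a specific address and port, writing logs and a PID file (the address, paths and names are illustrative), might be started like this:

```
vitastor-nfs start --fs testfs --pool data-pool --bind 192.168.5.10 --port 2050 --logfile /var/log/vitastor-nfs.log --pidfile /run/vitastor-nfs.pid
```

It can then be mounted with the same `mount` command shown above, using `port=2050,mountport=2050`.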

View File

@ -1,44 +1,159 @@
[Документация](../../README-ru.md#документация) → Использование → NFS
[Документация](../../README-ru.md#документация) → Использование → VitastorFS и псевдо-ФС
-----
[Read in English](nfs.en.md)
# NFS
# VitastorFS и псевдо-ФС
В Vitastor реализована упрощённая NFS 3.0 прокси для эмуляции файлового доступа к образам.
Это не полноценная файловая система, т.к. метаданные всех файлов (образов) сохраняются
в etcd и всё время хранятся в оперативной памяти - то есть, положить туда много файлов
не получится.
В Vitastor есть две реализации файловой системы. Обе используются через `vitastor-nfs`.
Однако в качестве способа доступа к образам виртуальных машин NFS прокси прекрасно подходит
и позволяет подключить Vitastor, например, к VMWare.
Команды:
- [mount](#mount)
- [start](#start)
При этом, если вы используете режим immediate_commit=all (для SSD с конденсаторами или HDD
с отключённым кэшем), то NFS-сервер не имеет состояния и вы можете свободно поднять
его в нескольких экземплярах и использовать поверх них сетевой балансировщик нагрузки или
схему с отказоустойчивостью.
## Псевдо-ФС
Использование vitastor-nfs:
Упрощённая реализация псевдо-ФС используется для эмуляции файлового доступа к блочным
образам Vitastor. Это не полноценная файловая система - в ней отсутствуют многие функции
POSIX ФС, а метаданные всех файлов (образов) сохраняются в etcd и всё время хранятся в
оперативной памяти - то есть, псевдо-ФС подходит для сотен или тысяч файлов, но не миллионов.
Псевдо-ФС предназначена для доступа к образам виртуальных машин в средах, где другие
способы невозможны или неудобны - например, в VMWare. Для VMWare это лучшая опция, чем
iSCSI, так как при использовании iSCSI VMWare размещает все виртуальные машины в одном
большом блочном образе внутри собственной ФС VMFS, а с NFS VMFS не используется и каждый
диск ВМ представляется в виде одного файла, то есть, соответствует одному блочному образу
Vitastor, как это и задумано изначально.
Чтобы подключить псевдо-ФС Vitastor, выполните команду `vitastor-nfs mount --block /mnt/vita`.
Либо же запустите сетевой вариант сервера:
```
vitastor-nfs [СТАНДАРТНЫЕ ОПЦИИ] [ДРУГИЕ ОПЦИИ]
--subdir <DIR> экспортировать "поддиректорию" - образы с префиксом имени <DIR>/ (по умолчанию пусто - экспортировать все образы)
--portmap 0 отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий)
--bind <IP> принимать соединения по адресу <IP> (по умолчанию 0.0.0.0 - на всех)
--nfspath <PATH> установить путь NFS-экспорта в <PATH> (по умолчанию /)
--port <PORT> использовать порт <PORT> для NFS-сервисов (по умолчанию 2049)
--pool <POOL> использовать пул <POOL> для новых образов (обязательно, если пул в кластере не один)
--foreground 1 не уходить в фон после запуска
vitastor-nfs start --block --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
```
Пример монтирования Vitastor через NFS (etcd_address необязателен):
Примонтировать ФС, запущенную с такими опциями, можно следующей командой:
```
vitastor-nfs --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool
mount server:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
```
```
mount localhost:/ /mnt/ -o port=2050,mountport=2050,nfsvers=3,soft,nolock,tcp
```
## VitastorFS
VitastorFS - полноценная кластерная (Read-Write-Many) файловая система. Она поддерживает
большую часть функций POSIX - иерархическую организацию, символические ссылки, жёсткие
ссылки, быстрые переименования и так далее.
Метаданные VitastorFS хранятся в собственной реализации БД формата ключ-значения,
основанной на Параллельном Оптимистичном Б-дереве поверх обычного блочного образа Vitastor.
И записи каталогов, и иноды, как обычно в Vitastor, хранятся в простом человекочитаемом
JSON-формате :-). Для инспекции содержимого БД можно использовать инструмент `vitastor-kv`.
Чтобы использовать VitastorFS:
1. Создайте пул для данных ФС или выберите существующий пустой пул
2. Создайте блочный образ для метаданных ФС, желательно, в более быстром пуле (на SSD
или по крайней мере на HDD, но без EC), но можно и в том же пуле, что данные
(размер образа значения не имеет):
`vitastor-cli create -s 10G -p fastpool testfs`
3. Пометьте пул данных как ФС-пул: `vitastor-cli modify-pool --used-for-fs testfs data-pool`
4. Либо примонтируйте ФС: `vitastor-nfs mount --fs testfs --pool data-pool /mnt/vita`
5. Либо запустите сетевой NFS-сервер: `vitastor-nfs start --fs testfs --pool data-pool`
### Поддерживаемые функции POSIX
- Чтение актуальной версии данных сразу после записи
- Последовательное и произвольное чтение и запись
- Запись за пределами текущего размера файла
- Иерархическая организация, мгновенное переименование файлов и каталогов
- Изменение размера файла (truncate)
- Права на файлы (chmod/chown)
- Фиксация данных на диски (когда необходимо) (fsync)
- Символические ссылки
- Жёсткие ссылки
- Специальные файлы (устройства, сокеты, каналы)
- Отслеживание времён модификации (mtime), изменения атрибутов (ctime)
- Ручное изменение времён модификации (mtime), последнего доступа (atime)
- Корректная обработка изменений списка файлов во время листинга
### Ограничения
Отсутствующие на данный момент в VitastorFS функции POSIX:
- Блокировки файлов не поддерживаются
- Фактически занятое файлами место не подсчитывается и не возвращается вызовами
stat(2), так что `du` всегда показывает сумму размеров файлов, а не фактически занятое место
- Времена доступа (`atime`) не отслеживаются (как будто ФС смонтирована с `-o noatime`)
- Времена модификации (`mtime`) отслеживаются асинхронно (как будто ФС смонтирована с `-o lazytime`)
Другие недостающие функции, которые нужно добавить в будущем:
- Дефрагментация "общих инодов". На уровне реализации ФС файлы, меньшие, чем размер
объекта пула (block_size умножить на число частей данных, если пул EC),
упаковываются друг за другом в большие "общие" иноды/тома. Если такие файлы удалять
или увеличивать, они перемещаются и оставляют за собой "мусор", вот тут-то и нужен
дефрагментатор.
- Переиспользование номеров инодов. В текущей реализации номера инодов всё время
увеличиваются, так что в теории вы можете упереться в лимит, если насоздаёте
и наудаляете больше, чем 2^48 файлов.
- Очистка места в Б-дереве метаданных. Текущая реализация никогда не сливает и не
удаляет блоки Б-дерева, так что в теории дерево может разрастись и стать неоптимальным.
Если вы столкнётесь с такой ситуацией сейчас, вы можете решить её с помощью
команд `vitastor-kv dumpjson` и `loadjson` (т.е. пересоздав и загрузив обратно все метаданные ФС).
- Инструмент проверки метаданных файловой системы. У VitastorFS нет журнала, так как
журнал бы сильно замедлил реализацию, вместо него используются оптимистичные
транзакции на основе CAS (сравнить-и-записать), и теоретически при нештатном
завершении сервера ФС в БД также могут оставаться неконсистентные "мусорные"
записи. ФС устроена так, что на работу они не влияют, но для порядка и их стоит
уметь подчищать.
## Горизонтальное масштабирование
Клиент Linux NFS 3.0 не поддерживает встроенное масштабирование или отказоустойчивость.
То есть, вы не можете задать несколько адресов серверов при монтировании ФС.
Однако вы можете использовать любые стандартные сетевые балансировщики нагрузки
или схемы с отказоустойчивостью. Это точно безопасно при настройках `immediate_commit=all` и
`client_enable_writeback=false`, так как с ними NFS-сервер Vitastor вообще не хранит
в памяти ещё не зафиксированные на дисках данные; и вполне вероятно безопасно
даже без `immediate_commit=all`, потому что NFS-клиент ядра Linux повторяет все
незафиксированные запросы при потере соединения.
## Команды
### mount
`vitastor-nfs (--fs <NAME> | --block) mount [-o <OPT>] <MOUNTPOINT>`
Запустить локальный сервер и примонтировать ФС в директорию <MOUNTPOINT>.
Чтобы отмонтировать ФС, используйте обычную команду `umount <MOUNTPOINT>`.
Сервер автоматически останавливается при отмонтировании ФС.
- `-o|--options <OPT>` - Передать дополнительные опции монтирования NFS (пример: -o async).
### start
`vitastor-nfs (--fs <NAME> | --block) start`
Запустить сетевой NFS-сервер. Опции:
| <!-- --> | <!-- --> |
|-----------------|-----------------------------------------------------------------------|
| `--bind <IP>` | принимать соединения по адресу \<IP> (по умолчанию 0.0.0.0 - на всех) |
| `--port <PORT>` | использовать порт \<PORT> для NFS-сервисов (по умолчанию 2049) |
| `--portmap 0` | отключить сервис portmap/rpcbind на порту 111 (по умолчанию включён и требует root привилегий) |
## Общие опции
| <!-- --> | <!-- --> |
|--------------------|---------------------------------------------------------|
| `--fs <NAME>` | использовать VitastorFS с метаданными в образе \<NAME> |
| `--block` | использовать псевдо-ФС для доступа к блочным образам |
| `--pool <POOL>` | использовать пул \<POOL> для новых файлов (обязательно, если пул в кластере не один) |
| `--subdir <DIR>` | экспортировать подкаталог \<DIR>, а не корень (только для псевдо-ФС) |
| `--nfspath <PATH>` | установить путь NFS-экспорта в \<PATH> (по умолчанию /) |
| `--pidfile <FILE>` | записать ID процесса в заданный файл |
| `--logfile <FILE>` | записывать логи в заданный файл |
| `--foreground 1` | не уходить в фон после запуска |

13
docs/usage/pg_states.dot Normal file
View File

@ -0,0 +1,13 @@
digraph G {
rankdir=LR;
bgcolor=transparent;
edge [color="#00A000"];
node [shape=hexagon, fillcolor="#A0A000", fontcolor=white, fontname="sans-serif", fontsize=12, style=filled, penwidth=0];
offline -> starting -> peering -> offline;
stopping -> offline;
starting -> incomplete -> offline;
active -> repeering -> peering -> active -> stopping;
offline [fillcolor="#A00000"];
incomplete [fillcolor="#A00000"];
active [fillcolor="#00A000"];
}

114
docs/usage/pg_states.svg Normal file
View File

@ -0,0 +1,114 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 2.43.0 (0)
-->
<!-- Title: G Pages: 1 -->
<svg width="603pt" height="123pt"
viewBox="0.00 0.00 602.66 122.55" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 118.55)">
<title>G</title>
<!-- offline -->
<g id="node1" class="node">
<title>offline</title>
<polygon fill="#a00000" stroke="black" stroke-width="0" points="75.52,-56 56.6,-74 18.75,-74 -0.17,-56 18.75,-38 56.6,-38 75.52,-56"/>
<text text-anchor="middle" x="37.67" y="-52.9" font-family="sans-serif" font-size="12.00" fill="white">offline</text>
</g>
<!-- starting -->
<g id="node2" class="node">
<title>starting</title>
<polygon fill="#a0a000" stroke="black" stroke-width="0" points="199.56,-79 177.49,-97 133.35,-97 111.28,-79 133.35,-61 177.49,-61 199.56,-79"/>
<text text-anchor="middle" x="155.42" y="-75.9" font-family="sans-serif" font-size="12.00" fill="white">starting</text>
</g>
<!-- offline&#45;&gt;starting -->
<g id="edge1" class="edge">
<title>offline&#45;&gt;starting</title>
<path fill="none" stroke="#00a000" d="M69.39,-62.1C81.66,-64.54 96.04,-67.4 109.45,-70.06"/>
<polygon fill="#00a000" stroke="#00a000" points="108.98,-73.54 119.47,-72.05 110.34,-66.67 108.98,-73.54"/>
</g>
<!-- peering -->
<g id="node3" class="node">
<title>peering</title>
<polygon fill="#a0a000" stroke="black" stroke-width="0" points="335.57,-95 313.96,-113 270.74,-113 249.13,-95 270.74,-77 313.96,-77 335.57,-95"/>
<text text-anchor="middle" x="292.35" y="-91.9" font-family="sans-serif" font-size="12.00" fill="white">peering</text>
</g>
<!-- starting&#45;&gt;peering -->
<g id="edge2" class="edge">
<title>starting&#45;&gt;peering</title>
<path fill="none" stroke="#00a000" d="M194.36,-83.5C209.71,-85.32 227.6,-87.44 243.8,-89.36"/>
<polygon fill="#00a000" stroke="#00a000" points="243.82,-92.89 254.16,-90.59 244.64,-85.94 243.82,-92.89"/>
</g>
<!-- incomplete -->
<g id="node5" class="node">
<title>incomplete</title>
<polygon fill="#a00000" stroke="black" stroke-width="0" points="349.09,-41 320.72,-59 263.99,-59 235.62,-41 263.99,-23 320.72,-23 349.09,-41"/>
<text text-anchor="middle" x="292.35" y="-37.9" font-family="sans-serif" font-size="12.00" fill="white">incomplete</text>
</g>
<!-- starting&#45;&gt;incomplete -->
<g id="edge5" class="edge">
<title>starting&#45;&gt;incomplete</title>
<path fill="none" stroke="#00a000" d="M188.74,-69.9C204.92,-65.34 224.85,-59.73 242.82,-54.67"/>
<polygon fill="#00a000" stroke="#00a000" points="243.9,-58 252.57,-51.92 242,-51.26 243.9,-58"/>
</g>
<!-- peering&#45;&gt;offline -->
<g id="edge3" class="edge">
<title>peering&#45;&gt;offline</title>
<path fill="none" stroke="#00a000" d="M259.32,-103.69C222.67,-112.11 161.28,-121.52 111.35,-106 94.55,-100.78 78.2,-90.18 65.27,-80.08"/>
<polygon fill="#00a000" stroke="#00a000" points="67.26,-77.19 57.3,-73.58 62.84,-82.61 67.26,-77.19"/>
</g>
<!-- active -->
<g id="node6" class="node">
<title>active</title>
<polygon fill="#00a000" stroke="black" stroke-width="0" points="456.34,-49 438.55,-67 402.97,-67 385.18,-49 402.97,-31 438.55,-31 456.34,-49"/>
<text text-anchor="middle" x="420.76" y="-45.9" font-family="sans-serif" font-size="12.00" fill="white">active</text>
</g>
<!-- peering&#45;&gt;active -->
<g id="edge9" class="edge">
<title>peering&#45;&gt;active</title>
<path fill="none" stroke="#00a000" d="M322.99,-84.22C341.47,-77.49 365.34,-68.8 384.75,-61.74"/>
<polygon fill="#00a000" stroke="#00a000" points="385.96,-65.03 394.16,-58.32 383.56,-58.45 385.96,-65.03"/>
</g>
<!-- stopping -->
<g id="node4" class="node">
<title>stopping</title>
<polygon fill="#a0a000" stroke="black" stroke-width="0" points="591.65,-18 567.57,-36 519.39,-36 495.31,-18 519.39,0 567.57,0 591.65,-18"/>
<text text-anchor="middle" x="543.48" y="-14.9" font-family="sans-serif" font-size="12.00" fill="white">stopping</text>
</g>
<!-- stopping&#45;&gt;offline -->
<g id="edge4" class="edge">
<title>stopping&#45;&gt;offline</title>
<path fill="none" stroke="#00a000" d="M500.13,-14.3C440.78,-9.83 329.58,-4.07 235.49,-14 179.71,-19.89 116.5,-34.9 77.11,-45.29"/>
<polygon fill="#00a000" stroke="#00a000" points="76.14,-41.92 67.38,-47.89 77.94,-48.69 76.14,-41.92"/>
</g>
<!-- incomplete&#45;&gt;offline -->
<g id="edge6" class="edge">
<title>incomplete&#45;&gt;offline</title>
<path fill="none" stroke="#00a000" d="M240.25,-44.03C194.33,-46.76 127.57,-50.72 83.64,-53.33"/>
<polygon fill="#00a000" stroke="#00a000" points="83.32,-49.84 73.54,-53.93 83.73,-56.83 83.32,-49.84"/>
</g>
<!-- active&#45;&gt;stopping -->
<g id="edge10" class="edge">
<title>active&#45;&gt;stopping</title>
<path fill="none" stroke="#00a000" d="M449.46,-41.89C463.64,-38.25 481.26,-33.72 497.34,-29.59"/>
<polygon fill="#00a000" stroke="#00a000" points="498.29,-32.96 507.11,-27.08 496.55,-26.18 498.29,-32.96"/>
</g>
<!-- repeering -->
<g id="node7" class="node">
<title>repeering</title>
<polygon fill="#a0a000" stroke="black" stroke-width="0" points="594.84,-83 569.16,-101 517.8,-101 492.12,-83 517.8,-65 569.16,-65 594.84,-83"/>
<text text-anchor="middle" x="543.48" y="-79.9" font-family="sans-serif" font-size="12.00" fill="white">repeering</text>
</g>
<!-- active&#45;&gt;repeering -->
<g id="edge7" class="edge">
<title>active&#45;&gt;repeering</title>
<path fill="none" stroke="#00a000" d="M448.85,-56.63C462.9,-60.59 480.44,-65.53 496.53,-70.06"/>
<polygon fill="#00a000" stroke="#00a000" points="495.74,-73.47 506.32,-72.82 497.64,-66.74 495.74,-73.47"/>
</g>
<!-- repeering&#45;&gt;peering -->
<g id="edge8" class="edge">
<title>repeering&#45;&gt;peering</title>
<path fill="none" stroke="#00a000" d="M495.33,-85.27C451.99,-87.36 387.93,-90.44 343.63,-92.58"/>
<polygon fill="#00a000" stroke="#00a000" points="343.2,-89.09 333.38,-93.07 343.54,-96.09 343.2,-89.09"/>
</g>
</g>
</svg>


49
mon/.eslintrc.js Normal file
View File

@ -0,0 +1,49 @@
module.exports = {
"env": {
"es6": true,
"node": true
},
"extends": [
"eslint:recommended",
"plugin:node/recommended"
],
"parserOptions": {
"ecmaVersion": 2020
},
"plugins": [
],
"rules": {
"indent": [
"error",
4
],
"brace-style": [
"error",
"allman",
{ "allowSingleLine": true }
],
"linebreak-style": [
"error",
"unix"
],
"semi": [
"error",
"always"
],
"no-useless-escape": [
"off"
],
"no-control-regex": [
"off"
],
"no-empty": [
"off"
],
"no-process-exit": [
"off"
],
"node/shebang": [
"off"
]
}
};

View File

@ -97,7 +97,6 @@ function scale_pg_history(prev_pg_history, prev_pgs, new_pgs)
function scale_pg_count(prev_pgs, new_pg_count)
{
const old_pg_count = prev_pgs.length;
// Just for the lp_solve optimizer - pick a "previous" PG for each "new" one
if (prev_pgs.length < new_pg_count)
{

409
mon/dsl_pgs.js Normal file
View File

@ -0,0 +1,409 @@
const { select_murmur3 } = require('./murmur3.js');
const NO_OSD = 'Z';
class RuleCombinator
{
constructor(osd_tree, rules, max_combinations, ordered)
{
this.osd_tree = index_tree(Object.values(osd_tree).filter(o => o.id));
this.rules = rules;
this.max_combinations = max_combinations;
this.ordered = ordered;
}
random_combinations()
{
return random_custom_combinations(this.osd_tree, this.rules, this.max_combinations, this.ordered);
}
check_combinations(pgs)
{
return check_custom_combinations(this.osd_tree, this.rules, pgs);
}
}
// Convert alternative "level-index" format to rules
// level_index = { [level: string]: string | string[] }
// level_sequence = optional, levels from upper to lower, i.e. [ 'dc', 'host' ]
// Example: level_index = { dc: "112233", host: "ABCDEF" }
function parse_level_indexes(level_index, level_sequence)
{
const rules = [];
const lvl_first = {};
for (const level in level_index)
{
const idx = level_index[level];
while (rules.length < idx.length)
{
rules.push([]);
}
const seen = {};
for (let i = 0; i < idx.length; i++)
{
if (!seen[idx[i]])
{
const other = Object.values(seen);
if (other.length)
{
rules[i].push([ level, '!=', other ]);
}
seen[idx[i]] = i+1;
}
else
{
rules[i].push([ level, '=', seen[idx[i]] ]);
}
}
lvl_first[level] = seen;
}
if (level_sequence)
{
// Prune useless rules for the sake of prettiness
// For simplicity, call "upper" level DC and "lower" level host
const level_prio = Object.keys(level_sequence).reduce((a, c) => { a[level_sequence[c]] = c; return a; }, {});
for (let upper_i = 0; upper_i < level_sequence.length-1; upper_i++)
{
const upper_level = level_sequence[upper_i];
for (let i = 0; i < rules.length; i++)
{
const noteq = {};
for (let k = 0; k < level_index[upper_level].length; k++)
{
// If upper_level[x] is different from upper_level[y]
// then lower_level[x] is also different from lower_level[y]
if (level_index[upper_level][k] != level_index[upper_level][i])
{
noteq[k+1] = true;
}
}
for (let j = 0; j < rules[i].length; j++)
{
if (level_prio[rules[i][j][0]] != null && level_prio[rules[i][j][0]] > upper_i && rules[i][j][1] == '!=')
{
rules[i][j][2] = rules[i][j][2].filter(other_host => !noteq[other_host]);
if (!rules[i][j][2].length)
{
rules[i].splice(j--, 1);
}
}
}
}
}
}
return rules;
}
// Parse rules in DSL format
// dsl := item | item ("\n" | ",") items
// item := "any" | rules
// rules := rule | rule rules
// rule := level operator arg
// level := /\w+/
// operator := "!=" | "=" | ">" | "?="
// arg := value | "(" values ")"
// values := value | value "," values
// value := item_ref | constant_id
// item_ref := /\d+/
// constant_id := /"([^"]+)"/
//
// Output: [ level, operator, value ][][]
function parse_pg_dsl(text)
{
const tokens = [ ...text.matchAll(/\w+|!=|\?=|[>=\(\),\n]|"([^\"]+)"/g) ].map(t => [ t[0], t.index ]);
let positions = [ [] ];
let rules = positions[0];
for (let i = 0; i < tokens.length; )
{
if (tokens[i][0] === '\n' || tokens[i][0] === ',')
{
rules = [];
positions.push(rules);
i++;
}
else if (!rules.length && tokens[i][0] === 'any' && (i == tokens.length-1 || tokens[i+1][0] === ',' || tokens[i+1][0] === '\n'))
{
i++;
}
else
{
if (!/^\w/.exec(tokens[i][0]))
{
throw new Error('Unexpected '+tokens[i][0]+' at '+tokens[i][1]+' (level name expected)');
}
if (i > tokens.length-3)
{
throw new Error('Unexpected EOF (operator and value expected)');
}
if (/^\w/.exec(tokens[i+1][0]) || tokens[i+1][0] === ',' || tokens[i+1][0] === '\n')
{
throw new Error('Unexpected '+tokens[i+1][0]+' at '+tokens[i+1][1]+' (operator expected)');
}
if (!/^[\w"(]/.exec(tokens[i+2][0])) // "
{
throw new Error('Unexpected '+tokens[i+2][0]+' at '+tokens[i+2][1]+' (id, round brace, number or node ID expected)');
}
let rule = [ tokens[i][0], tokens[i+1][0], tokens[i+2][0] ];
i += 3;
if (rule[2][0] == '"')
{
rule[2] = { id: rule[2].substr(1, rule[2].length-2) };
}
else if (rule[2] === '(')
{
rule[2] = [];
// eslint-disable-next-line no-constant-condition
while (true)
{
if (i > tokens.length-1)
{
throw new Error('Unexpected EOF (expected list and a closing round brace)');
}
if (tokens[i][0] === ',')
{
i++;
}
else if (tokens[i][0] === ')')
{
i++;
break;
}
else if (tokens[i][0][0] === '"')
{
rule[2].push({ id: tokens[i][0].substr(1, tokens[i][0].length-2) });
i++;
}
else if (/^\d+$/.exec(tokens[i][0]))
{
const n = 0|tokens[i][0];
if (!n)
{
throw new Error('Level reference cannot be 0 (refs count from 1) at '+tokens[i][1]);
}
else if (n > positions.length)
{
throw new Error('Forward references are forbidden at '+tokens[i][1]);
}
rule[2].push(n);
i++;
}
else if (!/^\w/.exec(tokens[i][0]))
{
throw new Error('Unexpected '+tokens[i][0]+' at '+tokens[i][1]+' (number or node ID expected)');
}
else
{
rule[2].push({ id: tokens[i][0] });
i++;
}
}
}
else if (!/^\d+$/.exec(rule[2]))
{
rule[2] = { id: rule[2] };
}
else
{
rule[2] = 0|rule[2];
if (!rule[2])
{
throw new Error('Level reference cannot be 0 (refs count from 1) at '+tokens[i-1][1]);
}
else if (rule[2] > positions.length)
{
throw new Error('Forward references are forbidden at '+tokens[i-1][1]);
}
}
rules.push(rule);
}
}
return positions;
}
// osd_tree = index_tree() output
// levels = { string: number }
// rules = [ level, operator, value ][][]
// level = string
// operator = '=' | '!=' | '>' | '?='
// value = number|number[] | { id: string|string[] }
// examples:
// 1) simple 3 replicas with failure_domain=host:
// [ [], [ [ 'host', '!=', 1 ] ], [ [ 'host', '!=', [ 1, 2 ] ] ] ]
// in DSL form: any, host!=1, host!=(1,2)
// 2) EC 4+2 in 3 DC:
// [ [], [ [ 'dc', '=', 1 ], [ 'host', '!=', 1 ] ],
// [ 'dc', '!=', 1 ], [ [ 'dc', '=', 3 ], [ 'host', '!=', 3 ] ],
// [ 'dc', '!=', [ 1, 3 ] ], [ [ 'dc', '=', 5 ], [ 'host', '!=', 5 ] ] ]
// in DSL form: any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5
// 3) 1 replica in fixed DC + 2 in random DCs:
// [ [ [ 'dc', '=', { id: 'meow' } ] ], [ [ 'dc', '!=', 1 ] ], [ [ 'dc', '!=', [ 1, 2 ] ] ] ]
// in DSL form: dc=meow, dc!=1, dc!=(1,2)
// 4) 2 replicas in each DC (almost the same as (2)):
// DSL: any, dc=1 host!=1, dc!=1, dc=3 host!=3
// Alternative simpler way to specify rules would be: [ DC: 112233 HOST: 123456 ]
function random_custom_combinations(osd_tree, rules, count, ordered)
{
const r = {};
const first = filter_tree_by_rules(osd_tree, rules[0], []);
let max_size = 0;
// All combinations for the first item (usually "any") to try to include each OSD at least once
for (const f of first)
{
const selected = [ f ];
for (let i = 1; i < rules.length; i++)
{
const filtered = filter_tree_by_rules(osd_tree, rules[i], selected);
const idx = select_murmur3(filtered.length, i => 'p:'+f.id+':'+filtered[i].id);
selected.push(idx == null ? { levels: {}, id: null } : filtered[idx]);
}
const size = selected.filter(s => s.id !== null).length;
max_size = max_size < size ? size : max_size;
const pg = selected.map(s => s.id === null ? NO_OSD : (0|s.id));
if (!ordered)
pg.sort();
r['pg_'+pg.join('_')] = pg;
}
// Pseudo-random selection
for (let n = 0; n < count; n++)
{
const selected = [];
for (const item_rules of rules)
{
const filtered = selected.length ? filter_tree_by_rules(osd_tree, item_rules, selected) : first;
const idx = select_murmur3(filtered.length, i => n+':'+filtered[i].id);
selected.push(idx == null ? { levels: {}, id: null } : filtered[idx]);
}
const size = selected.filter(s => s.id !== null).length;
max_size = max_size < size ? size : max_size;
const pg = selected.map(s => s.id === null ? NO_OSD : (0|s.id));
if (!ordered)
pg.sort();
r['pg_'+pg.join('_')] = pg;
}
// Exclude PGs with less successful selections than maximum
for (const k in r)
{
if (r[k].filter(s => s !== NO_OSD).length < max_size)
{
delete r[k];
}
}
return r;
}
function filter_tree_by_rules(osd_tree, rules, selected)
{
let cur = osd_tree[''].children;
for (const rule of rules)
{
const val = (rule[2] instanceof Array ? rule[2] : [ rule[2] ])
.map(v => v instanceof Object ? v.id : selected[v-1].levels[rule[0]]);
let preferred = [], other = [];
for (let i = 0; i < cur.length; i++)
{
const item = cur[i];
const level_id = item.levels[rule[0]];
if (level_id)
{
if (rule[1] == '>' && val.filter(v => level_id <= v).length == 0 ||
(rule[1] == '=' || rule[1] == '?=') && val.filter(v => level_id != v).length == 0 ||
rule[1] == '!=' && val.filter(v => level_id == v).length == 0)
{
// Include
preferred.push(item);
}
else if (rule[1] == '?=' && val.filter(v => level_id != v).length > 0)
{
// Non-preferred
other.push(item);
}
}
else if (item.children)
{
// Descend
cur.splice(i+1, 0, ...item.children);
}
}
cur = preferred.length ? preferred : other;
}
// Get leaf items
for (let i = 0; i < cur.length; i++)
{
if (cur[i].children)
{
// Descend
cur.splice(i, 1, ...cur[i].children);
i--;
}
}
return cur;
}
// Convert from
// node_list = { id: string|number, level: string, size?: number, parent?: string|number }[]
// to
// node_tree = { [node_id]: { id, level, size?, parent?, children?: child_node_id[], levels: { [level]: id, ... } } }
function index_tree(node_list)
{
const tree = { '': { children: [], levels: {} } };
for (const node of node_list)
{
tree[node.id] = { ...node, levels: {} };
delete tree[node.id].children;
}
for (const node of node_list)
{
const parent_id = node.parent && tree[node.parent] ? node.parent : '';
tree[parent_id].children = tree[parent_id].children || [];
tree[parent_id].children.push(tree[node.id]);
}
const cur = tree[''].children;
for (let i = 0; i < cur.length; i++)
{
cur[i].levels[cur[i].level] = cur[i].id;
if (cur[i].children)
{
for (const child of cur[i].children)
{
child.levels = { ...cur[i].levels, ...child.levels };
}
cur.splice(i, 1, ...cur[i].children);
i--;
}
}
return tree;
}
// selection = id[]
// osd_tree = index_tree output
// rules = parse_pg_dsl output
function check_custom_combinations(osd_tree, rules, pgs)
{
const res = [];
skip_pg: for (const pg of pgs)
{
let selected = pg.map(id => osd_tree[id] || null);
for (let i = 0; i < rules.length; i++)
{
const filtered = filter_tree_by_rules(osd_tree, rules[i], selected);
if (selected[i] === null && filtered.length ||
!filtered.filter(ok => selected[i].id === ok.id).length)
{
continue skip_pg;
}
}
res.push(pg);
}
return res;
}
module.exports = {
RuleCombinator,
NO_OSD,
index_tree,
parse_level_indexes,
parse_pg_dsl,
random_custom_combinations,
check_custom_combinations,
};

View File

@ -50,15 +50,15 @@ async function lp_solve(text)
return { score, vars };
}
async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
// osd_weights = { [id]: weight }
async function optimize_initial({ osd_weights, combinator, pg_count, pg_size = 3, pg_minsize = 2, parity_space = 1, ordered = false })
{
if (!pg_count || !osd_tree)
if (!pg_count || !osd_weights)
{
return null;
}
const all_weights = Object.assign({}, ...Object.values(osd_tree));
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
const all_pgs = Object.values(random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1));
const total_weight = Object.values(osd_weights).reduce((a, c) => Number(a) + Number(c), 0);
const all_pgs = Object.values(make_cyclic(combinator.random_combinations(), parity_space));
const pg_per_osd = {};
for (const pg of all_pgs)
{
@ -69,15 +69,15 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
pg_per_osd[osd].push((i >= pg_minsize ? parity_space+'*' : '')+"pg_"+pg.join("_"));
}
}
const pg_effsize = Math.min(pg_minsize, Object.keys(osd_tree).length)
+ Math.max(0, Math.min(pg_size, Object.keys(osd_tree).length) - pg_minsize) * parity_space;
let pg_effsize = all_pgs.reduce((a, c) => Math.max(a, c.filter(e => e != NO_OSD).length), 0);
pg_effsize = Math.min(pg_minsize, pg_effsize) + Math.max(0, Math.min(pg_size, pg_effsize) - pg_minsize) * parity_space;
let lp = '';
lp += "max: "+all_pgs.map(pg => 'pg_'+pg.join('_')).join(' + ')+";\n";
for (const osd in pg_per_osd)
{
if (osd !== NO_OSD)
{
let osd_pg_count = all_weights[osd]/total_weight*pg_effsize*pg_count;
let osd_pg_count = osd_weights[osd]/total_weight*pg_effsize*pg_count;
lp += pg_per_osd[osd].join(' + ')+' <= '+osd_pg_count+';\n';
}
}
@ -93,7 +93,7 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
throw new Error('Problem is infeasible or unbounded - is it a bug?');
}
const int_pgs = make_int_pgs(lp_result.vars, pg_count, ordered);
const eff = pg_list_space_efficiency(int_pgs, all_weights, pg_minsize, parity_space);
const eff = pg_list_space_efficiency(int_pgs, osd_weights, pg_minsize, parity_space);
const res = {
score: lp_result.score,
weights: lp_result.vars,
@ -104,6 +104,22 @@ async function optimize_initial({ osd_tree, pg_count, pg_size = 3, pg_minsize =
return res;
}
function make_cyclic(pgs, parity_space)
{
if (parity_space > 1)
{
for (const pg_name in pgs)
{
const pg = pgs[pg_name];
for (let i = 1; i < pg.length; i++)
{
const cyclic = [ ...pg.slice(i), ...pg.slice(0, i) ];
pgs['pg_'+cyclic.join('_')] = cyclic;
}
}
}
return pgs;
}
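// For illustration: with parity_space > 1 (EC pools), data and parity chunks are weighted
// differently, so chunk order matters; e.g. assuming an input of { pg_1_2_3: [ 1, 2, 3 ] },
// make_cyclic() also registers the rotations [ 2, 3, 1 ] and [ 3, 1, 2 ] as separate PGs.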
function shuffle(array)
{
for (let i = array.length - 1, j, x; i > 0; i--)
@ -199,7 +215,7 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
{
const intersect_count = ordered
? pg.reduce((a, osd, i) => a + (prev_hash[osd] == 1+i ? 1 : 0), 0)
: pg.reduce((a, osd, i) => a + (prev_hash[osd] ? 1 : 0), 0);
: pg.reduce((a, osd) => a + (prev_hash[osd] ? 1 : 0), 0);
if (max_int < intersect_count)
{
max_int = intersect_count;
@ -216,47 +232,17 @@ function calc_intersect_weights(old_pg_size, pg_size, pg_count, prev_weights, al
return move_weights;
}
function add_valid_previous(osd_tree, prev_weights, all_pgs)
{
// Add previous combinations that are still valid
const hosts = Object.keys(osd_tree).sort();
const host_per_osd = {};
for (const host in osd_tree)
{
for (const osd in osd_tree[host])
{
host_per_osd[osd] = host;
}
}
skip_pg: for (const pg_name in prev_weights)
{
const seen_hosts = {};
const pg = pg_name.substr(3).split(/_/);
for (const osd of pg)
{
if (!host_per_osd[osd] || seen_hosts[host_per_osd[osd]])
{
continue skip_pg;
}
seen_hosts[host_per_osd[osd]] = true;
}
if (!all_pgs[pg_name])
{
all_pgs[pg_name] = pg;
}
}
}
// Try to minimize data movement
async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3, pg_minsize = 2, max_combinations = 10000, parity_space = 1, ordered = false })
async function optimize_change({ prev_pgs: prev_int_pgs, osd_weights, combinator, pg_size = 3, pg_minsize = 2, parity_space = 1, ordered = false })
{
if (!osd_tree)
if (!osd_weights)
{
return null;
}
// FIXME: use parity_chunks with parity_space instead of pg_minsize
const pg_effsize = Math.min(pg_minsize, Object.keys(osd_tree).length)
+ Math.max(0, Math.min(pg_size, Object.keys(osd_tree).length) - pg_minsize) * parity_space;
let all_pgs = make_cyclic(combinator.random_combinations(), parity_space);
let pg_effsize = Object.values(all_pgs).reduce((a, c) => Math.max(a, c.filter(e => e != NO_OSD).length), 0);
pg_effsize = Math.min(pg_minsize, pg_effsize) + Math.max(0, Math.min(pg_size, pg_effsize) - pg_minsize) * parity_space;
const pg_count = prev_int_pgs.length;
const prev_weights = {};
const prev_pg_per_osd = {};
@ -273,10 +259,13 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
}
const old_pg_size = prev_int_pgs[0].length;
// Get all combinations
let all_pgs = random_combinations(osd_tree, pg_size, max_combinations, parity_space > 1);
if (old_pg_size == pg_size)
{
add_valid_previous(osd_tree, prev_weights, all_pgs);
const still_valid = combinator.check_combinations(Object.keys(prev_weights).map(pg_name => pg_name.substr(3).split('_')));
for (const pg of still_valid)
{
all_pgs['pg_'+pg.join('_')] = pg;
}
}
all_pgs = Object.values(all_pgs);
const pg_per_osd = {};
@ -295,8 +284,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
// Calculate total weight - old PG weights
const all_pg_names = all_pgs.map(pg => 'pg_'+pg.join('_'));
const all_pgs_hash = all_pg_names.reduce((a, c) => { a[c] = true; return a; }, {});
const all_weights = Object.assign({}, ...Object.values(osd_tree));
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
const total_weight = Object.values(osd_weights).reduce((a, c) => Number(a) + Number(c), 0);
// Generate the LP problem
let lp = '';
lp += 'max: '+all_pg_names.map(pg_name => (
@ -311,7 +299,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
)).join(' + ');
const rm_osd_pg_count = (prev_pg_per_osd[osd]||[])
.reduce((a, [ old_pg_name, space ]) => (a + (all_pgs_hash[old_pg_name] ? space : 0)), 0);
const osd_pg_count = all_weights[osd]*pg_effsize/total_weight*pg_count - rm_osd_pg_count;
const osd_pg_count = osd_weights[osd]*pg_effsize/total_weight*pg_count - rm_osd_pg_count;
lp += osd_sum + ' <= ' + osd_pg_count + ';\n';
}
}
@ -421,7 +409,7 @@ async function optimize_change({ prev_pgs: prev_int_pgs, osd_tree, pg_size = 3,
int_pgs: new_pgs,
differs,
osd_differs,
space: pg_effsize * pg_list_space_efficiency(new_pgs, all_weights, pg_minsize, parity_space),
space: pg_effsize * pg_list_space_efficiency(new_pgs, osd_weights, pg_minsize, parity_space),
total_space: total_weight,
};
}
@ -502,198 +490,6 @@ function put_aligned_pgs(aligned_pgs, int_pgs, prev_int_pgs, keygen)
}
}
// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
// levels = { string: number }
// to a two-level osd_tree suitable for all_combinations()
function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
{
osd_level = levels[osd_level] || osd_level;
failure_domain_level = levels[failure_domain_level] || failure_domain_level;
for (const node of osd_tree)
{
if ((levels[node.level] || node.level) < failure_domain_level)
{
flatten_tree(node.children||[], levels, failure_domain_level, osd_level, domains, i);
}
else
{
domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_level);
}
}
return domains;
}
function extract_osds(osd_tree, levels, osd_level, osds = {})
{
for (const node of osd_tree)
{
if ((levels[node.level] || node.level) >= osd_level)
{
osds[node.id] = node.size;
}
else
{
extract_osds(node.children||[], levels, osd_level, osds);
}
}
return osds;
}
// ordered = don't treat (x,y) and (y,x) as equal
function random_combinations(osd_tree, pg_size, count, ordered)
{
let seed = 0x5f020e43;
let rng = () =>
{
seed ^= seed << 13;
seed ^= seed >> 17;
seed ^= seed << 5;
return seed + 2147483648;
};
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
const r = {};
// Generate random combinations including each OSD at least once
for (let h = 0; h < hosts.length; h++)
{
for (let o = 0; o < osds[hosts[h]].length; o++)
{
const pg = [ osds[hosts[h]][o] ];
const cur_hosts = [ ...hosts ];
cur_hosts.splice(h, 1);
for (let i = 1; i < pg_size && i < hosts.length; i++)
{
const next_host = rng() % cur_hosts.length;
const next_osd = rng() % osds[cur_hosts[next_host]].length;
pg.push(osds[cur_hosts[next_host]][next_osd]);
cur_hosts.splice(next_host, 1);
}
const cyclic_pgs = [ pg ];
if (ordered)
{
for (let i = 1; i < pg.size; i++)
{
cyclic_pgs.push([ ...pg.slice(i), ...pg.slice(0, i) ]);
}
}
for (const pg of cyclic_pgs)
{
while (pg.length < pg_size)
{
pg.push(NO_OSD);
}
r['pg_'+pg.join('_')] = pg;
}
}
}
// Generate purely random combinations
while (count > 0)
{
let host_idx = [];
const cur_hosts = [ ...hosts.map((h, i) => i) ];
const max_hosts = pg_size < hosts.length ? pg_size : hosts.length;
if (ordered)
{
for (let i = 0; i < max_hosts; i++)
{
const r = rng() % cur_hosts.length;
host_idx[i] = cur_hosts[r];
cur_hosts.splice(r, 1);
}
}
else
{
for (let i = 0; i < max_hosts; i++)
{
const r = rng() % (cur_hosts.length - (max_hosts - i - 1));
host_idx[i] = cur_hosts[r];
cur_hosts.splice(0, r+1);
}
}
let pg = host_idx.map(h => osds[hosts[h]][rng() % osds[hosts[h]].length]);
while (pg.length < pg_size)
{
pg.push(NO_OSD);
}
r['pg_'+pg.join('_')] = pg;
count--;
}
return r;
}
// Super-stupid algorithm. Given the current OSD tree, generate all possible OSD combinations
// osd_tree = { failure_domain1: { osd1: size1, ... }, ... }
// ordered = return combinations without duplicates having different order
function all_combinations(osd_tree, pg_size, ordered, count)
{
const hosts = Object.keys(osd_tree).sort();
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
while (hosts.length < pg_size)
{
osds[NO_OSD] = [ NO_OSD ];
hosts.push(NO_OSD);
}
let host_idx = [];
let osd_idx = [];
for (let i = 0; i < pg_size; i++)
{
host_idx.push(i);
osd_idx.push(0);
}
const r = [];
while (!count || count < 0 || r.length < count)
{
r.push(host_idx.map((hi, i) => osds[hosts[hi]][osd_idx[i]]));
let inc = pg_size-1;
while (inc >= 0)
{
osd_idx[inc]++;
if (osd_idx[inc] >= osds[hosts[host_idx[inc]]].length)
{
osd_idx[inc] = 0;
inc--;
}
else
{
break;
}
}
if (inc < 0)
{
// no osds left in the current host combination, select the next one
inc = pg_size-1;
same_again: while (inc >= 0)
{
host_idx[inc]++;
for (let prev_host = 0; prev_host < inc; prev_host++)
{
if (host_idx[prev_host] == host_idx[inc])
{
continue same_again;
}
}
if (host_idx[inc] < (ordered ? hosts.length-(pg_size-1-inc) : hosts.length))
{
while ((++inc) < pg_size)
{
host_idx[inc] = (ordered ? host_idx[inc-1]+1 : 0);
}
break;
}
else
{
inc--;
}
}
if (inc < 0)
{
break;
}
}
}
return r;
}
function pg_weights_space_efficiency(weights, pg_count, osd_sizes)
{
const per_osd = {};
@ -752,11 +548,8 @@ module.exports = {
pg_weights_space_efficiency,
pg_list_space_efficiency,
pg_per_osd_space_efficiency,
flatten_tree,
lp_solve,
make_int_pgs,
align_pgs,
random_combinations,
all_combinations,
};

View File

@ -6,6 +6,8 @@ const http = require('http');
const crypto = require('crypto');
const os = require('os');
const WebSocket = require('ws');
const { RuleCombinator, parse_level_indexes, parse_pg_dsl } = require('./dsl_pgs.js');
const { SimpleCombinator, flatten_tree } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
const stableStringify = require('./stable-stringify.js');
const PGUtil = require('./PGUtil.js');
@ -35,9 +37,8 @@ const etcd_allow = new RegExp('^'+[
'pg/state/[1-9]\\d*/[1-9]\\d*',
'pg/stats/[1-9]\\d*/[1-9]\\d*',
'pg/history/[1-9]\\d*/[1-9]\\d*',
'pool/stats/[1-9]\\d*',
'history/last_clean_pgs',
'inode/stats/[1-9]\\d*/[1-9]\\d*',
'inode/stats/[1-9]\\d*/\\d+',
'pool/stats/[1-9]\\d*',
'stats',
'index/image/.*',
@ -63,6 +64,7 @@ const etcd_tree = {
mon_stats_timeout: 1000, // ms. min: 100
osd_out_time: 600, // seconds. min: 0
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
use_old_pg_combinator: false,
// client and osd
tcp_header_buffer_size: 65536,
use_sync_send_recv: false,
@ -86,13 +88,16 @@ const etcd_tree = {
client_max_buffered_bytes: 33554432,
client_max_buffered_ops: 1024,
client_max_writeback_iodepth: 256,
client_retry_interval: 50, // ms. min: 10
client_eio_retry_interval: 1000, // ms
client_retry_enospc: true,
osd_nearfull_ratio: 0.95,
// client and osd - configurable online
log_level: 0,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
osd_idle_timeout: 5, // seconds. min: 1
osd_ping_timeout: 5, // seconds. min: 1
up_wait_retry_interval: 50, // ms. min: 10
max_etcd_attempts: 5,
etcd_quick_timeout: 1000, // ms
etcd_slow_timeout: 5000, // ms
@ -184,7 +189,12 @@ const etcd_tree = {
// number of parity chunks, required for EC
parity_chunks?: 1,
pg_count: 100,
failure_domain: 'host',
// default is failure_domain=host
failure_domain?: 'host',
// additional failure domain rules; failure_domain=x is equivalent to x=123..N
level_placement?: 'dc=112233 host=123456',
raw_placement?: 'any, dc=1 host!=1, dc=1 host!=(1,2)',
old_combinator: false,
max_osd_combinations: 10000,
// block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
block_size: 131072,
@ -204,7 +214,7 @@ const etcd_tree = {
}, */
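// For illustration (assumed reading of the level_placement DSL documented above): each
// character position is one PG slot and equal characters must map to the same node of that
// level, so 'dc=112233 host=123456' spreads a 6-slot PG over 3 datacenters, two slots per
// dc, with every slot on a distinct host.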
pools: {},
osd: {
/* <id>: { reweight?: 1, tags?: [ 'nvme', ... ] }, ... */
/* <id>: { reweight?: 1, tags?: [ 'nvme', ... ], noout?: true }, ... */
},
/* pgs: {
hash: string,
@ -574,7 +584,7 @@ class Mon
now = Date.now();
}
tried[base] = now;
const ok = await new Promise((ok, no) =>
const ok = await new Promise(ok =>
{
const timer_id = setTimeout(() =>
{
@ -736,6 +746,7 @@ class Mon
this.save_last_clean_running = true;
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
const new_clean_pgs = { items: {} };
// eslint-disable-next-line indent
next_pool:
for (const pool_id in this.state.config.pools)
{
@ -818,6 +829,7 @@ class Mon
async become_master()
{
const state = { ...this.get_mon_state(), id: ''+this.etcd_lease_id };
// eslint-disable-next-line no-constant-condition
while (1)
{
const res = await this.etcd_call('/kv/txn', {
@ -860,34 +872,21 @@ class Mon
const levels = this.config.placement_levels||{};
levels.host = levels.host || 100;
levels.osd = levels.osd || 101;
const tree = { '': { children: [] } };
const tree = {};
let up_osds = {};
for (const node_id in this.state.config.node_placement||{})
{
const node_cfg = this.state.config.node_placement[node_id];
if (/^\d+$/.exec(node_id))
{
node_cfg.level = 'osd';
}
if (!node_id || !node_cfg.level || !levels[node_cfg.level])
{
// All nodes must have non-empty IDs and valid levels
continue;
}
tree[node_id] = { id: node_id, level: node_cfg.level, parent: node_cfg.parent, children: [] };
}
// This requires monitor system time to be in sync with OSD system times (at least to some extent)
const down_time = Date.now()/1000 - this.config.osd_out_time;
for (const osd_num of this.all_osds().sort((a, b) => a - b))
{
const stat = this.state.osd.stats[osd_num];
if (stat && stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
const osd_cfg = this.state.config.osd[osd_num];
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
if (reweight < 0 || isNaN(reweight))
reweight = 1;
if (stat && stat.size && reweight && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time ||
osd_cfg && osd_cfg.noout))
{
// Numeric IDs are reserved for OSDs
const osd_cfg = this.state.config.osd[osd_num];
let reweight = osd_cfg && Number(osd_cfg.reweight);
if (reweight < 0 || isNaN(reweight))
reweight = 1;
if (this.state.osd.state[osd_num] && reweight > 0)
{
// React to down OSDs immediately
@ -915,6 +914,43 @@ class Mon
}
}
}
for (const node_id in this.state.config.node_placement||{})
{
const node_cfg = this.state.config.node_placement[node_id];
if (/^\d+$/.exec(node_id))
{
node_cfg.level = 'osd';
}
if (!node_id || !node_cfg.level || !levels[node_cfg.level] ||
node_cfg.level === 'osd' && !tree[node_id])
{
// All nodes must have non-empty IDs and valid levels
// OSDs have to actually exist
continue;
}
tree[node_id] = tree[node_id] || {};
tree[node_id].id = node_id;
tree[node_id].level = node_cfg.level;
tree[node_id].parent = node_cfg.parent;
if (node_cfg.level !== 'osd')
{
tree[node_id].children = [];
}
}
return { up_osds, levels, osd_tree: tree };
}
make_hier_tree(tree)
{
const levels = this.config.placement_levels||{};
levels.host = levels.host || 100;
levels.osd = levels.osd || 101;
tree = { ...tree };
for (const node_id in tree)
{
tree[node_id] = { ...tree[node_id], children: [] };
}
tree[''] = { children: [] };
for (const node_id in tree)
{
if (node_id === '' || tree[node_id].level === 'osd' && (!tree[node_id].size || tree[node_id].size <= 0))
@ -929,9 +965,27 @@ class Mon
// Parent's level must be less than child's; OSDs must be leaves
const parent = parent_level && parent_level < node_level ? node_cfg.parent : '';
tree[parent].children.push(tree[node_id]);
delete node_cfg.parent;
}
return { up_osds, levels, osd_tree: tree };
// Delete empty nodes
let deleted = 0;
do
{
deleted = 0;
for (const node_id in tree)
{
if (tree[node_id].level !== 'osd' && (!tree[node_id].children || !tree[node_id].children.length))
{
const parent = tree[node_id].parent;
if (parent)
{
tree[parent].children = tree[parent].children.filter(c => c != tree[node_id]);
}
deleted++;
delete tree[node_id];
}
}
} while (deleted > 0);
return tree;
}
async stop_all_pgs(pool_id)
@ -967,7 +1021,7 @@ class Mon
const key = b64(this.etcd_prefix+'/osd/state/'+osd_num);
checks.push({ key, target: 'MOD', result: 'LESS', mod_revision: ''+this.etcd_watch_revision });
}
const res = await this.etcd_call('/kv/txn', {
await this.etcd_call('/kv/txn', {
compare: [
{ key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
{ key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
@ -1095,7 +1149,6 @@ class Mon
pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize);
pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
pool_cfg.failure_domain = pool_cfg.failure_domain || 'host';
pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
if (!/^[1-9]\d*$/.exec(''+pool_id))
{
@ -1175,10 +1228,45 @@ class Mon
console.log('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)');
return false;
}
if (!this.get_pg_rules(pool_id, pool_cfg, true))
{
return false;
}
return true;
}
filter_osds_by_tags(orig_tree, flat_tree, tags)
filter_osds_by_root_node(pool_tree, root_node)
{
if (!root_node)
{
return;
}
let hier_tree = this.make_hier_tree(pool_tree);
let included = [ ...(hier_tree[root_node] || {}).children||[] ];
for (let i = 0; i < included.length; i++)
{
if (included[i].children)
{
included.splice(i+1, 0, ...included[i].children);
}
}
let cur = pool_tree[root_node] || {};
while (cur && cur.id)
{
included.unshift(cur);
cur = pool_tree[cur.parent||''];
}
included = included.reduce((a, c) => { a[c.id||''] = true; return a; }, {});
for (const item in pool_tree)
{
if (!included[item])
{
delete pool_tree[item];
}
}
}
filter_osds_by_tags(orig_tree, tags)
{
if (!tags)
{
@ -1186,30 +1274,22 @@ class Mon
}
for (const tag of (tags instanceof Array ? tags : [ tags ]))
{
for (const host in flat_tree)
for (const osd in orig_tree)
{
let found = 0;
for (const osd in flat_tree[host])
if (orig_tree[osd].level === 'osd' &&
(!orig_tree[osd].tags || !orig_tree[osd].tags[tag]))
{
if (!orig_tree[osd].tags || !orig_tree[osd].tags[tag])
delete flat_tree[host][osd];
else
found++;
}
if (!found)
{
delete flat_tree[host];
delete orig_tree[osd];
}
}
}
}
filter_osds_by_block_layout(flat_tree, block_size, bitmap_granularity, immediate_commit)
filter_osds_by_block_layout(orig_tree, block_size, bitmap_granularity, immediate_commit)
{
for (const host in flat_tree)
for (const osd in orig_tree)
{
let found = 0;
for (const osd in flat_tree[host])
if (orig_tree[osd].level === 'osd')
{
const osd_stat = this.state.osd.stats[osd];
if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size ||
@ -1217,16 +1297,8 @@ class Mon
osd_stat.immediate_commit == 'small' && immediate_commit == 'all' ||
osd_stat.immediate_commit == 'none' && immediate_commit != 'none'))
{
delete flat_tree[host][osd];
delete orig_tree[osd];
}
else
{
found++;
}
}
if (!found)
{
delete flat_tree[host];
}
}
}
@ -1236,12 +1308,84 @@ class Mon
let aff_osds = up_osds;
if (pool_cfg.primary_affinity_tags)
{
aff_osds = { ...up_osds };
this.filter_osds_by_tags(osd_tree, { x: aff_osds }, pool_cfg.primary_affinity_tags);
aff_osds = Object.keys(up_osds).reduce((a, c) => { a[c] = osd_tree[c]; return a; }, {});
this.filter_osds_by_tags(aff_osds, pool_cfg.primary_affinity_tags);
for (const osd in aff_osds)
{
aff_osds[osd] = true;
}
}
return aff_osds;
}
get_pg_rules(pool_id, pool_cfg, warn)
{
if (pool_cfg.level_placement)
{
const pg_size = (0|pool_cfg.pg_size);
let rules = pool_cfg.level_placement;
if (typeof rules === 'string')
{
rules = rules.split(/\s+/).map(s => s.split(/=/, 2)).reduce((a, c) => { a[c[0]] = c[1]; return a; }, {});
}
else
{
rules = { ...rules };
}
// Always add failure_domain to prevent rules from being totally incorrect
const all_diff = [];
for (let i = 1; i <= pg_size; i++)
{
all_diff.push(i);
}
rules[pool_cfg.failure_domain || 'host'] = all_diff;
const levels = this.config.placement_levels||{};
levels.host = levels.host || 100;
levels.osd = levels.osd || 101;
for (const k in rules)
{
if (!levels[k] || typeof rules[k] !== 'string' &&
(!(rules[k] instanceof Array) ||
rules[k].filter(s => typeof s !== 'string' && typeof s !== 'number').length > 0))
{
if (warn)
console.log('Pool '+pool_id+' configuration is invalid: level_placement should be { [level]: string | (string|number)[] }');
return null;
}
else if (rules[k].length != pg_size)
{
if (warn)
console.log('Pool '+pool_id+' configuration is invalid: values in level_placement should contain exactly pg_size ('+pg_size+') items');
return null;
}
}
return parse_level_indexes(rules);
}
else if (typeof pool_cfg.raw_placement === 'string')
{
try
{
return parse_pg_dsl(pool_cfg.raw_placement);
}
catch (e)
{
if (warn)
console.log('Pool '+pool_id+' configuration is invalid: invalid raw_placement: '+e.message);
}
}
else
{
let rules = [ [] ];
let prev = [ 1 ];
for (let i = 1; i < pool_cfg.pg_size; i++)
{
rules.push([ [ pool_cfg.failure_domain||'host', '!=', prev ] ]);
prev = [ ...prev, i+1 ];
}
return rules;
}
}
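// For illustration: without level_placement or raw_placement, the default branch above
// builds one "different failure domain" constraint per extra slot; e.g. for an assumed
// pg_size=3 with failure_domain=host the generated rules are:
//   [ [], [ [ 'host', '!=', [ 1 ] ] ], [ [ 'host', '!=', [ 1, 2 ] ] ] ]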
async generate_pool_pgs(pool_id, osd_tree, levels)
{
const pool_cfg = this.state.config.pools[pool_id];
@ -1249,16 +1393,16 @@ class Mon
{
return null;
}
let pool_tree = osd_tree[pool_cfg.root_node || ''];
pool_tree = pool_tree ? pool_tree.children : [];
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
let pool_tree = { ...osd_tree };
this.filter_osds_by_root_node(pool_tree, pool_cfg.root_node);
this.filter_osds_by_tags(pool_tree, pool_cfg.osd_tags);
this.filter_osds_by_block_layout(
pool_tree,
pool_cfg.block_size || this.config.block_size || 131072,
pool_cfg.bitmap_granularity || this.config.bitmap_granularity || 4096,
pool_cfg.immediate_commit || this.config.immediate_commit || 'none'
);
pool_tree = this.make_hier_tree(pool_tree);
// First try last_clean_pgs to minimize data movement
let prev_pgs = [];
for (const pg in ((this.state.history.last_clean_pgs.items||{})[pool_id]||{}))
@ -1275,11 +1419,15 @@ class Mon
}
const old_pg_count = prev_pgs.length;
const optimize_cfg = {
osd_tree: pool_tree,
osd_weights: Object.values(pool_tree).filter(item => item.level === 'osd').reduce((a, c) => { a[c.id] = c.size; return a; }, {}),
combinator: !this.config.use_old_pg_combinator || pool_cfg.level_placement || pool_cfg.raw_placement
// new algorithm:
? new RuleCombinator(pool_tree, this.get_pg_rules(pool_id, pool_cfg), pool_cfg.max_osd_combinations)
// old algorithm:
: new SimpleCombinator(flatten_tree(pool_tree[''].children, levels, pool_cfg.failure_domain, 'osd'), pool_cfg.pg_size, pool_cfg.max_osd_combinations),
pg_count: pool_cfg.pg_count,
pg_size: pool_cfg.pg_size,
pg_minsize: pool_cfg.pg_minsize,
max_combinations: pool_cfg.max_osd_combinations,
ordered: pool_cfg.scheme != 'replicated',
};
let optimize_result;
@ -1311,7 +1459,15 @@ class Mon
}
console.log(`Pool ${pool_id} (${pool_cfg.name || 'unnamed'}):`);
LPOptimizer.print_change_stats(optimize_result);
const pg_effsize = Math.min(pool_cfg.pg_size, Object.keys(pool_tree).length);
let pg_effsize = pool_cfg.pg_size;
for (const pg of optimize_result.int_pgs)
{
const this_pg_size = pg.filter(osd => osd != LPOptimizer.NO_OSD).length;
if (this_pg_size && this_pg_size < pg_effsize)
{
pg_effsize = this_pg_size;
}
}
return {
pool_id,
pgs: optimize_result.int_pgs,
@ -1348,8 +1504,8 @@ class Mon
// Something has changed
console.log('Pool configuration or OSD tree changed, re-optimizing');
// First re-optimize PGs, but don't look at history yet
const optimize_results = await Promise.all(Object.keys(this.state.config.pools)
.map(pool_id => this.generate_pool_pgs(pool_id, osd_tree, levels)));
const optimize_results = (await Promise.all(Object.keys(this.state.config.pools)
.map(pool_id => this.generate_pool_pgs(pool_id, osd_tree, levels)))).filter(r => r);
// Then apply the modification in the form of an optimistic transaction,
// each time considering new pg/history modifications (OSDs modify it during rebalance)
while (!await this.apply_pool_pgs(optimize_results, up_osds, osd_tree, tree_hash))
@ -1393,11 +1549,14 @@ class Mon
{
continue;
}
const replicated = pool_cfg.scheme === 'replicated';
const aff_osds = this.get_affinity_osds(pool_cfg, up_osds, osd_tree);
this.reset_rng();
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
{
if (!this.state.config.pgs.items[pool_id])
{
continue;
}
const pg_cfg = this.state.config.pgs.items[pool_id][pg_num];
if (pg_cfg)
{
@ -1567,7 +1726,6 @@ class Mon
derive_osd_stats(st, prev, prev_diff)
{
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
{
@ -1607,7 +1765,7 @@ class Mon
}
for (const pool_id in st.inode_stats||{})
{
const pool_diff = diff.inode_stats[pool_id] = {};
diff.inode_stats[pool_id] = {};
for (const inode_num in st.inode_stats[pool_id])
{
const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
@ -1736,8 +1894,11 @@ class Mon
for (const inode_num in this.state.osd.space[osd_num][pool_id])
{
const u = BigInt(this.state.osd.space[osd_num][pool_id][inode_num]||0);
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
inode_stats[pool_id][inode_num].raw_used += u;
if (inode_num)
{
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
inode_stats[pool_id][inode_num].raw_used += u;
}
this.state.pool.stats[pool_id].used_raw_tb += u;
}
}
@ -2022,7 +2183,7 @@ class Mon
_die(err, code)
{
// In fact we can just try to rejoin
console.error(new Error(err || 'Cluster connection failed'));
console.error(err instanceof Error ? err : new Error(err || 'Cluster connection failed'));
process.exit(code || 2);
}
@ -2046,7 +2207,7 @@ class Mon
function POST(url, body, timeout)
{
return new Promise((ok, no) =>
return new Promise(ok =>
{
const body_text = Buffer.from(JSON.stringify(body));
let timer_id = timeout > 0 ? setTimeout(() =>

mon/murmur3.js Normal file (38 lines)
View File

@ -0,0 +1,38 @@
function select_murmur3(count, cb)
{
if (!count)
{
return null;
}
else
{
let i = 0, maxh = -1;
for (let j = 0; j < count; j++)
{
const h = murmur3(cb(j));
if (h > maxh)
{
i = j;
maxh = h;
}
}
return i;
}
}
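// Usage sketch (illustrative, not part of the module): select_murmur3() returns the index
// whose salted hash is largest (a rendezvous-hash style pick), so for a fixed salt the
// result is deterministic and only changes when the current winner disappears or a
// higher-hashing candidate appears:
//   const hosts = [ 'host1', 'host2', 'host3' ];
//   const chosen = hosts[select_murmur3(hosts.length, i => 'pg17:'+hosts[i])];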
function murmur3(s)
{
let hash = 0x12345678;
for (let i = 0; i < s.length; i++)
{
hash ^= s.charCodeAt(i);
hash = (hash*0x5bd1e995) & 0xFFFFFFFF;
hash ^= (hash >> 15);
}
return hash;
}
module.exports = {
murmur3,
select_murmur3,
};

View File

@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "1.4.7",
"version": "1.6.1",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {
@ -11,5 +11,15 @@
"dependencies": {
"sprintf-js": "^1.1.2",
"ws": "^7.2.5"
},
"devDependencies": {
"eslint": "^8.0.0",
"eslint-plugin-node": "^11.1.0"
},
"engines": {
"node": ">=12.0.0"
},
"scripts": {
"lint": "eslint *.js"
}
}

View File

@ -38,7 +38,7 @@ async function run()
const st = await fs.stat(options.device);
options.device_block_size = st.blksize;
if (st.isBlockDevice())
device_size = Number(await system("/sbin/blockdev --getsize64 "+options.device))
device_size = Number(await system("/sbin/blockdev --getsize64 "+options.device));
else
device_size = st.size;
}
@ -91,7 +91,7 @@ async function run()
function system(cmd)
{
return new Promise((ok, no) => child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout, stderr) => (err ? no(err.message) : ok(stdout))));
return new Promise((ok, no) => child_process.exec(cmd, { maxBuffer: 64*1024*1024 }, (err, stdout/*, stderr*/) => (err ? no(err.message) : ok(stdout))));
}
run().catch(err => { console.error(err); process.exit(1); });

mon/simple_pgs.js Normal file (241 lines)
View File

@ -0,0 +1,241 @@
const { select_murmur3 } = require('./murmur3.js');
const NO_OSD = 'Z';
class SimpleCombinator
{
constructor(flat_tree, pg_size, max_combinations, ordered)
{
this.osd_tree = flat_tree;
this.pg_size = pg_size;
this.max_combinations = max_combinations;
this.ordered = ordered;
}
random_combinations()
{
return random_combinations(this.osd_tree, this.pg_size, this.max_combinations, this.ordered);
}
check_combinations(pgs)
{
return check_combinations(this.osd_tree, pgs);
}
}
// Convert multi-level osd_tree = { level: number|string, id?: string, size?: number, children?: osd_tree }[]
// levels = { string: number }
// to a two-level osd_tree suitable for all_combinations()
function flatten_tree(osd_tree, levels, failure_domain_level, osd_level, domains = {}, i = { i: 1 })
{
osd_level = levels[osd_level] || osd_level;
failure_domain_level = levels[failure_domain_level] || failure_domain_level;
for (const node of osd_tree)
{
if ((levels[node.level] || node.level) < failure_domain_level)
{
flatten_tree(node.children||[], levels, failure_domain_level, osd_level, domains, i);
}
else
{
domains['dom'+(i.i++)] = extract_osds([ node ], levels, osd_level);
}
}
return domains;
}
function extract_osds(osd_tree, levels, osd_level, osds = {})
{
for (const node of osd_tree)
{
if ((levels[node.level] || node.level) >= osd_level)
{
osds[node.id] = node.size;
}
else
{
extract_osds(node.children||[], levels, osd_level, osds);
}
}
return osds;
}
// ordered = don't treat (x,y) and (y,x) as equal
function random_combinations(osd_tree, pg_size, count, ordered)
{
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
const hosts = Object.keys(osd_tree).sort().filter(h => osds[h].length > 0);
const r = {};
// Generate random combinations including each OSD at least once
for (let h = 0; h < hosts.length; h++)
{
for (let o = 0; o < osds[hosts[h]].length; o++)
{
const pg = [ osds[hosts[h]][o] ];
const cur_hosts = [ ...hosts ];
cur_hosts.splice(h, 1);
for (let i = 1; i < pg_size && i < hosts.length; i++)
{
const next_host = select_murmur3(cur_hosts.length, i => pg[0]+':i:'+cur_hosts[i]);
const next_osd = select_murmur3(osds[cur_hosts[next_host]].length, i => pg[0]+':i:'+osds[cur_hosts[next_host]][i]);
pg.push(osds[cur_hosts[next_host]][next_osd]);
cur_hosts.splice(next_host, 1);
}
while (pg.length < pg_size)
{
pg.push(NO_OSD);
}
r['pg_'+pg.join('_')] = pg;
}
}
// Generate purely random combinations
while (count > 0)
{
let host_idx = [];
const cur_hosts = [ ...hosts.map((h, i) => i) ];
const max_hosts = pg_size < hosts.length ? pg_size : hosts.length;
if (ordered)
{
for (let i = 0; i < max_hosts; i++)
{
const r = select_murmur3(cur_hosts.length, i => count+':h:'+cur_hosts[i]);
host_idx[i] = cur_hosts[r];
cur_hosts.splice(r, 1);
}
}
else
{
for (let i = 0; i < max_hosts; i++)
{
const r = select_murmur3(cur_hosts.length - (max_hosts - i - 1), i => count+':h:'+cur_hosts[i]);
host_idx[i] = cur_hosts[r];
cur_hosts.splice(0, r+1);
}
}
let pg = host_idx.map(h => osds[hosts[h]][select_murmur3(osds[hosts[h]].length, i => count+':o:'+osds[hosts[h]][i])]);
while (pg.length < pg_size)
{
pg.push(NO_OSD);
}
r['pg_'+pg.join('_')] = pg;
count--;
}
return r;
}
// Super-stupid algorithm. Given the current OSD tree, generate all possible OSD combinations
// osd_tree = { failure_domain1: { osd1: size1, ... }, ... }
// ordered = return combinations without duplicates having different order
function all_combinations(osd_tree, pg_size, ordered, count)
{
const hosts = Object.keys(osd_tree).sort();
const osds = Object.keys(osd_tree).reduce((a, c) => { a[c] = Object.keys(osd_tree[c]).sort(); return a; }, {});
while (hosts.length < pg_size)
{
osds[NO_OSD] = [ NO_OSD ];
hosts.push(NO_OSD);
}
let host_idx = [];
let osd_idx = [];
for (let i = 0; i < pg_size; i++)
{
host_idx.push(i);
osd_idx.push(0);
}
const r = [];
while (!count || count < 0 || r.length < count)
{
r.push(host_idx.map((hi, i) => osds[hosts[hi]][osd_idx[i]]));
let inc = pg_size-1;
while (inc >= 0)
{
osd_idx[inc]++;
if (osd_idx[inc] >= osds[hosts[host_idx[inc]]].length)
{
osd_idx[inc] = 0;
inc--;
}
else
{
break;
}
}
if (inc < 0)
{
// no osds left in the current host combination, select the next one
inc = pg_size-1;
same_again: while (inc >= 0)
{
host_idx[inc]++;
for (let prev_host = 0; prev_host < inc; prev_host++)
{
if (host_idx[prev_host] == host_idx[inc])
{
continue same_again;
}
}
if (host_idx[inc] < (ordered ? hosts.length-(pg_size-1-inc) : hosts.length))
{
while ((++inc) < pg_size)
{
host_idx[inc] = (ordered ? host_idx[inc-1]+1 : 0);
}
break;
}
else
{
inc--;
}
}
if (inc < 0)
{
break;
}
}
}
return r;
}
function check_combinations(osd_tree, pgs)
{
const host_per_osd = {};
for (const host in osd_tree)
{
for (const osd in osd_tree[host])
{
host_per_osd[osd] = host;
}
}
const res = [];
skip_pg: for (const pg of pgs)
{
const seen_hosts = {};
for (const osd of pg)
{
if (!host_per_osd[osd] || seen_hosts[host_per_osd[osd]])
{
continue skip_pg;
}
seen_hosts[host_per_osd[osd]] = true;
}
res.push(pg);
}
return res;
}
function compat(params)
{
return {
...params,
osd_weights: Object.assign({}, ...Object.values(params.osd_tree)),
combinator: new SimpleCombinator(params.osd_tree, params.pg_size, params.max_combinations||10000),
};
}
module.exports = {
flatten_tree,
all_combinations,
SimpleCombinator,
compat,
NO_OSD,
};

View File

@ -7,6 +7,7 @@
// This leads to really uneven OSD fill ratio in Ceph even when PGs are perfectly balanced.
// But we support this case with the "parity_space" parameter in optimize_initial()/optimize_change().
const { SimpleCombinator } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
const osd_tree = {
@ -114,16 +115,17 @@ Fine, let's try to optimize for it.
async function run()
{
const all_weights = Object.assign({}, ...Object.values(osd_tree));
const total_weight = Object.values(all_weights).reduce((a, c) => Number(a) + Number(c), 0);
const eff = LPOptimizer.pg_list_space_efficiency(prev_pgs, all_weights, 2, 2.26);
const osd_weights = Object.assign({}, ...Object.values(osd_tree));
const total_weight = Object.values(osd_weights).reduce((a, c) => Number(a) + Number(c), 0);
const eff = LPOptimizer.pg_list_space_efficiency(prev_pgs, osd_weights, 2, 2.26);
const orig = eff*4.26 / total_weight;
console.log('Original efficiency was: '+Math.round(orig*10000)/100+' %');
let prev = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 256, parity_space: 2.26 });
const combinator = new SimpleCombinator(osd_tree, 3, 10000);
let prev = await LPOptimizer.optimize_initial({ osd_weights, combinator, pg_size: 3, pg_count: 256, parity_space: 2.26 });
LPOptimizer.print_change_stats(prev);
let next = await LPOptimizer.optimize_change({ prev_pgs, osd_tree, pg_size: 3, max_combinations: 10000, parity_space: 2.26 });
let next = await LPOptimizer.optimize_change({ prev_pgs, osd_weights, combinator, pg_size: 3, parity_space: 2.26 });
LPOptimizer.print_change_stats(next);
}

View File

@ -1,6 +1,7 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const { compat } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
async function run()
@ -14,26 +15,26 @@ async function run()
let res;
console.log('16 PGs, size=3');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16, ordered: false });
res = await LPOptimizer.optimize_initial(compat({ osd_tree, pg_size: 3, pg_count: 16, ordered: false }));
LPOptimizer.print_change_stats(res, false);
assert(res.space == 3, 'Initial distribution');
console.log('\nChange size to 2');
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: false });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: false }));
LPOptimizer.print_change_stats(res, false);
assert(res.space >= 3*14/16 && res.osd_differs == 0, 'Redistribution');
console.log('\nRemove OSD 3');
const no3_tree = { ...osd_tree };
delete no3_tree['300'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: no3_tree, pg_size: 2, ordered: false });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: no3_tree, pg_size: 2, ordered: false }));
LPOptimizer.print_change_stats(res, false);
assert(res.space == 2, 'Redistribution after OSD removal');
console.log('\n16 PGs, size=3, ordered');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 16, ordered: true });
res = await LPOptimizer.optimize_initial(compat({ osd_tree, pg_size: 3, pg_count: 16, ordered: true }));
LPOptimizer.print_change_stats(res, false);
assert(res.space == 3, 'Initial distribution');
console.log('\nChange size to 2, ordered');
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: true });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2, ordered: true }));
LPOptimizer.print_change_stats(res, false);
assert(res.space >= 3*14/16 && res.osd_differs < 8, 'Redistribution');
}

View File

@ -1,6 +1,7 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const { compat, flatten_tree } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
const crush_tree = [
@ -36,44 +37,44 @@ const crush_tree = [
] },
];
const osd_tree = LPOptimizer.flatten_tree(crush_tree, {}, 1, 3);
const osd_tree = flatten_tree(crush_tree, {}, 1, 3);
console.log(osd_tree);
async function run()
{
const cur_tree = {};
console.log('Empty tree:');
let res = await LPOptimizer.optimize_initial({ osd_tree: cur_tree, pg_size: 3, pg_count: 256 });
let res = await LPOptimizer.optimize_initial(compat({ osd_tree: cur_tree, pg_size: 3, pg_count: 256 }));
LPOptimizer.print_change_stats(res, false);
assert(res.space == 0);
console.log('\nAdding 1st failure domain:');
cur_tree['dom1'] = osd_tree['dom1'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
LPOptimizer.print_change_stats(res, false);
assert(res.space == 12 && res.total_space == 12);
console.log('\nAdding 2nd failure domain:');
cur_tree['dom2'] = osd_tree['dom2'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
LPOptimizer.print_change_stats(res, false);
assert(res.space == 24 && res.total_space == 24);
console.log('\nAdding 3rd failure domain:');
cur_tree['dom3'] = osd_tree['dom3'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
LPOptimizer.print_change_stats(res, false);
assert(res.space == 36 && res.total_space == 36);
console.log('\nRemoving 3rd failure domain:');
delete cur_tree['dom3'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
LPOptimizer.print_change_stats(res, false);
assert(res.space == 24 && res.total_space == 24);
console.log('\nRemoving 2nd failure domain:');
delete cur_tree['dom2'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
LPOptimizer.print_change_stats(res, false);
assert(res.space == 12 && res.total_space == 12);
console.log('\nRemoving 1st failure domain:');
delete cur_tree['dom1'];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree: cur_tree, pg_size: 3 }));
LPOptimizer.print_change_stats(res, false);
assert(res.space == 0);
}

View File

@ -1,6 +1,7 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const { compat } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
const osd_tree = {
@ -20,13 +21,13 @@ async function run()
{
let res;
console.log('256 PGs, 3+3 OSDs, size=2');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 2, pg_count: 256 });
res = await LPOptimizer.optimize_initial(compat({ osd_tree, pg_size: 2, pg_count: 256 }));
LPOptimizer.print_change_stats(res, false);
// Should NOT fail with the "infeasible or unbounded" exception
console.log('\nRemoving osd.2');
delete osd_tree[100][2];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 }));
LPOptimizer.print_change_stats(res, false);
}

View File

@ -1,6 +1,7 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const { compat, flatten_tree } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
const osd_tree = {
@ -19,7 +20,7 @@ const osd_tree = {
},
500: {
4: 3.58498,
// 8: 3.58589,
/*8: 3.58589,*/
9: 3.63869,
},
600: {
@ -84,31 +85,31 @@ async function run()
// Space efficiency is ~99% in all cases.
console.log('256 PGs, size=2');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 2, pg_count: 256 });
res = await LPOptimizer.optimize_initial(compat({ osd_tree, pg_size: 2, pg_count: 256 }));
LPOptimizer.print_change_stats(res, false);
console.log('\nAdding osd.8');
osd_tree[500][8] = 3.58589;
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 }));
LPOptimizer.print_change_stats(res, false);
console.log('\nRemoving osd.8');
delete osd_tree[500][8];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 2 }));
LPOptimizer.print_change_stats(res, false);
console.log('\n256 PGs, size=3');
res = await LPOptimizer.optimize_initial({ osd_tree, pg_size: 3, pg_count: 256 });
res = await LPOptimizer.optimize_initial(compat({ osd_tree, pg_size: 3, pg_count: 256 }));
LPOptimizer.print_change_stats(res, false);
console.log('\nAdding osd.8');
osd_tree[500][8] = 3.58589;
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 }));
LPOptimizer.print_change_stats(res, false);
console.log('\nRemoving osd.8');
delete osd_tree[500][8];
res = await LPOptimizer.optimize_change({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 });
res = await LPOptimizer.optimize_change(compat({ prev_pgs: res.int_pgs, osd_tree, pg_size: 3 }));
LPOptimizer.print_change_stats(res, false);
console.log('\n256 PGs, size=3, failure domain=rack');
res = await LPOptimizer.optimize_initial({ osd_tree: LPOptimizer.flatten_tree(crush_tree, {}, 1, 3), pg_size: 3, pg_count: 256 });
res = await LPOptimizer.optimize_initial(compat({ osd_tree: flatten_tree(crush_tree, {}, 1, 3), pg_size: 3, pg_count: 256 }));
LPOptimizer.print_change_stats(res, false);
}

mon/test-parse-dsl.js Normal file (118 lines)
View File

@ -0,0 +1,118 @@
const { random_custom_combinations, index_tree, parse_level_indexes, parse_pg_dsl } = require('./dsl_pgs.js');
function check(result, expected)
{
console.dir(result, { depth: null });
if (JSON.stringify(result) !== JSON.stringify(expected))
{
process.stderr.write('Unexpected value, expected: ');
console.dir(expected, { depth: null });
process.exit(1);
}
}
check(
parse_pg_dsl("any, dc=1 host!=1, dc!=1, dc=3 host!=3, dc!=(1,3), dc=5 host!=5"),
[
[],
[ [ 'dc', '=', 1 ], [ 'host', '!=', 1 ] ],
[ [ 'dc', '!=', 1 ] ],
[ [ 'dc', '=', 3 ], [ 'host', '!=', 3 ] ],
[ [ 'dc', '!=', [ 1, 3 ] ] ],
[ [ 'dc', '=', 5 ], [ 'host', '!=', 5 ] ],
]
);
check(
parse_pg_dsl("dc=meow, dc!=1, dc>2"),
[
[ [ 'dc', '=', { id: 'meow' } ] ],
[ [ 'dc', '!=', 1 ] ],
[ [ 'dc', '>', 2 ] ],
]
);
check(
parse_level_indexes({ dc: '112233', host: 'ABCDEF' }),
[
[],
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1 ] ] ],
[ [ 'dc', '!=', [ 1 ] ], [ 'host', '!=', [ 1, 2 ] ] ],
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 1, 2, 3 ] ] ],
[ [ 'dc', '!=', [ 1, 3 ] ], [ 'host', '!=', [ 1, 2, 3, 4 ] ] ],
[ [ 'dc', '=', 5 ], [ 'host', '!=', [ 1, 2, 3, 4, 5 ] ] ],
]
);
check(
parse_level_indexes({ dc: '112233', host: 'ABCDEF' }, [ 'dc', 'host' ]),
[
[],
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1 ] ] ],
[ [ 'dc', '!=', [ 1 ] ] ],
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 3 ] ] ],
[ [ 'dc', '!=', [ 1, 3 ] ] ],
[ [ 'dc', '=', 5 ], [ 'host', '!=', [ 5 ] ] ],
]
);
check(
parse_level_indexes({ dc: '112211223333', host: '123456789ABC' }),
[
[],
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1 ] ] ],
[ [ 'dc', '!=', [ 1 ] ], [ 'host', '!=', [ 1, 2 ] ] ],
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 1, 2, 3 ] ] ],
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1, 2, 3, 4 ] ] ],
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1, 2, 3, 4, 5 ] ] ],
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6 ] ] ],
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6, 7 ] ] ],
[ [ 'dc', '!=', [ 1, 3 ] ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6, 7, 8 ] ] ],
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6, 7, 8, 9 ] ] ],
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ] ] ],
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 ] ] ],
]
);
check(
parse_level_indexes({ dc: '112211223333', host: '123456789ABC' }, [ 'dc', 'host' ]),
[
[],
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1 ] ] ],
[ [ 'dc', '!=', [ 1 ] ] ],
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 3 ] ] ],
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1, 2 ] ] ],
[ [ 'dc', '=', 1 ], [ 'host', '!=', [ 1, 2, 5 ] ] ],
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 3, 4 ] ] ],
[ [ 'dc', '=', 3 ], [ 'host', '!=', [ 3, 4, 7 ] ] ],
[ [ 'dc', '!=', [ 1, 3 ] ] ],
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 9 ] ] ],
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 9, 10 ] ] ],
[ [ 'dc', '=', 9 ], [ 'host', '!=', [ 9, 10, 11 ] ] ]
]
);
check(
Object.keys(random_custom_combinations(index_tree([
{ id: '1', size: 1, level: 'osd' },
{ id: '2', size: 2, level: 'osd' },
{ id: '3', size: 3, level: 'osd' }
]), parse_level_indexes({ osd: '12' }), 10000)).sort(),
[ 'pg_1_2', 'pg_1_3', 'pg_2_3' ]
);
check(
Object.keys(random_custom_combinations(index_tree([
{ id: 'h1', level: 'host' },
{ id: 'h2', level: 'host' },
{ id: 'h3', level: 'host' },
{ id: '1', size: 1, level: 'osd', parent: 'h1' },
{ id: '2', size: 1, level: 'osd', parent: 'h2' },
{ id: '3', size: 1, level: 'osd', parent: 'h2' },
{ id: '4', size: 1, level: 'osd', parent: 'h3' },
{ id: '5', size: 1, level: 'osd', parent: 'h3' },
]), parse_level_indexes({ host: '1122', osd: '1234' }), 10000)).sort(),
[ 'pg_2_3_4_5' ]
);
console.log('OK');

View File

@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '1.4.7'
VERSION = '1.6.1'
LOG = logging.getLogger(__name__)

View File

@ -0,0 +1,692 @@
commit d85024bd803b3b91f15578ed22de4ce31856626f
Author: Vitaliy Filippov <vitalif@yourcmc.ru>
Date: Wed Jan 24 18:07:43 2024 +0300
Add Vitastor support
diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng
index 7fa5c2b8b5..2d77f391e7 100644
--- a/docs/schemas/domaincommon.rng
+++ b/docs/schemas/domaincommon.rng
@@ -1898,6 +1898,35 @@
</element>
</define>
+ <define name="diskSourceNetworkProtocolVitastor">
+ <element name="source">
+ <interleave>
+ <attribute name="protocol">
+ <value>vitastor</value>
+ </attribute>
+ <ref name="diskSourceCommon"/>
+ <optional>
+ <attribute name="name"/>
+ </optional>
+ <optional>
+ <attribute name="query"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="diskSourceNetworkHost"/>
+ </zeroOrMore>
+ <optional>
+ <element name="config">
+ <attribute name="file">
+ <ref name="absFilePath"/>
+ </attribute>
+ <empty/>
+ </element>
+ </optional>
+ <empty/>
+ </interleave>
+ </element>
+ </define>
+
<define name="diskSourceNetworkProtocolISCSI">
<element name="source">
<attribute name="protocol">
@@ -2154,6 +2183,7 @@
<ref name="diskSourceNetworkProtocolSimple"/>
<ref name="diskSourceNetworkProtocolVxHS"/>
<ref name="diskSourceNetworkProtocolNFS"/>
+ <ref name="diskSourceNetworkProtocolVitastor"/>
</choice>
</define>
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
index f89856b93e..a8cb9387e2 100644
--- a/include/libvirt/libvirt-storage.h
+++ b/include/libvirt/libvirt-storage.h
@@ -246,6 +246,7 @@ typedef enum {
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17,
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18,
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19,
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20,
} virConnectListAllStoragePoolsFlags;
int virConnectListAllStoragePools(virConnectPtr conn,
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 5691b8d2d5..6669e8451d 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -8293,7 +8293,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
src->configFile = virXPathString("string(./config/@file)", ctxt);
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
src->query = virXMLPropString(node, "query");
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
@@ -31267,6 +31268,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
case VIR_STORAGE_POOL_MPATH:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
index a4271f1247..621c1b7b31 100644
--- a/src/conf/domain_validate.c
+++ b/src/conf/domain_validate.c
@@ -508,7 +508,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
}
}
- /* internal snapshots and config files are currently supported only with rbd: */
+ /* internal snapshots are currently supported only with rbd: */
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
if (src->snapshot) {
@@ -517,11 +517,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
"only with 'rbd' disks"));
return -1;
}
-
+ }
+ /* config files are currently supported only with rbd and vitastor: */
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
if (src->configFile) {
virReportError(VIR_ERR_XML_ERROR, "%s",
_("<config> element is currently supported "
- "only with 'rbd' disks"));
+ "only with 'rbd' and 'vitastor' disks"));
return -1;
}
}
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
index 6690d26ffd..2255df9d28 100644
--- a/src/conf/storage_conf.c
+++ b/src/conf/storage_conf.c
@@ -60,7 +60,7 @@ VIR_ENUM_IMPL(virStoragePool,
"logical", "disk", "iscsi",
"iscsi-direct", "scsi", "mpath",
"rbd", "sheepdog", "gluster",
- "zfs", "vstorage",
+ "zfs", "vstorage", "vitastor",
);
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
@@ -246,6 +246,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
.formatToString = virStorageFileFormatTypeToString,
}
},
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
+ .poolOptions = {
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
+ VIR_STORAGE_POOL_SOURCE_NAME),
+ },
+ .volOptions = {
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
+ .formatFromString = virStorageVolumeFormatFromString,
+ .formatToString = virStorageFileFormatTypeToString,
+ }
+ },
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
.poolOptions = {
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
@@ -546,6 +558,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
_("element 'name' is mandatory for RBD pool"));
return -1;
}
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
+ virReportError(VIR_ERR_XML_ERROR, "%s",
+ _("element 'name' is mandatory for Vitastor pool"));
+ return -1;
+ }
if (options->formatFromString) {
g_autofree char *format = NULL;
@@ -1176,6 +1193,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
* files, so they don't have a target */
if (def->type != VIR_STORAGE_POOL_RBD &&
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
def->type != VIR_STORAGE_POOL_GLUSTER &&
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
index aaecf138d6..97172db38b 100644
--- a/src/conf/storage_conf.h
+++ b/src/conf/storage_conf.h
@@ -106,6 +106,7 @@ typedef enum {
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
VIR_STORAGE_POOL_ZFS, /* ZFS */
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
VIR_STORAGE_POOL_LAST,
} virStoragePoolType;
@@ -466,6 +467,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
index d42f715f26..29d8da3d10 100644
--- a/src/conf/storage_source_conf.c
+++ b/src/conf/storage_source_conf.c
@@ -86,6 +86,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
"ssh",
"vxhs",
"nfs",
+ "vitastor",
);
@@ -1265,6 +1266,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
return 24007;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_RBD:
/* we don't provide a default for RBD */
return 0;
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
index c4a026881c..67568e9181 100644
--- a/src/conf/storage_source_conf.h
+++ b/src/conf/storage_source_conf.h
@@ -128,6 +128,7 @@ typedef enum {
VIR_STORAGE_NET_PROTOCOL_SSH,
VIR_STORAGE_NET_PROTOCOL_VXHS,
VIR_STORAGE_NET_PROTOCOL_NFS,
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
VIR_STORAGE_NET_PROTOCOL_LAST
} virStorageNetProtocol;
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
index 02903ac487..504df599fb 100644
--- a/src/conf/virstorageobj.c
+++ b/src/conf/virstorageobj.c
@@ -1481,6 +1481,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
return 1;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_RBD:
case VIR_STORAGE_POOL_LAST:
break;
@@ -1978,6 +1979,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
index cbc522b300..b4760fa58d 100644
--- a/src/libvirt-storage.c
+++ b/src/libvirt-storage.c
@@ -92,6 +92,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index 1ac6253ad7..abe4587f94 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -962,6 +962,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
index 7604e3d534..6453bb9776 100644
--- a/src/libxl/xen_xl.c
+++ b/src/libxl/xen_xl.c
@@ -1506,6 +1506,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
index e5ff653a60..884ecc79ea 100644
--- a/src/qemu/qemu_block.c
+++ b/src/qemu/qemu_block.c
@@ -943,6 +943,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
}
+static virJSONValue *
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
+{
+ virJSONValue *ret = NULL;
+ virStorageNetHostDef *host;
+ size_t i;
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
+ g_autofree char *etcd = NULL;
+
+ for (i = 0; i < src->nhosts; i++) {
+ host = src->hosts + i;
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
+ return NULL;
+ }
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
+ }
+ if (src->nhosts > 0) {
+ etcd = virBufferContentAndReset(&buf);
+ }
+
+ if (virJSONValueObjectCreate(&ret,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ return NULL;
+
+ return ret;
+}
+
+
static virJSONValue *
qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src)
{
@@ -1233,6 +1265,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
return NULL;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
+ return NULL;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
driver = "sheepdog";
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
@@ -2244,6 +2282,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
case VIR_STORAGE_NET_PROTOCOL_SSH:
@@ -2626,6 +2665,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
driver = "sheepdog";
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index d822533ccb..afe2087303 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -1723,6 +1723,43 @@ qemuBuildNetworkDriveStr(virStorageSource *src,
ret = virBufferContentAndReset(&buf);
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (strchr(src->path, ':')) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+ _("':' not allowed in Vitastor source volume name '%s'"),
+ src->path);
+ return NULL;
+ }
+
+ virBufferStrcat(&buf, "vitastor:image=", src->path, NULL);
+
+ if (src->nhosts > 0) {
+ virBufferAddLit(&buf, ":etcd-host=");
+ for (i = 0; i < src->nhosts; i++) {
+ if (i)
+ virBufferAddLit(&buf, ",");
+
+ /* assume host containing : is ipv6 */
+ if (strchr(src->hosts[i].name, ':'))
+ virBufferEscape(&buf, '\\', ":", "[%s]",
+ src->hosts[i].name);
+ else
+ virBufferAsprintf(&buf, "%s", src->hosts[i].name);
+
+ if (src->hosts[i].port)
+ virBufferAsprintf(&buf, "\\:%u", src->hosts[i].port);
+ }
+ }
+
+ if (src->configFile)
+ virBufferEscape(&buf, '\\', ":", ":config-path=%s", src->configFile);
+
+ if (src->query)
+ virBufferEscape(&buf, '\\', ":", ":etcd-prefix=%s", src->query);
+
+ ret = virBufferContentAndReset(&buf);
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_VXHS:
virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
_("VxHS protocol does not support URI syntax"));
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index a8401bac30..3dc1fe6db0 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -4731,7 +4731,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
if (src->query &&
(actualType != VIR_STORAGE_TYPE_NETWORK ||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
_("query is supported only with HTTP(S) protocols"));
return -1;
@@ -9919,6 +9920,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
break;
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
index f92e00f9c0..854a3fbc90 100644
--- a/src/qemu/qemu_snapshot.c
+++ b/src/qemu/qemu_snapshot.c
@@ -393,6 +393,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
@@ -485,6 +486,7 @@ qemuSnapshotPrepareDiskExternalActive(virDomainObj *vm,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
case VIR_STORAGE_NET_PROTOCOL_HTTP:
@@ -638,6 +640,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
index 4df2c75a2b..5a5e48ef71 100644
--- a/src/storage/storage_driver.c
+++ b/src/storage/storage_driver.c
@@ -1643,6 +1643,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
index e48ae725ab..2017ccc88c 100644
--- a/src/storage_file/storage_source_backingstore.c
+++ b/src/storage_file/storage_source_backingstore.c
@@ -284,6 +284,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
}
+static int
+virStorageSourceParseVitastorColonString(const char *colonstr,
+ virStorageSource *src)
+{
+ char *p, *e, *next;
+ g_autofree char *options = NULL;
+
+ /* optionally skip the "vitastor:" prefix if provided */
+ if (STRPREFIX(colonstr, "vitastor:"))
+ colonstr += strlen("vitastor:");
+
+ options = g_strdup(colonstr);
+
+ p = options;
+ while (*p) {
+ /* find : delimiter or end of string */
+ for (e = p; *e && *e != ':'; ++e) {
+ if (*e == '\\') {
+ e++;
+ if (*e == '\0')
+ break;
+ }
+ }
+ if (*e == '\0') {
+ next = e; /* last kv pair */
+ } else {
+ next = e + 1;
+ *e = '\0';
+ }
+
+ if (STRPREFIX(p, "image=")) {
+ src->path = g_strdup(p + strlen("image="));
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
+ src->query = g_strdup(p + strlen("etcd-prefix="));
+ } else if (STRPREFIX(p, "config-path=")) {
+ src->configFile = g_strdup(p + strlen("config-path="));
+ } else if (STRPREFIX(p, "etcd-host=")) {
+ char *h, *sep;
+
+ h = p + strlen("etcd-host=");
+ while (h < e) {
+ for (sep = h; sep < e; ++sep) {
+ if (*sep == '\\' && (sep[1] == ',' ||
+ sep[1] == ';' ||
+ sep[1] == ' ')) {
+ *sep = '\0';
+ sep += 2;
+ break;
+ }
+ }
+
+ if (virStorageSourceRBDAddHost(src, h) < 0)
+ return -1;
+
+ h = sep;
+ }
+ }
+
+ p = next;
+ }
+
+ if (!src->path) {
+ return -1;
+ }
+
+ return 0;
+}
+
+
static int
virStorageSourceParseNBDColonString(const char *nbdstr,
virStorageSource *src)
@@ -396,6 +465,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
@@ -984,6 +1058,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
return 0;
}
+static int
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
+ virJSONValue *json,
+ const char *jsonstr G_GNUC_UNUSED,
+ int opaque G_GNUC_UNUSED)
+{
+ const char *filename;
+ const char *image = virJSONValueObjectGetString(json, "image");
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
+ size_t nservers;
+ size_t i;
+
+ src->type = VIR_STORAGE_TYPE_NETWORK;
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
+
+ /* legacy syntax passed via 'filename' option */
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
+ return virStorageSourceParseVitastorColonString(filename, src);
+
+ if (!image) {
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
+ _("missing image name in Vitastor backing volume "
+ "JSON specification"));
+ return -1;
+ }
+
+ src->path = g_strdup(image);
+ src->configFile = g_strdup(conf);
+ src->query = g_strdup(etcd_prefix);
+
+ if (servers) {
+ nservers = virJSONValueArraySize(servers);
+
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
+ src->nhosts = nservers;
+
+ for (i = 0; i < nservers; i++) {
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
+ virJSONValueArrayGet(servers, i)) < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
static int
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
virJSONValue *json,
@@ -1162,6 +1284,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
index 0e93b79922..b4d33f5f56 100644
--- a/src/test/test_driver.c
+++ b/src/test/test_driver.c
@@ -7367,6 +7367,7 @@ testStorageVolumeTypeForPool(int pooltype)
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
return VIR_STORAGE_VOL_NETWORK;
case VIR_STORAGE_POOL_LOGICAL:
case VIR_STORAGE_POOL_DISK:
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
index eee75af746..8bd0a57bdd 100644
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='no'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
index 805950a937..852df0de16 100644
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='yes'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
index 449b745519..7f95cc8e08 100644
--- a/tests/storagepoolxml2argvtest.c
+++ b/tests/storagepoolxml2argvtest.c
@@ -68,6 +68,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_VSTORAGE:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_LAST:
default:
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
index d391257f6e..46799c4a90 100644
--- a/tools/virsh-pool.c
+++ b/tools/virsh-pool.c
@@ -1213,6 +1213,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
case VIR_STORAGE_POOL_VSTORAGE:
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
+ break;
case VIR_STORAGE_POOL_LAST:
break;
}
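
Note: taken together, the hunks above wire the new 'vitastor' network protocol through both disk syntaxes libvirt can emit for QEMU. A rough illustration of the two forms, derived from qemuBuildNetworkDriveStr() and qemuBlockStorageSourceGetVitastorProps() shown earlier (all concrete values are placeholders, not part of the patch):

Legacy colon syntax, with ':' inside values escaped as '\:' :
  vitastor:image=testimg:etcd-host=192.168.0.1\:2379,192.168.0.2\:2379:etcd-prefix=/vitastor:config-path=/etc/vitastor/vitastor.conf

-blockdev JSON properties:
  { "driver": "vitastor", "image": "testimg", "etcd-host": "192.168.0.1:2379,192.168.0.2:2379", "etcd-prefix": "/vitastor", "config-path": "/etc/vitastor/vitastor.conf" }

The virStorageSourceParseVitastorColonString() helper added in storage_source_backingstore.c accepts the same colon syntax when it shows up in a backing file reference.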

View File

@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-1.4.7/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.7$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-1.6.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.6.1$(rpm --eval '%dist').tar.gz *

View File

@ -10,7 +10,7 @@ RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gcc make cmake gperftools-devel \
fio rh-nodejs12 jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel
fio rh-nodejs12 jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libnl3-devel
RUN yumdownloader --disablerepo=centos-sclo-rh --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
@ -36,7 +36,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.4.7.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.6.1.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@ -1,11 +1,11 @@
Name: vitastor
Version: 1.4.7
Version: 1.6.1
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.4.7.el7.tar.gz
Source0: vitastor-1.6.1.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@ -17,6 +17,7 @@ BuildRequires: libisa-l-devel
BuildRequires: gf-complete-devel
BuildRequires: libibverbs-devel
BuildRequires: cmake3
BuildRequires: libnl3-devel
Requires: vitastor-osd = %{version}-%{release}
Requires: vitastor-mon = %{version}-%{release}
Requires: vitastor-client = %{version}-%{release}
@ -103,7 +104,7 @@ rm -rf $RPM_BUILD_ROOT
%make_install
. /opt/rh/rh-nodejs12/enable
cd mon
npm install
npm install --production
cd ..
mkdir -p %buildroot/usr/lib/vitastor
cp -r mon %buildroot/usr/lib/vitastor
@ -149,9 +150,12 @@ mkdir -p /etc/vitastor
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm
%_bindir/vitastor-kv
%_bindir/vitastor-kv-stress
%_bindir/vita
%_libdir/libvitastor_blk.so*
%_libdir/libvitastor_client.so*
%_libdir/libvitastor_kv.so*
%files -n vitastor-client-devel

View File

@ -11,7 +11,7 @@ RUN dnf -y install centos-release-advanced-virtualization epel-release dnf-plugi
RUN sed -i 's/^mirrorlist=/#mirrorlist=/; s!#baseurl=.*!baseurl=http://vault.centos.org/centos/8.4.2105/virt/$basearch/$avdir/!; s!^baseurl=.*Source/.*!baseurl=http://vault.centos.org/centos/8.4.2105/virt/Source/advanced-virtualization/!' /etc/yum.repos.d/CentOS-Advanced-Virtualization.repo
RUN yum -y install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm
RUN dnf -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel \
fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel libibverbs-devel libarchive cmake
fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel libibverbs-devel libarchive cmake libnl3-devel
RUN dnf download --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec
@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.4.7.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.6.1.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@ -1,11 +1,11 @@
Name: vitastor
Version: 1.4.7
Version: 1.6.1
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.4.7.el8.tar.gz
Source0: vitastor-1.6.1.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@ -16,6 +16,7 @@ BuildRequires: libisa-l-devel
BuildRequires: gf-complete-devel
BuildRequires: libibverbs-devel
BuildRequires: cmake
BuildRequires: libnl3-devel
Requires: vitastor-osd = %{version}-%{release}
Requires: vitastor-mon = %{version}-%{release}
Requires: vitastor-client = %{version}-%{release}
@ -100,7 +101,7 @@ Vitastor fio drivers for benchmarking.
rm -rf $RPM_BUILD_ROOT
%make_install
cd mon
npm install
npm install --production
cd ..
mkdir -p %buildroot/usr/lib/vitastor
cp -r mon %buildroot/usr/lib/vitastor
@ -146,9 +147,12 @@ mkdir -p /etc/vitastor
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm
%_bindir/vitastor-kv
%_bindir/vitastor-kv-stress
%_bindir/vita
%_libdir/libvitastor_blk.so*
%_libdir/libvitastor_client.so*
%_libdir/libvitastor_kv.so*
%files -n vitastor-client-devel

View File

@ -8,7 +8,7 @@ WORKDIR /root
RUN sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/*.repo
RUN dnf -y install epel-release dnf-plugins-core
RUN dnf -y install https://vitastor.io/rpms/centos/9/vitastor-release-1.0-1.el9.noarch.rpm
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive liburing-devel cmake
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive liburing-devel cmake libnl3-devel
RUN dnf download --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --spec fio.spec
@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.4.7.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.6.1.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@ -1,11 +1,11 @@
Name: vitastor
Version: 1.4.7
Version: 1.6.1
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.4.7.el9.tar.gz
Source0: vitastor-1.6.1.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@ -16,6 +16,7 @@ BuildRequires: libisa-l-devel
BuildRequires: gf-complete-devel
BuildRequires: rdma-core-devel
BuildRequires: cmake
BuildRequires: libnl3-devel
Requires: vitastor-osd = %{version}-%{release}
Requires: vitastor-mon = %{version}-%{release}
Requires: vitastor-client = %{version}-%{release}
@ -93,7 +94,7 @@ Vitastor fio drivers for benchmarking.
rm -rf $RPM_BUILD_ROOT
%cmake_install
cd mon
npm install
npm install --production
cd ..
mkdir -p %buildroot/usr/lib/vitastor
cp -r mon %buildroot/usr/lib/vitastor
@ -139,9 +140,12 @@ mkdir -p /etc/vitastor
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm
%_bindir/vitastor-kv
%_bindir/vitastor-kv-stress
%_bindir/vita
%_libdir/libvitastor_blk.so*
%_libdir/libvitastor_client.so*
%_libdir/libvitastor_kv.so*
%files -n vitastor-client-devel

View File

@ -4,6 +4,9 @@ project(vitastor)
include(GNUInstallDirs)
include(CTest)
include(CheckIncludeFile)
find_package(PkgConfig)
set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
set(WITH_FIO true CACHE BOOL "Build FIO driver")
@ -16,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="1.4.7")
add_definitions(-DVERSION="1.6.1")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})
@ -25,15 +28,15 @@ if (${WITH_ASAN})
endif (${WITH_ASAN})
set(CMAKE_BUILD_TYPE RelWithDebInfo)
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}")
string(REGEX REPLACE "([\\/\\-]O)[12]?" "\\13" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}")
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL}")
string(REGEX REPLACE "([\\/\\-]D) *NDEBUG" "" CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
@ -43,6 +46,8 @@ macro(install_symlink filepath sympath)
install(CODE "message(\"-- Created symlink: ${sympath} -> ${filepath}\")")
endmacro(install_symlink)
check_include_file("linux/nbd-netlink.h" HAVE_NBD_NETLINK_H)
find_package(PkgConfig)
pkg_check_modules(LIBURING REQUIRED liburing)
if (${WITH_QEMU})
@ -145,7 +150,6 @@ add_library(vitastor_client SHARED
cli_status.cpp
cli_describe.cpp
cli_fix.cpp
cli_df.cpp
cli_ls.cpp
cli_create.cpp
cli_modify.cpp
@ -154,6 +158,11 @@ add_library(vitastor_client SHARED
cli_rm_data.cpp
cli_rm.cpp
cli_rm_osd.cpp
cli_pool_cfg.cpp
cli_pool_create.cpp
cli_pool_ls.cpp
cli_pool_modify.cpp
cli_pool_rm.cpp
)
set_target_properties(vitastor_client PROPERTIES PUBLIC_HEADER "vitastor_c.h")
target_link_libraries(vitastor_client
@ -174,17 +183,58 @@ if (${WITH_FIO})
endif (${WITH_FIO})
# vitastor-nbd
pkg_check_modules(NL3 libnl-3.0 libnl-genl-3.0)
add_executable(vitastor-nbd
nbd_proxy.cpp
)
target_link_libraries(vitastor-nbd
target_include_directories(vitastor-nbd PUBLIC ${NL3_INCLUDE_DIRS})
target_link_libraries(vitastor-nbd vitastor_client ${NL3_LIBRARIES})
if (HAVE_NBD_NETLINK_H AND NL3_LIBRARIES)
target_compile_definitions(vitastor-nbd PUBLIC HAVE_NBD_NETLINK_H)
endif (HAVE_NBD_NETLINK_H AND NL3_LIBRARIES)
# libvitastor_kv.so
add_library(vitastor_kv SHARED
kv_db.cpp
kv_db.h
)
target_link_libraries(vitastor_kv
vitastor_client
)
set_target_properties(vitastor_kv PROPERTIES VERSION ${VERSION} SOVERSION 0)
# vitastor-kv
add_executable(vitastor-kv
kv_cli.cpp
)
target_link_libraries(vitastor-kv
vitastor_kv
)
add_executable(vitastor-kv-stress
kv_stress.cpp
)
target_link_libraries(vitastor-kv-stress
vitastor_kv
)
# vitastor-nfs
add_executable(vitastor-nfs
nfs_proxy.cpp
nfs_conn.cpp
nfs_block.cpp
nfs_kv.cpp
nfs_kv_create.cpp
nfs_kv_getattr.cpp
nfs_kv_link.cpp
nfs_kv_lookup.cpp
nfs_kv_read.cpp
nfs_kv_readdir.cpp
nfs_kv_remove.cpp
nfs_kv_rename.cpp
nfs_kv_setattr.cpp
nfs_kv_write.cpp
nfs_fsstat.cpp
nfs_mount.cpp
nfs_portmap.cpp
sha256.c
nfs/xdr_impl.cpp
@ -194,6 +244,7 @@ add_executable(vitastor-nfs
)
target_link_libraries(vitastor-nfs
vitastor_client
vitastor_kv
)
# vitastor-cli
@ -318,12 +369,12 @@ add_test(NAME test_cluster_client COMMAND test_cluster_client)
### Install
install(TARGETS vitastor-osd vitastor-disk vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(TARGETS vitastor-osd vitastor-disk vitastor-nbd vitastor-nfs vitastor-cli vitastor-kv vitastor-kv-stress RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install_symlink(vitastor-disk ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-dump-journal)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
install(
TARGETS vitastor_blk vitastor_client
TARGETS vitastor_blk vitastor_client vitastor_kv
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)

View File

@ -82,3 +82,8 @@ uint32_t blockstore_t::get_bitmap_granularity()
{
return impl->get_bitmap_granularity();
}
void blockstore_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
{
impl->set_no_inode_stats(pool_ids);
}

View File

@ -216,6 +216,9 @@ public:
// Get per-inode space usage statistics
std::map<uint64_t, uint64_t> & get_inode_space_stats();
// Set per-pool no_inode_stats
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
// Print diagnostics to stdout
void dump_diagnostics();

View File

@ -86,6 +86,8 @@ void journal_flusher_t::loop()
cur_flusher_count--;
}
}
if (trim_wanted)
co[0].try_trim = true;
for (int i = 0; (active_flushers > 0 || dequeuing || trim_wanted > 0) && i < cur_flusher_count; i++)
co[i].loop();
}
@ -364,10 +366,10 @@ resume_0:
!flusher->flush_queue.size() || !flusher->dequeuing)
{
stop_flusher:
if (flusher->trim_wanted > 0 && cur.oid.inode != 0)
if (flusher->trim_wanted > 0 && try_trim)
{
// Attempt forced trim
cur.oid = {};
try_trim = false;
flusher->active_flushers++;
goto trim_journal;
}
@ -375,6 +377,7 @@ stop_flusher:
wait_state = 0;
return true;
}
try_trim = true;
cur.oid = flusher->flush_queue.front();
cur.version = flusher->flush_versions[cur.oid];
flusher->flush_queue.pop_front();

View File

@ -60,6 +60,7 @@ class journal_flusher_co
std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;
bool try_trim = false;
bool skip_copy, has_delete, has_writes;
std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it;

View File

@ -733,3 +733,86 @@ void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expecte
fprintf(stderr, "Disk %s failed: result is %d, expected %d. Can't continue, sorry :-(\n", op, retval, expected);
exit(1);
}
void blockstore_impl_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
{
for (auto & np: no_inode_stats)
{
np.second = 2;
}
for (auto pool_id: pool_ids)
{
if (!no_inode_stats[pool_id])
recalc_inode_space_stats(pool_id, false);
no_inode_stats[pool_id] = 1;
}
for (auto np_it = no_inode_stats.begin(); np_it != no_inode_stats.end(); )
{
if (np_it->second == 2)
{
recalc_inode_space_stats(np_it->first, true);
no_inode_stats.erase(np_it++);
}
else
np_it++;
}
}
void blockstore_impl_t::recalc_inode_space_stats(uint64_t pool_id, bool per_inode)
{
auto sp_begin = inode_space_stats.lower_bound((pool_id << (64-POOL_ID_BITS)));
auto sp_end = inode_space_stats.lower_bound(((pool_id+1) << (64-POOL_ID_BITS)));
inode_space_stats.erase(sp_begin, sp_end);
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
while (sh_it != clean_db_shards.end() &&
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
{
for (auto & pair: sh_it->second)
{
uint64_t space_id = per_inode ? pair.first.inode : (pool_id << (64-POOL_ID_BITS));
inode_space_stats[space_id] += dsk.data_block_size;
}
sh_it++;
}
object_id last_oid = {};
bool last_exists = false;
auto dirty_it = dirty_db.lower_bound((obj_ver_id){ .oid = { .inode = (pool_id << (64-POOL_ID_BITS)) } });
while (dirty_it != dirty_db.end() && (dirty_it->first.oid.inode >> (64-POOL_ID_BITS)) == pool_id)
{
if (IS_STABLE(dirty_it->second.state) && (IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)))
{
bool exists = false;
if (last_oid == dirty_it->first.oid)
{
exists = last_exists;
}
else
{
auto & clean_db = clean_db_shard(dirty_it->first.oid);
auto clean_it = clean_db.find(dirty_it->first.oid);
exists = clean_it != clean_db.end();
}
uint64_t space_id = per_inode ? dirty_it->first.oid.inode : (pool_id << (64-POOL_ID_BITS));
if (IS_BIG_WRITE(dirty_it->second.state))
{
if (!exists)
inode_space_stats[space_id] += dsk.data_block_size;
last_exists = true;
}
else
{
if (exists)
{
auto & sp = inode_space_stats[space_id];
if (sp > dsk.data_block_size)
sp -= dsk.data_block_size;
else
inode_space_stats.erase(space_id);
}
last_exists = false;
}
last_oid = dirty_it->first.oid;
}
dirty_it++;
}
}
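
In this function, and in the mark_stable() hunk further below, disabling per-inode statistics for a pool collapses all of its usage into a single counter whose key is the inode number with everything except the pool bits cleared, i.e. pool_id << (64-POOL_ID_BITS). As a purely illustrative calculation (the POOL_ID_BITS constant itself is not shown in this diff): if POOL_ID_BITS were 16, pool 2 would be accounted under the single key 2 << 48 = 0x0002000000000000 instead of one entry per inode.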

View File

@ -272,6 +272,7 @@ class blockstore_impl_t
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
std::map<uint64_t, int> no_inode_stats;
uint8_t *clean_bitmaps = NULL;
blockstore_dirty_db_t dirty_db;
std::vector<blockstore_op_t*> submit_queue;
@ -318,6 +319,7 @@ class blockstore_impl_t
blockstore_clean_db_t& clean_db_shard(object_id oid);
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
void recalc_inode_space_stats(uint64_t pool_id, bool per_inode);
// Journaling
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
@ -428,6 +430,9 @@ public:
// Space usage statistics
std::map<uint64_t, uint64_t> inode_space_stats;
// Set per-pool no_inode_stats
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
// Print diagnostics to stdout
void dump_diagnostics();

View File

@ -32,7 +32,7 @@ void blockstore_init_meta::handle_event(ring_data_t *data, int buf_num)
if (data->res < 0)
{
throw std::runtime_error(
std::string("read metadata failed at offset ") + std::to_string(bufs[buf_num].offset) +
std::string("read metadata failed at offset ") + std::to_string(buf_num >= 0 ? bufs[buf_num].offset : last_read_offset) +
std::string(": ") + strerror(-data->res)
);
}
@ -63,6 +63,7 @@ int blockstore_init_meta::loop()
throw std::runtime_error("Failed to allocate metadata read buffer");
// Read superblock
GET_SQE();
last_read_offset = 0;
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
@ -100,6 +101,7 @@ resume_1:
{
printf("Initializing metadata area\n");
GET_SQE();
last_read_offset = 0;
data->iov = (struct iovec){ metadata_buffer, (size_t)bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
@ -236,6 +238,7 @@ resume_2:
data->iov = { bufs[i].buf, (size_t)bufs[i].size };
data->callback = [this, i](ring_data_t *data) { handle_event(data, i); };
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + bufs[i].offset);
bs->ringloop->submit();
bufs[i].state = INIT_META_WRITING;
submitted++;
}
@ -259,9 +262,11 @@ resume_2:
next_offset = entries_to_zero[i]/entries_per_block;
for (j = i; j < entries_to_zero.size() && entries_to_zero[j]/entries_per_block == next_offset; j++) {}
GET_SQE();
last_read_offset = (1+next_offset)*bs->dsk.meta_block_size;
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
bs->ringloop->submit();
submitted++;
resume_5:
if (submitted > 0)
@ -278,6 +283,7 @@ resume_5:
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + (1+next_offset)*bs->dsk.meta_block_size);
bs->ringloop->submit();
submitted++;
resume_6:
if (submitted > 0)
@ -299,6 +305,7 @@ resume_6:
{
GET_SQE();
my_uring_prep_fsync(sqe, bs->dsk.meta_fd, IORING_FSYNC_DATASYNC);
last_read_offset = 0;
data->iov = { 0 };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
submitted++;

View File

@ -23,6 +23,7 @@ class blockstore_init_meta
struct ring_data_t *data;
uint64_t md_offset = 0;
uint64_t next_offset = 0;
uint64_t last_read_offset = 0;
uint64_t entries_loaded = 0;
unsigned entries_per_block = 0;
int i = 0, j = 0;

View File

@ -177,6 +177,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
ring_data_t *data = ((ring_data_t*)sqe->user_data);
journal.sector_info[cur_sector].written = true;
journal.sector_info[cur_sector].submit_id = ++journal.submit_id;
assert(journal.submit_id != 0); // check overflow
journal.submitting_sectors.push_back(cur_sector);
journal.sector_info[cur_sector].flush_count++;
data->iov = (struct iovec){
@ -192,8 +193,8 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
}
journal.sector_info[cur_sector].dirty = false;
// But always remember that this operation has to wait until this exact journal write is finished
journal.flushing_ops.insert((pending_journaling_t){
.flush_id = journal.sector_info[cur_sector].submit_id,
journal.flushing_ops.emplace(journal.sector_info[cur_sector].submit_id, (pending_journaling_t){
.pending = 1,
.sector = cur_sector,
.op = op,
});
@ -213,23 +214,43 @@ void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_i
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
disk_error_abort("journal write", data->res, data->iov.iov_len);
}
auto fl_it = journal.flushing_ops.upper_bound((pending_journaling_t){ .flush_id = flush_id });
if (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
auto fl_it = journal.flushing_ops.lower_bound(flush_id);
if (fl_it != journal.flushing_ops.end() && fl_it->first == flush_id && fl_it->second.sector >= 0)
{
journal.sector_info[fl_it->sector].flush_count--;
journal.sector_info[fl_it->second.sector].flush_count--;
}
while (fl_it != journal.flushing_ops.end() && fl_it->flush_id == flush_id)
auto is_first = fl_it == journal.flushing_ops.begin();
while (fl_it != journal.flushing_ops.end())
{
auto priv = PRIV(fl_it->op);
priv->pending_ops--;
assert(priv->pending_ops >= 0);
if (priv->pending_ops == 0)
bool del = false;
if (fl_it->first == flush_id)
{
release_journal_sectors(fl_it->op);
priv->op_state++;
ringloop->wakeup();
fl_it->second.pending = 0;
del = is_first;
}
else
{
del = !fl_it->second.pending;
}
if (del)
{
// Do not complete this operation if previous writes are unfinished
// Otherwise also complete following operations waiting for this one
auto priv = PRIV(fl_it->second.op);
priv->pending_ops--;
assert(priv->pending_ops >= 0);
if (priv->pending_ops == 0)
{
release_journal_sectors(fl_it->second.op);
priv->op_state++;
ringloop->wakeup();
}
journal.flushing_ops.erase(fl_it++);
}
else
{
fl_it++;
}
journal.flushing_ops.erase(fl_it++);
}
}

View File

@ -156,16 +156,11 @@ struct journal_sector_info_t
struct pending_journaling_t
{
uint64_t flush_id;
int pending;
int sector;
blockstore_op_t *op;
};
inline bool operator < (const pending_journaling_t & a, const pending_journaling_t & b)
{
return a.flush_id < b.flush_id || a.flush_id == b.flush_id && a.op < b.op;
}
struct journal_t
{
int fd;
@ -191,7 +186,7 @@ struct journal_t
int cur_sector = 0;
int in_sector_pos = 0;
std::vector<int> submitting_sectors;
std::set<pending_journaling_t> flushing_ops;
std::multimap<uint64_t, pending_journaling_t> flushing_ops;
uint64_t submit_id = 0;
// Used sector map

View File

@ -487,18 +487,24 @@ void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
}
if (!exists)
{
inode_space_stats[dirty_it->first.oid.inode] += dsk.data_block_size;
uint64_t space_id = dirty_it->first.oid.inode;
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
inode_space_stats[space_id] += dsk.data_block_size;
used_blocks++;
}
big_to_flush++;
}
else if (IS_DELETE(dirty_it->second.state))
{
auto & sp = inode_space_stats[dirty_it->first.oid.inode];
uint64_t space_id = dirty_it->first.oid.inode;
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
auto & sp = inode_space_stats[space_id];
if (sp > dsk.data_block_size)
sp -= dsk.data_block_size;
else
inode_space_stats.erase(dirty_it->first.oid.inode);
inode_space_stats.erase(space_id);
used_blocks--;
big_to_flush++;
}

View File

@ -95,11 +95,13 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
// Issue an additional sync so the delete reaches the journal
blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC;
sync_op->callback = [this, op](blockstore_op_t *sync_op)
sync_op->oid = op->oid;
sync_op->version = op->version;
sync_op->callback = [this](blockstore_op_t *sync_op)
{
flusher->unshift_flush((obj_ver_id){
.oid = op->oid,
.version = op->version-1,
.oid = sync_op->oid,
.version = sync_op->version-1,
}, true);
delete sync_op;
};
@ -425,7 +427,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
);
write_iodepth++;
// Got SQEs. Prepare previous journal sector write if required
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
if (immediate_commit == IMMEDIATE_NONE &&
!journal.entry_fits(sizeof(journal_entry_small_write) + dyn_size))
{
@ -501,7 +502,15 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
}
BS_SUBMIT_GET_SQE(sqe2, data2);
data2->iov = (struct iovec){ op->buf, op->len };
data2->callback = cb;
++journal.submit_id;
assert(journal.submit_id != 0); // check overflow
// Make subsequent journal writes wait for our data write
journal.flushing_ops.emplace(journal.submit_id, (pending_journaling_t){
.pending = 1,
.sector = -1,
.op = op,
});
data2->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
my_uring_prep_writev(
sqe2, dsk.journal_fd, &data2->iov, 1, journal.offset + journal.next_free
);

View File

@ -46,18 +46,21 @@ static const char* help_text =
"vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot>\n"
" Create a snapshot of image <name>. May be used live if only a single writer is active.\n"
"\n"
"vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]\n"
"vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force] [--down-ok]\n"
" Rename, resize image or change its readonly status. Images with children can't be made read-write.\n"
" If the new size is smaller than the old size, extra data will be purged.\n"
" You should resize file system in the image, if present, before shrinking it.\n"
" -f|--force Proceed with shrinking or setting readwrite flag even if the image has children.\n"
" --down-ok Proceed with shrinking even if some data will be left on unavailable OSDs.\n"
"\n"
"vitastor-cli rm <from> [<to>] [--writers-stopped]\n"
"vitastor-cli rm <from> [<to>] [--writers-stopped] [--down-ok]\n"
" Remove <from> or all layers between <from> and <to> (<to> must be a child of <from>),\n"
" rebasing all their children accordingly. --writers-stopped allows merging to be a bit\n"
" more effective in case of a single 'slim' read-write child and 'fat' removed parent:\n"
" the child is merged into parent and parent is renamed to child in that case.\n"
" In other cases parent layers are always merged into children.\n"
" Other options:\n"
" --down-ok Continue deletion/merging even if some data will be left on unavailable OSDs.\n"
"\n"
"vitastor-cli flatten <layer>\n"
" Flatten a layer, i.e. merge data and detach it from parents.\n"
@ -73,8 +76,8 @@ static const char* help_text =
" <to> must be a child of <from> and <target> may be one of the layers between\n"
" <from> and <to>, including <from> and <to>.\n"
"\n"
"vitastor-cli describe [--osds <osds>] [--object-state <states>] [--pool <pool>] [--inode <ino>] [--min-inode <ino>] [--max-inode <ino>] [--min-offset <offset>] [--max-offset <offset>]\n"
" Describe unclean object locations in the cluster.\n"
"vitastor-cli describe [OPTIONS]\n"
" Describe unclean object locations in the cluster. Options:\n"
" --osds <osds>\n"
" Only list objects from primary OSD(s) <osds>.\n"
" --object-state <states>\n"
@ -82,6 +85,8 @@ static const char* help_text =
" degraded, misplaced, incomplete, corrupted, inconsistent.\n"
" --pool <pool name or number>\n"
" Only list objects in the given pool.\n"
" --pg <pg number>\n"
" Only list objects in the given PG of the pool.\n"
" --inode, --min-inode, --max-inode\n"
" Restrict listing to specific inode numbers.\n"
" --min-offset, --max-offset\n"
@ -113,6 +118,58 @@ static const char* help_text =
" With --dry-run only checks if deletion is possible without data loss and\n"
" redundancy degradation.\n"
"\n"
"vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]\n"
" Create a pool. Required parameters:\n"
" -s|--pg_size R Number of replicas for replicated pools\n"
" --ec N+K Number of data (N) and parity (K) chunks for erasure-coded pools\n"
" -n|--pg_count N PG count for the new pool (start with 10*<OSD count>/pg_size rounded to a power of 2)\n"
" Optional parameters:\n"
" --pg_minsize <number> R or N+K minus number of failures to tolerate without downtime\n"
" --failure_domain host Failure domain: host, osd or a level from placement_levels. Default: host\n"
" --root_node <node> Put pool only on child OSDs of this placement tree node\n"
" --osd_tags <tag>[,<tag>]... Put pool only on OSDs tagged with all specified tags\n"
" --block_size 128k Put pool only on OSDs with this data block size\n"
" --bitmap_granularity 4k Put pool only on OSDs with this logical sector size\n"
" --immediate_commit none Put pool only on OSDs with this or larger immediate_commit (none < small < all)\n"
" --level_placement <rules> Use additional failure domain rules (example: \"dc=112233\")\n"
" --raw_placement <rules> Specify raw PG generation rules (see documentation for details)\n"
" --primary_affinity_tags tags Prefer to put primary copies on OSDs with all specified tags\n"
" --scrub_interval <time> Enable regular scrubbing for this pool. Format: number + unit s/m/h/d/M/y\n"
" --used_for_fs <name> Mark pool as used for VitastorFS with metadata in image <name>\n"
" --pg_stripe_size <number> Increase object grouping stripe\n"
" --max_osd_combinations 10000 Maximum number of random combinations for LP solver input\n"
" --wait Wait for the new pool to come online\n"
" -f|--force Do not check that cluster has enough OSDs to create the pool\n"
" Examples:\n"
" vitastor-cli create-pool test_x4 -s 4 -n 32\n"
" vitastor-cli create-pool test_ec42 --ec 4+2 -n 32\n"
"\n"
"vitastor-cli modify-pool|pool-modify <id|name> [--name <new_name>] [PARAMETERS...]\n"
" Modify an existing pool. Modifiable parameters:\n"
" [-s|--pg_size <number>] [--pg_minsize <number>] [-n|--pg_count <count>]\n"
" [--failure_domain <level>] [--root_node <node>] [--osd_tags <tags>] [--used_for_fs <name>]\n"
" [--max_osd_combinations <number>] [--primary_affinity_tags <tags>] [--scrub_interval <time>]\n"
" [--level_placement <rules>] [--raw_placement <rules>]\n"
" Non-modifiable parameters (changing them WILL lead to data loss):\n"
" [--block_size <size>] [--bitmap_granularity <size>]\n"
" [--immediate_commit <all|small|none>] [--pg_stripe_size <size>]\n"
" These, however, can still be modified with -f|--force.\n"
" See create-pool for parameter descriptions.\n"
" Examples:\n"
" vitastor-cli modify-pool pool_A --name pool_B\n"
" vitastor-cli modify-pool 2 --pg_size 4 -n 128\n"
"\n"
"vitastor-cli rm-pool|pool-rm [--force] <id|name>\n"
" Remove a pool. Refuses to remove pools with images without --force.\n"
"\n"
"vitastor-cli ls-pools|pool-ls|ls-pool|pools [-l] [--detail] [--sort FIELD] [-r] [-n N] [--stats] [<glob> ...]\n"
" List pools (only matching <glob> patterns if passed).\n"
" -l|--long Also report I/O statistics\n"
" --detail Use list format (not table), show all details\n"
" --sort FIELD Sort by specified field (see fields in --json output)\n"
" -r|--reverse Sort in descending order\n"
" -n|--count N Only list first N items\n"
"\n"
"Use vitastor-cli --help <command> for command details or vitastor-cli --help --all for all details.\n"
"\n"
"GLOBAL OPTIONS:\n"
@ -122,7 +179,7 @@ static const char* help_text =
" --parallel_osds M Work with M osds in parallel when possible (default 4)\n"
" --progress 1|0 Report progress (default 1)\n"
" --cas 1|0 Use CAS writes for flatten, merge, rm (default is decide automatically)\n"
" --no-color Disable colored output\n"
" --color 1|0 Enable/disable colored output and CR symbols (default 1 if stdout is a terminal)\n"
" --json JSON output\n"
;
@ -133,6 +190,7 @@ static json11::Json::object parse_args(int narg, const char *args[])
cfg["progress"] = "1";
for (int i = 1; i < narg; i++)
{
bool argHasValue = (!(i == narg-1) && (args[i+1][0] != '-'));
if (args[i][0] == '-' && args[i][1] == 'h' && args[i][2] == 0)
{
cfg["help"] = "1";
@ -143,15 +201,15 @@ static json11::Json::object parse_args(int narg, const char *args[])
}
else if (args[i][0] == '-' && args[i][1] == 'n' && args[i][2] == 0)
{
cfg["count"] = args[++i];
cfg["count"] = argHasValue ? args[++i] : "";
}
else if (args[i][0] == '-' && args[i][1] == 'p' && args[i][2] == 0)
{
cfg["pool"] = args[++i];
cfg["pool"] = argHasValue ? args[++i] : "";
}
else if (args[i][0] == '-' && args[i][1] == 's' && args[i][2] == 0)
{
cfg["size"] = args[++i];
cfg["size"] = argHasValue ? args[++i] : "";
}
else if (args[i][0] == '-' && args[i][1] == 'r' && args[i][2] == 0)
{
@ -164,17 +222,24 @@ static json11::Json::object parse_args(int narg, const char *args[])
else if (args[i][0] == '-' && args[i][1] == '-')
{
const char *opt = args[i]+2;
cfg[opt] = i == narg-1 || !strcmp(opt, "json") ||
if (!strcmp(opt, "json") || !strcmp(opt, "wait") ||
!strcmp(opt, "wait-list") || !strcmp(opt, "wait_list") ||
!strcmp(opt, "long") || !strcmp(opt, "del") ||
!strcmp(opt, "long") || !strcmp(opt, "detail") || !strcmp(opt, "del") ||
!strcmp(opt, "no-color") || !strcmp(opt, "no_color") ||
!strcmp(opt, "readonly") || !strcmp(opt, "readwrite") ||
!strcmp(opt, "force") || !strcmp(opt, "reverse") ||
!strcmp(opt, "allow-data-loss") || !strcmp(opt, "allow_data_loss") ||
!strcmp(opt, "down-ok") || !strcmp(opt, "down_ok") ||
!strcmp(opt, "dry-run") || !strcmp(opt, "dry_run") ||
!strcmp(opt, "help") || !strcmp(opt, "all") ||
(!strcmp(opt, "writers-stopped") || !strcmp(opt, "writers_stopped")) && strcmp("1", args[i+1]) != 0
? "1" : args[++i];
!strcmp(opt, "writers-stopped") || !strcmp(opt, "writers_stopped"))
{
cfg[opt] = "1";
}
else
{
cfg[opt] = argHasValue ? args[++i] : "";
}
}
else
{
@ -217,7 +282,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
else if (cmd[0] == "df")
{
// Show pool space stats
action_cb = p->start_df(cfg);
action_cb = p->start_pool_ls(cfg);
}
else if (cmd[0] == "ls")
{
@ -324,6 +389,44 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
// Allocate a new OSD number
action_cb = p->start_alloc_osd(cfg);
}
else if (cmd[0] == "create-pool" || cmd[0] == "pool-create")
{
// Create a new pool
if (cmd.size() > 1 && cfg["name"].is_null())
{
cfg["name"] = cmd[1];
}
action_cb = p->start_pool_create(cfg);
}
else if (cmd[0] == "modify-pool" || cmd[0] == "pool-modify")
{
// Modify existing pool
if (cmd.size() > 1)
{
cfg["old_name"] = cmd[1];
}
action_cb = p->start_pool_modify(cfg);
}
else if (cmd[0] == "rm-pool" || cmd[0] == "pool-rm")
{
// Remove existing pool
if (cmd.size() > 1)
{
cfg["pool"] = cmd[1];
}
action_cb = p->start_pool_rm(cfg);
}
else if (cmd[0] == "ls-pool" || cmd[0] == "pool-ls" || cmd[0] == "ls-pools" || cmd[0] == "pools")
{
// Show pool list
cfg["show_recovery"] = 1;
if (cmd.size() > 1)
{
cmd.erase(cmd.begin(), cmd.begin()+1);
cfg["names"] = cmd;
}
action_cb = p->start_pool_ls(cfg);
}
else
{
result = { .err = EINVAL, .text = "unknown command: "+cmd[0].string_value() };
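
With the command wiring above and the help text added earlier in this file, the new pool management subcommands can be invoked like this (examples taken or adapted from the built-in help; 'df' now also routes to the same pool listing code):

  vitastor-cli create-pool test_ec42 --ec 4+2 -n 32
  vitastor-cli modify-pool 2 --pg_size 4 -n 128
  vitastor-cli ls-pools --detail
  vitastor-cli rm-pool --force test_ec42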

View File

@ -46,6 +46,7 @@ public:
json11::Json etcd_result;
void parse_config(json11::Json::object & cfg);
json11::Json parse_tags(std::string tags);
void change_parent(inode_t cur, inode_t new_parent, cli_result_t *result);
inode_config_t* get_inode_cfg(const std::string & name);
@ -58,7 +59,6 @@ public:
std::function<bool(cli_result_t &)> start_status(json11::Json);
std::function<bool(cli_result_t &)> start_describe(json11::Json);
std::function<bool(cli_result_t &)> start_fix(json11::Json);
std::function<bool(cli_result_t &)> start_df(json11::Json);
std::function<bool(cli_result_t &)> start_ls(json11::Json);
std::function<bool(cli_result_t &)> start_create(json11::Json);
std::function<bool(cli_result_t &)> start_modify(json11::Json);
@ -68,6 +68,10 @@ public:
std::function<bool(cli_result_t &)> start_rm(json11::Json);
std::function<bool(cli_result_t &)> start_rm_osd(json11::Json cfg);
std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json cfg);
std::function<bool(cli_result_t &)> start_pool_create(json11::Json);
std::function<bool(cli_result_t &)> start_pool_modify(json11::Json);
std::function<bool(cli_result_t &)> start_pool_rm(json11::Json);
std::function<bool(cli_result_t &)> start_pool_ls(json11::Json);
// Should be called like loop_and_wait(start_status(), <completion callback>)
void loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std::function<void(const cli_result_t &)> complete_cb);
@ -77,8 +81,13 @@ public:
std::string print_table(json11::Json items, json11::Json header, bool use_esc);
size_t print_detail_title_len(json11::Json item, std::vector<std::pair<std::string, std::string>> names, size_t prev_len);
std::string print_detail(json11::Json item, std::vector<std::pair<std::string, std::string>> names, size_t title_len, bool use_esc);
std::string format_lat(uint64_t lat);
std::string format_q(double depth);
bool stupid_glob(const std::string str, const std::string glob);
std::string implode(const std::string & sep, json11::Json array);

View File

@ -1,6 +1,7 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <unistd.h>
#include "str_util.h"
#include "cluster_client.h"
#include "cli.h"
@ -113,7 +114,12 @@ void cli_tool_t::parse_config(json11::Json::object & cfg)
else
kv_it++;
}
color = !cfg["no_color"].bool_value();
if (cfg.find("no_color") != cfg.end())
color = !cfg["no_color"].bool_value();
else if (cfg.find("color") != cfg.end())
color = cfg["color"].bool_value();
else
color = isatty(1);
json_output = cfg["json"].bool_value();
iodepth = cfg["iodepth"].uint64_value();
if (!iodepth)
@ -147,6 +153,7 @@ void cli_tool_t::loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std:
ringloop->unregister_consumer(&looper->consumer);
looper->loop_cb = NULL;
looper->complete_cb(looper->result);
ringloop->submit();
delete looper;
return;
}

View File

@ -27,6 +27,7 @@ struct image_creator_t
std::string image_name, new_snap, new_parent;
json11::Json new_meta;
uint64_t size;
bool force = false;
bool force_size = false;
pool_id_t old_pool_id = 0;
@ -45,6 +46,7 @@ struct image_creator_t
void loop()
{
auto & pools = parent->cli->st_cli.pool_config;
if (state >= 1)
goto resume_1;
if (image_name == "")
@ -62,7 +64,6 @@ struct image_creator_t
}
if (new_pool_id)
{
auto & pools = parent->cli->st_cli.pool_config;
if (pools.find(new_pool_id) == pools.end())
{
result = (cli_result_t){ .err = ENOENT, .text = "Pool "+std::to_string(new_pool_id)+" does not exist" };
@ -72,7 +73,7 @@ struct image_creator_t
}
else if (new_pool_name != "")
{
for (auto & ic: parent->cli->st_cli.pool_config)
for (auto & ic: pools)
{
if (ic.second.name == new_pool_name)
{
@ -87,10 +88,20 @@ struct image_creator_t
return;
}
}
else if (parent->cli->st_cli.pool_config.size() == 1)
else if (pools.size() == 1)
{
auto it = parent->cli->st_cli.pool_config.begin();
new_pool_id = it->first;
new_pool_id = pools.begin()->first;
}
if (new_pool_id && !pools.at(new_pool_id).used_for_fs.empty() && !force)
{
result = (cli_result_t){
.err = EINVAL,
.text = "Pool "+pools.at(new_pool_id).name+
" is used for VitastorFS "+pools.at(new_pool_id).used_for_fs+
". Use --force if you really know what you are doing",
};
state = 100;
return;
}
state = 1;
resume_1:
@ -183,7 +194,16 @@ resume_3:
// Save into inode_config for library users to be able to take it from there immediately
new_cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
parent->cli->st_cli.insert_inode_config(new_cfg);
result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" created" };
result = (cli_result_t){
.err = 0,
.text = "Image "+image_name+" created",
.data = json11::Json::object {
{ "name", image_name },
{ "pool", new_pool_name },
{ "parent", new_parent },
{ "size", size },
}
};
state = 100;
}
@ -251,7 +271,16 @@ resume_4:
// Save into inode_config for library users to be able to take it from there immediately
new_cfg.mod_revision = parent->etcd_result["responses"][0]["response_put"]["header"]["revision"].uint64_value();
parent->cli->st_cli.insert_inode_config(new_cfg);
result = (cli_result_t){ .err = 0, .text = "Snapshot "+image_name+"@"+new_snap+" created" };
result = (cli_result_t){
.err = 0,
.text = "Snapshot "+image_name+"@"+new_snap+" created",
.data = json11::Json::object {
{ "name", image_name+"@"+new_snap },
{ "pool", (uint64_t)new_pool_id },
{ "parent", new_parent },
{ "size", size },
}
};
state = 100;
}
@ -514,6 +543,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_create(json11::Json cfg)
image_creator->image_name = cfg["image"].string_value();
image_creator->new_pool_id = cfg["pool"].uint64_value();
image_creator->new_pool_name = cfg["pool"].string_value();
image_creator->force = cfg["force"].bool_value();
image_creator->force_size = cfg["force_size"].bool_value();
if (cfg["image_meta"].is_object())
{
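Both creation paths now attach a machine-readable JSON object to the result in addition to the human-readable text. A sketch of how an embedding application might consume it, assuming only what the diff shows (cli_result_t carries err, text and data; the field names come from the objects built above):
#include <cstdio>
#include <cstdint>
#include "cli.h"
// Sketch only: prints the structured data returned by image/snapshot creation.
static void on_create_result(const cli_result_t & res)
{
    if (res.err)
    {
        fprintf(stderr, "error %d: %s\n", res.err, res.text.c_str());
        return;
    }
    printf(
        "created %s in pool %s, size %ju bytes\n",
        res.data["name"].string_value().c_str(),
        res.data["pool"].as_string().c_str(),
        (uintmax_t)res.data["size"].uint64_value()
    );
}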

View File

@ -37,6 +37,7 @@ struct cli_describe_t
{
uint64_t object_state = 0;
pool_id_t only_pool = 0;
pg_num_t only_pg = 0;
std::vector<uint64_t> only_osds;
uint64_t min_inode = 0, max_inode = 0;
uint64_t min_offset = 0, max_offset = 0;
@ -68,6 +69,7 @@ struct cli_describe_t
}
}
}
only_pg = cfg["pg"].uint64_value();
min_inode = cfg["inode"].uint64_value();
if (min_inode)
{
@ -142,8 +144,8 @@ struct cli_describe_t
{
osd_op_t *op = new osd_op_t;
op->req = (osd_any_op_t){
.describe = {
.header = {
.describe = (osd_op_describe_t){
.header = (osd_op_header_t){
.magic = SECONDARY_OSD_OP_MAGIC,
.id = parent->cli->next_op_id(),
.opcode = OSD_OP_DESCRIBE,
@ -153,6 +155,8 @@ struct cli_describe_t
.min_offset = min_offset,
.max_inode = max_inode,
.max_offset = max_offset,
.pool_id = only_pool,
.pg_num = only_pg,
},
};
op->callback = [this, osd_num = only_osds[i]](osd_op_t *op)
@ -182,7 +186,7 @@ struct cli_describe_t
printf(
(parent->json_output
? (count > 0 ? ",\n " FMT : " " FMT)
: "%jx:%jx part %u on OSD %ju%s%s%s\n"),
: "0x%jx:0x%jx part %u on OSD %ju%s%s%s\n"),
#undef FMT
items[i].inode, items[i].stripe,
items[i].role, items[i].osd_num,

View File

@ -1,243 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "cli.h"
#include "cluster_client.h"
#include "str_util.h"
// List pools with space statistics
struct pool_lister_t
{
cli_tool_t *parent;
int state = 0;
json11::Json space_info;
cli_result_t result;
std::map<pool_id_t, json11::Json::object> pool_stats;
bool is_done()
{
return state == 100;
}
void get_stats()
{
if (state == 1)
goto resume_1;
// Space statistics - pool/stats/<pool>
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pool/stats/"
) },
{ "range_end", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pool/stats0"
) },
} },
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/osd/stats/"
) },
{ "range_end", base64_encode(
parent->cli->st_cli.etcd_prefix+"/osd/stats0"
) },
} },
},
} },
});
state = 1;
resume_1:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
space_info = parent->etcd_result;
std::map<pool_id_t, uint64_t> osd_free;
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// pool ID
pool_id_t pool_id;
char null_byte = 0;
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/pool/stats/%u%c", &pool_id, &null_byte);
if (scanned != 1 || !pool_id || pool_id >= POOL_ID_MAX)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
// pool/stats/<N>
pool_stats[pool_id] = kv.value.object_items();
}
for (auto & kv_item: space_info["responses"][1]["response_range"]["kvs"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// osd ID
osd_num_t osd_num;
char null_byte = 0;
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%ju%c", &osd_num, &null_byte);
if (scanned != 1 || !osd_num || osd_num >= POOL_ID_MAX)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
// osd/stats/<N>::free
osd_free[osd_num] = kv.value["free"].uint64_value();
}
// Calculate max_avail for each pool
for (auto & pp: parent->cli->st_cli.pool_config)
{
auto & pool_cfg = pp.second;
uint64_t pool_avail = UINT64_MAX;
std::map<osd_num_t, uint64_t> pg_per_osd;
for (auto & pgp: pool_cfg.pg_config)
{
for (auto pg_osd: pgp.second.target_set)
{
if (pg_osd != 0)
{
pg_per_osd[pg_osd]++;
}
}
}
for (auto pg_per_pair: pg_per_osd)
{
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
if (pool_avail > pg_free)
{
pool_avail = pg_free;
}
}
if (pool_avail == UINT64_MAX)
{
pool_avail = 0;
}
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
{
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
}
pool_stats[pool_cfg.id] = json11::Json::object {
{ "id", (uint64_t)pool_cfg.id },
{ "name", pool_cfg.name },
{ "pg_count", pool_cfg.pg_count },
{ "real_pg_count", pool_cfg.real_pg_count },
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
: "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) },
{ "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * ((uint64_t)1<<40)) },
{ "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * ((uint64_t)1<<40)) },
{ "max_available", pool_avail },
{ "raw_to_usable", pool_stats[pool_cfg.id]["raw_to_usable"].number_value() },
{ "space_efficiency", pool_stats[pool_cfg.id]["space_efficiency"].number_value() },
{ "pg_real_size", pool_stats[pool_cfg.id]["pg_real_size"].uint64_value() },
{ "failure_domain", pool_cfg.failure_domain },
};
}
}
json11::Json::array to_list()
{
json11::Json::array list;
for (auto & kv: pool_stats)
{
list.push_back(kv.second);
}
return list;
}
void loop()
{
get_stats();
if (parent->waiting > 0)
return;
if (state == 100)
return;
if (parent->json_output)
{
// JSON output
result.data = to_list();
state = 100;
return;
}
// Table output: name, scheme_name, pg_count, total, used, max_avail, used%, efficiency
json11::Json::array cols;
cols.push_back(json11::Json::object{
{ "key", "name" },
{ "title", "NAME" },
});
cols.push_back(json11::Json::object{
{ "key", "scheme_name" },
{ "title", "SCHEME" },
});
cols.push_back(json11::Json::object{
{ "key", "pg_count_fmt" },
{ "title", "PGS" },
});
cols.push_back(json11::Json::object{
{ "key", "total_fmt" },
{ "title", "TOTAL" },
});
cols.push_back(json11::Json::object{
{ "key", "used_fmt" },
{ "title", "USED" },
});
cols.push_back(json11::Json::object{
{ "key", "max_avail_fmt" },
{ "title", "AVAILABLE" },
});
cols.push_back(json11::Json::object{
{ "key", "used_pct" },
{ "title", "USED%" },
});
cols.push_back(json11::Json::object{
{ "key", "eff_fmt" },
{ "title", "EFFICIENCY" },
});
json11::Json::array list;
for (auto & kv: pool_stats)
{
double raw_to = kv.second["raw_to_usable"].number_value();
if (raw_to < 0.000001 && raw_to > -0.000001)
raw_to = 1;
kv.second["pg_count_fmt"] = kv.second["real_pg_count"] == kv.second["pg_count"]
? kv.second["real_pg_count"].as_string()
: kv.second["real_pg_count"].as_string()+"->"+kv.second["pg_count"].as_string();
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());
kv.second["used_pct"] = format_q(kv.second["total_raw"].uint64_value()
? (100 - 100*kv.second["max_available"].uint64_value() *
kv.second["raw_to_usable"].number_value() / kv.second["total_raw"].uint64_value())
: 100)+"%";
kv.second["eff_fmt"] = format_q(kv.second["space_efficiency"].number_value()*100)+"%";
}
result.data = to_list();
result.text = print_table(result.data, cols, parent->color);
state = 100;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_df(json11::Json cfg)
{
auto lister = new pool_lister_t();
lister->parent = this;
return [lister](cli_result_t & result)
{
lister->loop();
if (lister->is_done())
{
result = lister->result;
delete lister;
return true;
}
return false;
};
}
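The available-space estimate used above (and carried over into the new src/cli_pool_ls.cpp) works per OSD: for every OSD that hosts PGs of the pool it computes free_space * real_pg_count / pgs_on_that_osd, takes the minimum over all OSDs, and for EC/XOR pools multiplies the result by the number of data chunks. A self-contained sketch with made-up numbers (all values are illustrative only):
#include <cstdint>
#include <cstdio>
#include <map>
#include <algorithm>
int main()
{
    // Illustrative only: 3 OSDs, a pool with real_pg_count = 32 PGs spread
    // evenly, EC 2+1 (2 data chunks per PG).
    std::map<uint64_t, uint64_t> osd_free = { { 1, 100<<20 }, { 2, 80<<20 }, { 3, 120<<20 } };
    std::map<uint64_t, uint64_t> pg_per_osd = { { 1, 32 }, { 2, 32 }, { 3, 32 } };
    uint64_t real_pg_count = 32, data_chunks = 2;
    uint64_t pool_avail = UINT64_MAX;
    for (auto & kv: pg_per_osd)
    {
        uint64_t pg_free = osd_free[kv.first] * real_pg_count / kv.second;
        pool_avail = std::min(pool_avail, pg_free);
    }
    if (pool_avail == UINT64_MAX)
        pool_avail = 0;
    pool_avail *= data_chunks; // EC: only data chunks count as usable space
    // The OSD with the least free space (80 MiB) limits the pool: 80 MiB * 2 = 160 MiB.
    printf("max_available = %ju bytes\n", (uintmax_t)pool_avail);
    return 0;
}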

View File

@ -275,7 +275,9 @@ struct snap_merger_t
processed++;
if (parent->progress && !(processed % 128))
{
printf("\rFiltering target blocks: %ju/%ju", processed, to_process);
fprintf(stderr, parent->color
? "\rFiltering target blocks: %ju/%ju"
: "Filtering target blocks: %ju/%ju\n", processed, to_process);
}
}
if (in_flight > 0 || oit != merge_offsets.end())
@ -285,7 +287,9 @@ struct snap_merger_t
}
if (parent->progress)
{
printf("\r%ju full blocks of target filtered out\n", to_process-merge_offsets.size());
fprintf(stderr, parent->color
? "\r%ju full blocks of target filtered out\n"
: "%ju full blocks of target filtered out\n", to_process-merge_offsets.size());
}
}
state = 3;
@ -320,7 +324,9 @@ struct snap_merger_t
processed++;
if (parent->progress && !(processed % 128))
{
printf("\rOverwriting blocks: %ju/%ju", processed, to_process);
fprintf(stderr, parent->color
? "\rOverwriting blocks: %ju/%ju"
: "Overwriting blocks: %ju/%ju\n", processed, to_process);
}
}
if (in_flight == 0 && rwo_error.size())
@ -339,10 +345,16 @@ struct snap_merger_t
}
if (parent->progress)
{
printf("\rOverwriting blocks: %ju/%ju\n", to_process, to_process);
fprintf(stderr, parent->color
? "\rOverwriting blocks: %ju/%ju\n"
: "Overwriting blocks: %ju/%ju\n", to_process, to_process);
}
// Done
result = (cli_result_t){ .text = "Done, layers from "+from_name+" to "+to_name+" merged into "+target_name };
result = (cli_result_t){ .text = "Done, layers from "+from_name+" to "+to_name+" merged into "+target_name, .data = json11::Json::object {
{ "from", from_name },
{ "to", to_name },
{ "into", target_name },
}};
state = 100;
resume_100:
return;

View File

@ -15,6 +15,7 @@ struct image_changer_t
uint64_t new_size = 0;
bool force_size = false, inc_size = false;
bool set_readonly = false, set_readwrite = false, force = false;
bool down_ok = false;
// interval between fsyncs
int fsync_interval = 128;
@ -84,7 +85,10 @@ struct image_changer_t
(!new_size && !force_size || cfg.size == new_size || cfg.size >= new_size && inc_size) &&
(new_name == "" || new_name == image_name))
{
result = (cli_result_t){ .text = "No change" };
result = (cli_result_t){ .err = 0, .text = "No change", .data = json11::Json::object {
{ "error_code", 0 },
{ "error_text", "No change" },
}};
state = 100;
return;
}
@ -105,6 +109,7 @@ struct image_changer_t
{ "pool", (uint64_t)INODE_POOL(inode_num) },
{ "fsync-interval", fsync_interval },
{ "min-offset", ((new_size+4095)/4096)*4096 },
{ "down-ok", down_ok },
});
resume_1:
while (!cb(result))
@ -220,7 +225,16 @@ resume_2:
parent->cli->st_cli.inode_by_name.erase(image_name);
}
parent->cli->st_cli.insert_inode_config(cfg);
result = (cli_result_t){ .err = 0, .text = "Image "+image_name+" modified" };
result = (cli_result_t){
.err = 0,
.text = "Image "+image_name+" modified",
.data = json11::Json::object {
{ "name", image_name },
{ "inode", INODE_NO_POOL(inode_num) },
{ "pool", (uint64_t)INODE_POOL(inode_num) },
{ "size", new_size },
}
};
state = 100;
}
};
@ -240,6 +254,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_modify(json11::Json cfg)
changer->fsync_interval = cfg["fsync_interval"].uint64_value();
if (!changer->fsync_interval)
changer->fsync_interval = 128;
changer->down_ok = cfg["down_ok"].bool_value();
// FIXME Check that the image doesn't have children when shrinking
return [changer](cli_result_t & result)
{

src/cli_pool_cfg.cpp (new file, 339 lines)
View File

@ -0,0 +1,339 @@
// Copyright (c) Vitaliy Filippov, 2024
// License: VNPL-1.1 (see README.md for details)
#include "cli_pool_cfg.h"
#include "etcd_state_client.h"
#include "str_util.h"
std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json old_cfg,
uint64_t global_block_size, uint64_t global_bitmap_granularity, bool force)
{
// short option names
if (new_cfg.find("count") != new_cfg.end())
{
new_cfg["pg_count"] = new_cfg["count"];
new_cfg.erase("count");
}
if (new_cfg.find("size") != new_cfg.end())
{
new_cfg["pg_size"] = new_cfg["size"];
new_cfg.erase("size");
}
// --ec shortcut
if (new_cfg.find("ec") != new_cfg.end())
{
if (new_cfg.find("scheme") != new_cfg.end() ||
new_cfg.find("pg_size") != new_cfg.end() ||
new_cfg.find("parity_chunks") != new_cfg.end())
{
return "--ec can't be used with --pg_size, --parity_chunks or --scheme";
}
// pg_size = N+K
// parity_chunks = K
uint64_t data_chunks = 0, parity_chunks = 0;
char null_byte = 0;
int ret = sscanf(new_cfg["ec"].string_value().c_str(), "%ju+%ju%c", &data_chunks, &parity_chunks, &null_byte);
if (ret != 2 || !data_chunks || !parity_chunks)
{
return "--ec should be <N>+<K> format (<N>, <K> - numbers)";
}
new_cfg.erase("ec");
new_cfg["scheme"] = "ec";
new_cfg["pg_size"] = data_chunks+parity_chunks;
new_cfg["parity_chunks"] = parity_chunks;
}
if (old_cfg.is_null() && new_cfg["scheme"].string_value() == "")
{
// Default scheme
new_cfg["scheme"] = "replicated";
}
if (new_cfg.find("pg_minsize") == new_cfg.end() && (old_cfg.is_null() || new_cfg.find("pg_size") != new_cfg.end()))
{
// Default pg_minsize
if (new_cfg["scheme"] == "replicated")
{
// pg_minsize = (pg_size > 2) ? 2 : 1
new_cfg["pg_minsize"] = new_cfg["pg_size"].uint64_value() > 2 ? 2 : 1;
}
else // ec or xor
{
// pg_minsize = (K > 1) ? N + 1 : N
new_cfg["pg_minsize"] = new_cfg["pg_size"].uint64_value() - new_cfg["parity_chunks"].uint64_value() +
(new_cfg["parity_chunks"].uint64_value() > 1 ? 1 : 0);
}
}
// Check integer values and unknown keys
for (auto kv_it = new_cfg.begin(); kv_it != new_cfg.end(); )
{
auto & key = kv_it->first;
auto & value = kv_it->second;
if (key == "pg_size" || key == "parity_chunks" || key == "pg_minsize" ||
key == "pg_count" || key == "max_osd_combinations" ||
key == "bitmap_granularity" || key == "pg_stripe_size")
{
if (value.is_number() && value.uint64_value() != value.number_value() ||
value.is_string() && !value.uint64_value() && value.string_value() != "0")
{
return key+" must be a non-negative integer";
}
value = value.uint64_value();
}
else if (key == "block_size")
{
value = value.is_string() ? parse_size(value.string_value()) : value.uint64_value();
if (!value)
{
return key+" must be an integer with or without size suffix (K/M/G/T)";
}
}
else if (key == "name" || key == "scheme" || key == "immediate_commit" ||
key == "failure_domain" || key == "root_node" || key == "scrub_interval" || key == "used_for_fs" ||
key == "raw_placement")
{
if (!value.is_string())
{
return key+" must be a string";
}
}
else if (key == "level_placement")
{
// level=rule, level=rule, ...
if (!value.is_object())
{
json11::Json::object obj;
for (auto & item: explode(",", value.string_value(), true))
{
auto pair = explode("=", item, true);
if (pair.size() >= 2)
{
obj[pair[0]] = pair[1];
}
}
if (obj.size())
{
value = obj;
}
else
{
new_cfg.erase(kv_it++);
continue;
}
}
}
else if (key == "osd_tags" || key == "primary_affinity_tags")
{
if (value.is_string())
{
value = explode(",", value.string_value(), true);
}
}
else
{
// Unknown parameter
new_cfg.erase(kv_it++);
continue;
}
kv_it++;
}
// Merge with the old config
if (!old_cfg.is_null())
{
for (auto & kv: old_cfg.object_items())
{
if (new_cfg.find(kv.first) == new_cfg.end())
{
new_cfg[kv.first] = kv.second;
}
}
}
// Check after merging
if (new_cfg["scheme"] != "ec")
{
new_cfg.erase("parity_chunks");
}
if (new_cfg.find("used_for_fs") != new_cfg.end() && new_cfg["used_for_fs"].string_value() == "")
{
new_cfg.erase("used_for_fs");
}
// Prevent autovivification of object keys: from this point the config is only checked, not modified
json11::Json cfg = new_cfg;
// Validate changes
if (!old_cfg.is_null() && !force)
{
if (old_cfg["scheme"] != cfg["scheme"])
{
return "Changing scheme for an existing pool will lead to data loss. Use --force to proceed";
}
if (etcd_state_client_t::parse_scheme(old_cfg["scheme"].string_value()) == POOL_SCHEME_EC)
{
uint64_t old_data_chunks = old_cfg["pg_size"].uint64_value() - old_cfg["parity_chunks"].uint64_value();
uint64_t new_data_chunks = cfg["pg_size"].uint64_value() - cfg["parity_chunks"].uint64_value();
if (old_data_chunks != new_data_chunks)
{
return "Changing EC data chunk count for an existing pool will lead to data loss. Use --force to proceed";
}
}
if (old_cfg["block_size"] != cfg["block_size"] ||
old_cfg["bitmap_granularity"] != cfg["bitmap_granularity"] ||
old_cfg["immediate_commit"] != cfg["immediate_commit"])
{
return "Changing block_size, bitmap_granularity or immediate_commit"
" for an existing pool will lead to incomplete PGs. Use --force to proceed";
}
if (old_cfg["pg_stripe_size"] != cfg["pg_stripe_size"])
{
return "Changing pg_stripe_size for an existing pool will lead to data loss. Use --force to proceed";
}
}
// Validate values
if (cfg["name"].string_value() == "")
{
return "Non-empty pool name is required";
}
// scheme
auto scheme = etcd_state_client_t::parse_scheme(cfg["scheme"].string_value());
if (!scheme)
{
return "Scheme must be one of \"replicated\", \"ec\" or \"xor\"";
}
// pg_size
auto pg_size = cfg["pg_size"].uint64_value();
if (!pg_size)
{
return "Non-zero PG size is required";
}
if (scheme != POOL_SCHEME_REPLICATED && pg_size < 3)
{
return "PG size can't be smaller than 3 for EC/XOR pools";
}
if (pg_size > 256)
{
return "PG size can't be greater than 256";
}
// PG rules
if (!cfg["level_placement"].is_null())
{
for (auto & lr: cfg["level_placement"].object_items())
{
int len = 0;
if (lr.second.is_array())
{
for (auto & lri: lr.second.array_items())
{
if (!lri.is_string() && !lri.is_number())
{
return "--level_placement contains an array with non-scalar value: "+lri.dump();
}
}
len = lr.second.array_items().size();
}
else if (!lr.second.is_string())
{
return "--level_placement contains a non-array and non-string value: "+lr.second.dump();
}
else
{
len = lr.second.string_value().size();
}
if (len != pg_size)
{
return "values in --level_placement should be exactly pg_size ("+std::to_string(pg_size)+") long";
}
}
}
// parity_chunks
uint64_t parity_chunks = 1;
if (scheme == POOL_SCHEME_EC)
{
parity_chunks = cfg["parity_chunks"].uint64_value();
if (!parity_chunks)
{
return "Non-zero parity_chunks is required";
}
if (parity_chunks > pg_size-2)
{
return "parity_chunks can't be greater than "+std::to_string(pg_size-2)+" (PG size - 2)";
}
}
// pg_minsize
auto pg_minsize = cfg["pg_minsize"].uint64_value();
if (!pg_minsize)
{
return "Non-zero pg_minsize is required";
}
else if (pg_minsize > pg_size)
{
return "pg_minsize can't be greater than "+std::to_string(pg_size)+" (PG size)";
}
else if (scheme != POOL_SCHEME_REPLICATED && pg_minsize < pg_size-parity_chunks)
{
return "pg_minsize can't be smaller than "+std::to_string(pg_size-parity_chunks)+
" (pg_size - parity_chunks) for XOR/EC pool";
}
// pg_count
if (!cfg["pg_count"].uint64_value())
{
return "Non-zero pg_count is required";
}
// max_osd_combinations
if (!cfg["max_osd_combinations"].is_null() && cfg["max_osd_combinations"].uint64_value() < 100)
{
return "max_osd_combinations must be at least 100, but it is "+cfg["max_osd_combinations"].as_string();
}
// block_size
auto block_size = cfg["block_size"].uint64_value();
if (!cfg["block_size"].is_null() && ((block_size & (block_size-1)) ||
block_size < MIN_DATA_BLOCK_SIZE || block_size > MAX_DATA_BLOCK_SIZE))
{
return "block_size must be a power of two between "+std::to_string(MIN_DATA_BLOCK_SIZE)+
" and "+std::to_string(MAX_DATA_BLOCK_SIZE)+", but it is "+std::to_string(block_size);
}
block_size = (block_size ? block_size : global_block_size);
// bitmap_granularity
auto bitmap_granularity = cfg["bitmap_granularity"].uint64_value();
if (!cfg["bitmap_granularity"].is_null() && (!bitmap_granularity || (bitmap_granularity % 512)))
{
return "bitmap_granularity must be a multiple of 512, but it is "+std::to_string(bitmap_granularity);
}
bitmap_granularity = (bitmap_granularity ? bitmap_granularity : global_bitmap_granularity);
if (block_size % bitmap_granularity)
{
return "bitmap_granularity must divide data block size ("+std::to_string(block_size)+"), but it is "+std::to_string(bitmap_granularity);
}
// immediate_commit
if (!cfg["immediate_commit"].is_null() && !etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value()))
{
return "immediate_commit must be one of \"all\", \"small\", or \"none\", but it is "+cfg["immediate_commit"].as_string();
}
// scrub_interval
if (!cfg["scrub_interval"].is_null())
{
bool ok;
parse_time(cfg["scrub_interval"].string_value(), &ok);
if (!ok)
{
return "scrub_interval must be a time interval (number + unit s/m/h/d/M/y), but it is "+cfg["scrub_interval"].as_string();
}
}
return "";
}
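For reference, a minimal usage sketch of validate_pool_config(), mirroring something like "vitastor-cli create-pool testpool --ec 2+1 --pg_count 256". The 128 KB global block size and 4 KB bitmap granularity passed here are assumed defaults, not values taken from this changeset:
#include <cstdio>
#include "cli_pool_cfg.h"
int main()
{
    json11::Json::object cfg {
        { "name", "testpool" },
        { "ec", "2+1" },
        { "pg_count", 256 },
    };
    std::string err = validate_pool_config(cfg, json11::Json(), 128*1024, 4096, false);
    if (err != "")
        fprintf(stderr, "%s\n", err.c_str());
    else
        // The --ec shortcut is expanded and defaults are filled in:
        // scheme="ec", pg_size=3, parity_chunks=1, pg_minsize=2.
        printf("%s\n", json11::Json(cfg).dump().c_str());
    return 0;
}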

src/cli_pool_cfg.h (new file, 10 lines)
View File

@ -0,0 +1,10 @@
// Copyright (c) Vitaliy Filippov, 2024
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include "json11/json11.hpp"
#include <stdint.h>
std::string validate_pool_config(json11::Json::object & new_cfg, json11::Json old_cfg,
uint64_t global_block_size, uint64_t global_bitmap_granularity, bool force);

src/cli_pool_create.cpp (new file, 626 lines)
View File

@ -0,0 +1,626 @@
// Copyright (c) MIND Software LLC, 2023 (info@mindsw.io)
// I accept Vitastor CLA: see CLA-en.md for details
// Copyright (c) Vitaliy Filippov, 2024
// License: VNPL-1.1 (see README.md for details)
#include <ctype.h>
#include "cli.h"
#include "cli_pool_cfg.h"
#include "cluster_client.h"
#include "epoll_manager.h"
#include "pg_states.h"
#include "str_util.h"
struct pool_creator_t
{
cli_tool_t *parent;
json11::Json cfg;
bool force = false;
bool wait = false;
int state = 0;
cli_result_t result;
struct {
uint32_t retries = 5;
uint32_t interval = 0;
bool passed = false;
} create_check;
uint64_t new_id = 1;
uint64_t new_pools_mod_rev;
json11::Json state_node_tree;
json11::Json new_pools;
bool is_done() { return state == 100; }
void loop()
{
if (state == 1)
goto resume_1;
else if (state == 2)
goto resume_2;
else if (state == 3)
goto resume_3;
else if (state == 4)
goto resume_4;
else if (state == 5)
goto resume_5;
else if (state == 6)
goto resume_6;
else if (state == 7)
goto resume_7;
else if (state == 8)
goto resume_8;
// Validate pool parameters
{
auto new_cfg = cfg.object_items();
result.text = validate_pool_config(new_cfg, json11::Json(), parent->cli->st_cli.global_block_size,
parent->cli->st_cli.global_bitmap_granularity, force);
cfg = new_cfg;
}
if (result.text != "")
{
result.err = EINVAL;
state = 100;
return;
}
state = 1;
resume_1:
// If not forced, check that we have enough osds for pg_size
if (!force)
{
// Get node_placement configuration from etcd
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/node_placement") },
} }
},
} },
});
state = 2;
resume_2:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
// Get state_node_tree based on node_placement and osd peer states
{
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
state_node_tree = get_state_node_tree(kv.value.object_items());
}
// Skip tag checks if the pool has no tags
if (cfg["osd_tags"].array_items().size())
{
// Get osd configs (for tags) of osds in state_node_tree
{
json11::Json::array osd_configs;
for (auto osd_num: state_node_tree["osds"].array_items())
{
osd_configs.push_back(json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/"+osd_num.as_string()) },
} }
});
}
parent->etcd_txn(json11::Json::object { { "success", osd_configs, }, });
}
state = 3;
resume_3:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
// Filter out osds from state_node_tree based on pool/osd tags
{
std::vector<json11::Json> osd_configs;
for (auto & ocr: parent->etcd_result["responses"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(ocr["response_range"]["kvs"][0]);
osd_configs.push_back(kv.value);
}
state_node_tree = filter_state_node_tree_by_tags(state_node_tree, osd_configs);
}
}
// Get stats (for block_size, bitmap_granularity, ...) of osds in state_node_tree
{
json11::Json::array osd_stats;
for (auto osd_num: state_node_tree["osds"].array_items())
{
osd_stats.push_back(json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/osd/stats/"+osd_num.as_string()) },
} }
});
}
parent->etcd_txn(json11::Json::object { { "success", osd_stats, }, });
}
state = 4;
resume_4:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
// Filter osds from state_node_tree based on pool parameters and osd stats
{
std::vector<json11::Json> osd_stats;
for (auto & ocr: parent->etcd_result["responses"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(ocr["response_range"]["kvs"][0]);
osd_stats.push_back(kv.value);
}
state_node_tree = filter_state_node_tree_by_stats(state_node_tree, osd_stats);
}
// Check that pg_size <= max_pg_size
{
auto failure_domain = cfg["failure_domain"].string_value() == ""
? "host" : cfg["failure_domain"].string_value();
uint64_t max_pg_size = get_max_pg_size(state_node_tree["nodes"].object_items(),
failure_domain, cfg["root_node"].string_value());
if (cfg["pg_size"].uint64_value() > max_pg_size)
{
result = (cli_result_t){
.err = EINVAL,
.text =
"There are "+std::to_string(max_pg_size)+" \""+failure_domain+"\" failure domains with OSDs matching tags and"
" block_size/bitmap_granularity/immediate_commit parameters, but you want to create a"
" pool with "+cfg["pg_size"].as_string()+" OSDs from different failure domains in a PG."
" Change parameters or add --force if you want to create a degraded pool and add OSDs later."
};
state = 100;
return;
}
}
}
// Create pool
state = 5;
resume_5:
// Get pools from etcd
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pools") },
} }
},
} },
});
state = 6;
resume_6:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
{
// Add new pool
auto kv = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][0]["response_range"]["kvs"][0]);
new_pools = create_pool(kv);
if (new_pools.is_string())
{
result = (cli_result_t){ .err = EEXIST, .text = new_pools.string_value() };
state = 100;
return;
}
new_pools_mod_rev = kv.mod_revision;
}
// Update pools in etcd
parent->etcd_txn(json11::Json::object {
{ "compare", json11::Json::array {
json11::Json::object {
{ "target", "MOD" },
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pools") },
{ "result", "LESS" },
{ "mod_revision", new_pools_mod_rev+1 },
}
} },
{ "success", json11::Json::array {
json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/pools") },
{ "value", base64_encode(new_pools.dump()) },
} },
},
} },
});
state = 7;
resume_7:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
// Perform final create-check
create_check.interval = parent->cli->config["mon_change_timeout"].uint64_value();
if (!create_check.interval)
create_check.interval = 1000;
state = 8;
resume_8:
if (parent->waiting > 0)
return;
// Without --wait, skip the check; otherwise wait for the pool to be created and become active
if (!wait)
{
create_check.passed = true;
}
else if (create_check.retries)
{
create_check.retries--;
parent->waiting++;
parent->epmgr->tfd->set_timer(create_check.interval, false, [this](int timer_id)
{
if (parent->cli->st_cli.pool_config.find(new_id) != parent->cli->st_cli.pool_config.end())
{
auto & pool_cfg = parent->cli->st_cli.pool_config[new_id];
create_check.passed = pool_cfg.real_pg_count > 0;
for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
{
if (!(pg_it->second.cur_state & PG_ACTIVE))
{
create_check.passed = false;
break;
}
}
if (create_check.passed)
create_check.retries = 0;
}
parent->waiting--;
parent->ringloop->wakeup();
});
return;
}
if (!create_check.passed)
{
result = (cli_result_t) {
.err = EAGAIN,
.text = "Pool "+cfg["name"].string_value()+" was created, but failed to become active."
" This may indicate that cluster state has changed while the pool was being created."
" Please check the current state and adjust the pool configuration if necessary.",
};
}
else
{
result = (cli_result_t){
.err = 0,
.text = "Pool "+cfg["name"].string_value()+" created",
.data = new_pools[std::to_string(new_id)],
};
}
state = 100;
}
// Returns a JSON object of form {"nodes": {...}, "osds": [...]} that
// contains: all nodes (osds, hosts, ...) based on node_placement config
// and current peer state, and a list of active peer osds.
json11::Json get_state_node_tree(json11::Json::object node_placement)
{
// Erase non-peer osd nodes from node_placement
for (auto np_it = node_placement.begin(); np_it != node_placement.end();)
{
// Numeric nodes are osds
osd_num_t osd_num = stoull_full(np_it->first);
// If node is osd and it is not in peer states, erase it
if (osd_num > 0 &&
parent->cli->st_cli.peer_states.find(osd_num) == parent->cli->st_cli.peer_states.end())
{
node_placement.erase(np_it++);
}
else
np_it++;
}
// List of peer osds
std::vector<std::string> peer_osds;
// Record peer osds and add missing osds/hosts to np
for (auto & ps: parent->cli->st_cli.peer_states)
{
std::string osd_num = std::to_string(ps.first);
// Record peer osd
peer_osds.push_back(osd_num);
// Add osd, if necessary
if (node_placement.find(osd_num) == node_placement.end())
{
std::string osd_host = ps.second["host"].as_string();
// Add host, if necessary
if (node_placement.find(osd_host) == node_placement.end())
{
node_placement[osd_host] = json11::Json::object {
{ "level", "host" }
};
}
node_placement[osd_num] = json11::Json::object {
{ "parent", osd_host }
};
}
}
return json11::Json::object { { "osds", peer_osds }, { "nodes", node_placement } };
}
// Returns new state_node_tree based on given state_node_tree with osds
// filtered out by tags in given osd_configs and current pool config.
// Requires: state_node_tree["osds"] must match osd_configs 1-1
json11::Json filter_state_node_tree_by_tags(const json11::Json & state_node_tree, std::vector<json11::Json> & osd_configs)
{
auto & osds = state_node_tree["osds"].array_items();
// Accepted state_node_tree nodes
auto accepted_nodes = state_node_tree["nodes"].object_items();
// List of accepted osds
std::vector<std::string> accepted_osds;
for (size_t i = 0; i < osd_configs.size(); i++)
{
auto & oc = osd_configs[i].object_items();
// Get osd number
auto osd_num = osds[i].as_string();
// We need tags in config to check against pool tags
if (oc.find("tags") == oc.end())
{
// Exclude osd from state_node_tree nodes
accepted_nodes.erase(osd_num);
continue;
}
else
{
// If all pool tags are in osd tags, accept osd
if (all_in_tags(osd_configs[i]["tags"], cfg["osd_tags"]))
{
accepted_osds.push_back(osd_num);
}
// Otherwise, exclude osd
else
{
// Exclude osd from state_node_tree nodes
accepted_nodes.erase(osd_num);
}
}
}
return json11::Json::object { { "osds", accepted_osds }, { "nodes", accepted_nodes } };
}
// Returns new state_node_tree based on given state_node_tree with osds
// filtered out by stats parameters (block_size, bitmap_granularity) in
// given osd_stats and current pool config.
// Requires: state_node_tree["osds"] must match osd_stats 1-1
json11::Json filter_state_node_tree_by_stats(const json11::Json & state_node_tree, std::vector<json11::Json> & osd_stats)
{
auto & osds = state_node_tree["osds"].array_items();
// Accepted state_node_tree nodes
auto accepted_nodes = state_node_tree["nodes"].object_items();
// List of accepted osds
std::vector<std::string> accepted_osds;
uint64_t p_block_size = cfg["block_size"].uint64_value()
? cfg["block_size"].uint64_value()
: parent->cli->st_cli.global_block_size;
uint64_t p_bitmap_granularity = cfg["bitmap_granularity"].uint64_value()
? cfg["bitmap_granularity"].uint64_value()
: parent->cli->st_cli.global_bitmap_granularity;
uint32_t p_immediate_commit = cfg["immediate_commit"].is_string()
? etcd_state_client_t::parse_immediate_commit(cfg["immediate_commit"].string_value())
: parent->cli->st_cli.global_immediate_commit;
for (size_t i = 0; i < osd_stats.size(); i++)
{
auto & os = osd_stats[i];
// Get osd number
auto osd_num = osds[i].as_string();
if (!os["data_block_size"].is_null() && os["data_block_size"] != p_block_size ||
!os["bitmap_granularity"].is_null() && os["bitmap_granularity"] != p_bitmap_granularity ||
!os["immediate_commit"].is_null() &&
etcd_state_client_t::parse_immediate_commit(os["immediate_commit"].string_value()) < p_immediate_commit)
{
accepted_nodes.erase(osd_num);
}
else
{
accepted_osds.push_back(osd_num);
}
}
return json11::Json::object { { "osds", accepted_osds }, { "nodes", accepted_nodes } };
}
// Returns maximum pg_size possible for given node_tree and failure_domain, starting at parent_node
uint64_t get_max_pg_size(json11::Json::object node_tree, const std::string & level, const std::string & parent_node)
{
uint64_t max_pg_sz = 0;
std::vector<std::string> nodes;
// Check if parent node is an osd (numeric)
if (parent_node != "" && stoull_full(parent_node))
{
// Add it to node list if osd is in node tree
if (node_tree.find(parent_node) != node_tree.end())
nodes.push_back(parent_node);
}
// If parent node given, ...
else if (parent_node != "")
{
// ... look for children nodes of this parent
for (auto & sn: node_tree)
{
auto & props = sn.second.object_items();
auto parent_prop = props.find("parent");
if (parent_prop != props.end() && (parent_prop->second.as_string() == parent_node))
{
nodes.push_back(sn.first);
// If we're not looking for all osds, we only need a single
// child osd node
if (level != "osd" && stoull_full(sn.first))
break;
}
}
}
// No parent node given, and we're not looking for all osds
else if (level != "osd")
{
// ... look for all level nodes
for (auto & sn: node_tree)
{
auto & props = sn.second.object_items();
auto level_prop = props.find("level");
if (level_prop != props.end() && (level_prop->second.as_string() == level))
{
nodes.push_back(sn.first);
}
}
}
// Otherwise, ...
else
{
// ... we're looking for osd nodes only
for (auto & sn: node_tree)
{
if (stoull_full(sn.first))
{
nodes.push_back(sn.first);
}
}
}
// Process gathered nodes
for (auto & node: nodes)
{
// An OSD node contributes exactly 1 to the maximum PG size
if (stoull_full(node))
{
max_pg_sz += 1;
}
// Otherwise, ...
else
{
// ... exclude parent node from tree, and ...
node_tree.erase(parent_node);
// ... descend onto the resulting tree
max_pg_sz += get_max_pg_size(node_tree, level, node);
}
}
return max_pg_sz;
}
json11::Json create_pool(const etcd_kv_t & kv)
{
for (auto & p: kv.value.object_items())
{
// ID
uint64_t pool_id = stoull_full(p.first);
new_id = std::max(pool_id+1, new_id);
// Name
if (p.second["name"].string_value() == cfg["name"].string_value())
{
return "Pool with name \""+cfg["name"].string_value()+"\" already exists (ID "+std::to_string(pool_id)+")";
}
}
auto res = kv.value.object_items();
res[std::to_string(new_id)] = cfg;
return res;
}
// Checks whether all tags from tags2 are present in tags1
bool all_in_tags(json11::Json tags1, json11::Json tags2)
{
if (!tags2.is_array())
{
tags2 = json11::Json::array{ tags2.string_value() };
}
if (!tags1.is_array())
{
tags1 = json11::Json::array{ tags1.string_value() };
}
for (auto & tag2: tags2.array_items())
{
bool found = false;
for (auto & tag1: tags1.array_items())
{
if (tag1 == tag2)
{
found = true;
break;
}
}
if (!found)
{
return false;
}
}
return true;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_pool_create(json11::Json cfg)
{
auto pool_creator = new pool_creator_t();
pool_creator->parent = this;
pool_creator->cfg = cfg;
pool_creator->force = cfg["force"].bool_value();
pool_creator->wait = cfg["wait"].bool_value();
return [pool_creator](cli_result_t & result)
{
pool_creator->loop();
if (pool_creator->is_done())
{
result = pool_creator->result;
delete pool_creator;
return true;
}
return false;
};
}
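get_max_pg_size() above effectively counts how many distinct failure domains still contain at least one OSD that passed the tag and stats filters; the pool's pg_size must not exceed that number. A simplified illustration of the counting rule for the default "host" failure domain (host names and OSD numbers are made up):
#include <cstdio>
#include <map>
#include <set>
#include <string>
int main()
{
    // Illustrative tree: 4 accepted OSDs placed on 3 hosts.
    // With failure_domain = "host" a PG may take at most one OSD per host,
    // so the maximum pg_size is 3; with failure_domain = "osd" it would be 4.
    std::map<std::string, std::string> osd_parent = {
        { "1", "host1" }, { "2", "host1" }, { "3", "host2" }, { "4", "host3" },
    };
    std::set<std::string> hosts_with_osds;
    for (auto & kv: osd_parent)
        hosts_with_osds.insert(kv.second);
    printf("max pg_size (failure_domain=host): %zu\n", hosts_with_osds.size());
    printf("max pg_size (failure_domain=osd): %zu\n", osd_parent.size());
    return 0;
}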

src/cli_pool_ls.cpp (new file, 723 lines)
View File

@ -0,0 +1,723 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <algorithm>
#include "cli.h"
#include "cluster_client.h"
#include "str_util.h"
#include "pg_states.h"
// List pools with space statistics
// - df - minimal list with % used space
// - pool-ls - same but with PG state and recovery %
// - pool-ls -l - same but also include I/O statistics
// - pool-ls --detail - use list format, include PG states, I/O stats and all pool parameters
struct pool_lister_t
{
cli_tool_t *parent;
std::string sort_field;
std::set<std::string> only_names;
bool reverse = false;
int max_count = 0;
bool show_recovery = false;
bool show_stats = false;
bool detailed = false;
int state = 0;
cli_result_t result;
std::map<pool_id_t, json11::Json::object> pool_stats;
struct io_stats_t
{
uint64_t count = 0;
uint64_t read_iops = 0;
uint64_t read_bps = 0;
uint64_t read_lat = 0;
uint64_t write_iops = 0;
uint64_t write_bps = 0;
uint64_t write_lat = 0;
uint64_t delete_iops = 0;
uint64_t delete_bps = 0;
uint64_t delete_lat = 0;
};
struct object_counts_t
{
uint64_t object_count = 0;
uint64_t misplaced_count = 0;
uint64_t degraded_count = 0;
uint64_t incomplete_count = 0;
};
bool is_done()
{
return state == 100;
}
void get_pool_stats(int base_state)
{
if (state == base_state+1)
goto resume_1;
// Space statistics - pool/stats/<pool>
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pool/stats/"
) },
{ "range_end", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pool/stats0"
) },
} },
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/osd/stats/"
) },
{ "range_end", base64_encode(
parent->cli->st_cli.etcd_prefix+"/osd/stats0"
) },
} },
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/config/pools"
) },
} },
},
} },
});
state = base_state+1;
resume_1:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
auto space_info = parent->etcd_result;
auto config_pools = space_info["responses"][2]["response_range"]["kvs"][0];
if (!config_pools.is_null())
{
config_pools = parent->cli->st_cli.parse_etcd_kv(config_pools).value;
}
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// pool ID
pool_id_t pool_id;
char null_byte = 0;
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/pool/stats/%u%c", &pool_id, &null_byte);
if (scanned != 1 || !pool_id || pool_id >= POOL_ID_MAX)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
// pool/stats/<N>
pool_stats[pool_id] = kv.value.object_items();
}
std::map<pool_id_t, uint64_t> osd_free;
for (auto & kv_item: space_info["responses"][1]["response_range"]["kvs"].array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// osd ID
osd_num_t osd_num;
char null_byte = 0;
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%ju%c", &osd_num, &null_byte);
if (scanned != 1 || !osd_num || osd_num >= POOL_ID_MAX)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
// osd/stats/<N>::free
osd_free[osd_num] = kv.value["free"].uint64_value();
}
// Calculate max_avail for each pool
for (auto & pp: parent->cli->st_cli.pool_config)
{
auto & pool_cfg = pp.second;
uint64_t pool_avail = UINT64_MAX;
std::map<osd_num_t, uint64_t> pg_per_osd;
bool active = pool_cfg.real_pg_count > 0;
uint64_t pg_states = 0;
for (auto & pgp: pool_cfg.pg_config)
{
if (!(pgp.second.cur_state & PG_ACTIVE))
{
active = false;
}
pg_states |= pgp.second.cur_state;
for (auto pg_osd: pgp.second.target_set)
{
if (pg_osd != 0)
{
pg_per_osd[pg_osd]++;
}
}
}
for (auto pg_per_pair: pg_per_osd)
{
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
if (pool_avail > pg_free)
{
pool_avail = pg_free;
}
}
if (pool_avail == UINT64_MAX)
{
pool_avail = 0;
}
if (pool_cfg.scheme != POOL_SCHEME_REPLICATED)
{
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
}
// incomplete > has_incomplete > degraded > has_degraded > has_misplaced
std::string status;
if (!active)
status = "inactive";
else if (pg_states & PG_INCOMPLETE)
status = "incomplete";
else if (pg_states & PG_HAS_INCOMPLETE)
status = "has_incomplete";
else if (pg_states & PG_DEGRADED)
status = "degraded";
else if (pg_states & PG_HAS_DEGRADED)
status = "has_degraded";
else if (pg_states & PG_HAS_MISPLACED)
status = "has_misplaced";
else
status = "active";
pool_stats[pool_cfg.id] = json11::Json::object {
{ "id", (uint64_t)pool_cfg.id },
{ "name", pool_cfg.name },
{ "status", status },
{ "pg_count", pool_cfg.pg_count },
{ "real_pg_count", pool_cfg.real_pg_count },
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
: "EC "+std::to_string(pool_cfg.pg_size-pool_cfg.parity_chunks)+"+"+std::to_string(pool_cfg.parity_chunks) },
{ "used_raw", (uint64_t)(pool_stats[pool_cfg.id]["used_raw_tb"].number_value() * ((uint64_t)1<<40)) },
{ "total_raw", (uint64_t)(pool_stats[pool_cfg.id]["total_raw_tb"].number_value() * ((uint64_t)1<<40)) },
{ "max_available", pool_avail },
{ "raw_to_usable", pool_stats[pool_cfg.id]["raw_to_usable"].number_value() },
{ "space_efficiency", pool_stats[pool_cfg.id]["space_efficiency"].number_value() },
{ "pg_real_size", pool_stats[pool_cfg.id]["pg_real_size"].uint64_value() },
{ "osd_count", (uint64_t)pg_per_osd.size() },
};
}
// Include full pool config
for (auto & pp: config_pools.object_items())
{
if (!pp.second.is_object())
{
continue;
}
auto pool_id = stoull_full(pp.first);
auto & st = pool_stats[pool_id];
for (auto & kv: pp.second.object_items())
{
if (st.find(kv.first) == st.end())
st[kv.first] = kv.second;
}
}
}
void get_pg_stats(int base_state)
{
if (state == base_state+1)
goto resume_1;
// PG statistics - pg/stats/<pool>/<pg>
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pg/stats/"
) },
{ "range_end", base64_encode(
parent->cli->st_cli.etcd_prefix+"/pg/stats0"
) },
} },
},
} },
});
state = base_state+1;
resume_1:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
auto pg_stats = parent->etcd_result["responses"][0]["response_range"]["kvs"];
// Calculate recovery percent
std::map<pool_id_t, object_counts_t> counts;
for (auto & kv_item: pg_stats.array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// pool ID & pg number
pool_id_t pool_id;
pg_num_t pg_num = 0;
char null_byte = 0;
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
"/pg/stats/%u/%u%c", &pool_id, &pg_num, &null_byte);
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
auto & cnt = counts[pool_id];
cnt.object_count += kv.value["object_count"].uint64_value();
cnt.misplaced_count += kv.value["misplaced_count"].uint64_value();
cnt.degraded_count += kv.value["degraded_count"].uint64_value();
cnt.incomplete_count += kv.value["incomplete_count"].uint64_value();
}
for (auto & pp: pool_stats)
{
auto & cnt = counts[pp.first];
auto & st = pp.second;
st["object_count"] = cnt.object_count;
st["misplaced_count"] = cnt.misplaced_count;
st["degraded_count"] = cnt.degraded_count;
st["incomplete_count"] = cnt.incomplete_count;
}
}
void get_inode_stats(int base_state)
{
if (state == base_state+1)
goto resume_1;
// Inode I/O statistics - inode/stats/<pool>/<inode>
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(
parent->cli->st_cli.etcd_prefix+"/inode/stats/"
) },
{ "range_end", base64_encode(
parent->cli->st_cli.etcd_prefix+"/inode/stats0"
) },
} },
},
} },
});
state = base_state+1;
resume_1:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
auto inode_stats = parent->etcd_result["responses"][0]["response_range"]["kvs"];
// Performance statistics
std::map<pool_id_t, io_stats_t> pool_io;
for (auto & kv_item: inode_stats.array_items())
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// pool ID & inode number
pool_id_t pool_id;
inode_t only_inode_num;
char null_byte = 0;
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
"/inode/stats/%u/%ju%c", &pool_id, &only_inode_num, &null_byte);
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || INODE_POOL(only_inode_num) != 0)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
auto & io = pool_io[pool_id];
io.read_iops += kv.value["read"]["iops"].uint64_value();
io.read_bps += kv.value["read"]["bps"].uint64_value();
io.read_lat += kv.value["read"]["lat"].uint64_value();
io.write_iops += kv.value["write"]["iops"].uint64_value();
io.write_bps += kv.value["write"]["bps"].uint64_value();
io.write_lat += kv.value["write"]["lat"].uint64_value();
io.delete_iops += kv.value["delete"]["iops"].uint64_value();
io.delete_bps += kv.value["delete"]["bps"].uint64_value();
io.delete_lat += kv.value["delete"]["lat"].uint64_value();
io.count++;
}
for (auto & pp: pool_stats)
{
auto & io = pool_io[pp.first];
if (io.count > 0)
{
io.read_lat /= io.count;
io.write_lat /= io.count;
io.delete_lat /= io.count;
}
auto & st = pp.second;
st["read_iops"] = io.read_iops;
st["read_bps"] = io.read_bps;
st["read_lat"] = io.read_lat;
st["write_iops"] = io.write_iops;
st["write_bps"] = io.write_bps;
st["write_lat"] = io.write_lat;
st["delete_iops"] = io.delete_iops;
st["delete_bps"] = io.delete_bps;
st["delete_lat"] = io.delete_lat;
}
}
json11::Json::array to_list()
{
json11::Json::array list;
for (auto & kv: pool_stats)
{
if (!only_names.size())
{
list.push_back(kv.second);
}
else
{
for (auto glob: only_names)
{
if (stupid_glob(kv.second["name"].string_value(), glob))
{
list.push_back(kv.second);
break;
}
}
}
}
if (sort_field == "name" || sort_field == "scheme" ||
sort_field == "scheme_name" || sort_field == "status")
{
std::sort(list.begin(), list.end(), [this](json11::Json a, json11::Json b)
{
auto av = a[sort_field].as_string();
auto bv = b[sort_field].as_string();
return reverse ? av > bv : av < bv;
});
}
else
{
std::sort(list.begin(), list.end(), [this](json11::Json a, json11::Json b)
{
auto av = a[sort_field].number_value();
auto bv = b[sort_field].number_value();
return reverse ? av > bv : av < bv;
});
}
if (max_count > 0 && list.size() > max_count)
{
list.resize(max_count);
}
return list;
}
void loop()
{
if (state == 1)
goto resume_1;
if (state == 2)
goto resume_2;
if (state == 3)
goto resume_3;
if (state == 100)
return;
show_stats = show_stats || detailed;
show_recovery = show_recovery || detailed;
resume_1:
get_pool_stats(0);
if (parent->waiting > 0)
return;
if (show_stats)
{
resume_2:
get_inode_stats(1);
if (parent->waiting > 0)
return;
}
if (show_recovery)
{
resume_3:
get_pg_stats(2);
if (parent->waiting > 0)
return;
}
if (parent->json_output)
{
// JSON output
result.data = to_list();
state = 100;
return;
}
json11::Json::array list;
for (auto & kv: pool_stats)
{
auto & st = kv.second;
double raw_to = st["raw_to_usable"].number_value();
if (raw_to < 0.000001 && raw_to > -0.000001)
raw_to = 1;
st["pg_count_fmt"] = st["real_pg_count"] == st["pg_count"]
? st["real_pg_count"].as_string()
: st["real_pg_count"].as_string()+"->"+st["pg_count"].as_string();
st["total_fmt"] = format_size(st["total_raw"].uint64_value() / raw_to);
st["used_fmt"] = format_size(st["used_raw"].uint64_value() / raw_to);
st["max_avail_fmt"] = format_size(st["max_available"].uint64_value());
st["used_pct"] = format_q(st["total_raw"].uint64_value()
? (100 - 100*st["max_available"].uint64_value() *
st["raw_to_usable"].number_value() / st["total_raw"].uint64_value())
: 100)+"%";
st["eff_fmt"] = format_q(st["space_efficiency"].number_value()*100)+"%";
if (show_stats)
{
st["read_bw"] = format_size(st["read_bps"].uint64_value())+"/s";
st["write_bw"] = format_size(st["write_bps"].uint64_value())+"/s";
st["delete_bw"] = format_size(st["delete_bps"].uint64_value())+"/s";
st["read_iops"] = format_q(st["read_iops"].number_value());
st["write_iops"] = format_q(st["write_iops"].number_value());
st["delete_iops"] = format_q(st["delete_iops"].number_value());
st["read_lat_f"] = format_lat(st["read_lat"].uint64_value());
st["write_lat_f"] = format_lat(st["write_lat"].uint64_value());
st["delete_lat_f"] = format_lat(st["delete_lat"].uint64_value());
}
if (show_recovery)
{
auto object_count = st["object_count"].uint64_value();
auto recovery_pct = 100.0 * (object_count - (st["misplaced_count"].uint64_value() +
st["degraded_count"].uint64_value() + st["incomplete_count"].uint64_value())) /
(object_count ? object_count : 1);
st["recovery_fmt"] = format_q(recovery_pct)+"%";
}
}
if (detailed)
{
for (auto & kv: pool_stats)
{
auto & st = kv.second;
auto total = st["object_count"].uint64_value();
auto obj_size = st["block_size"].uint64_value();
if (!obj_size)
obj_size = parent->cli->st_cli.global_block_size;
if (st["scheme"] == "ec")
obj_size *= st["pg_size"].uint64_value() - st["parity_chunks"].uint64_value();
else if (st["scheme"] == "xor")
obj_size *= st["pg_size"].uint64_value() - 1;
auto n = st["misplaced_count"].uint64_value();
if (n > 0)
st["misplaced_fmt"] = format_size(n * obj_size) + " / " + format_q(100.0 * n / total);
n = st["degraded_count"].uint64_value();
if (n > 0)
st["degraded_fmt"] = format_size(n * obj_size) + " / " + format_q(100.0 * n / total);
n = st["incomplete_count"].uint64_value();
if (n > 0)
st["incomplete_fmt"] = format_size(n * obj_size) + " / " + format_q(100.0 * n / total);
st["read_fmt"] = st["read_bw"].string_value()+", "+st["read_iops"].string_value()+" op/s, "+
st["read_lat_f"].string_value()+" lat";
st["write_fmt"] = st["write_bw"].string_value()+", "+st["write_iops"].string_value()+" op/s, "+
st["write_lat_f"].string_value()+" lat";
st["delete_fmt"] = st["delete_bw"].string_value()+", "+st["delete_iops"].string_value()+" op/s, "+
st["delete_lat_f"].string_value()+" lat";
if (st["scheme"] == "replicated")
st["scheme_name"] = "x"+st["pg_size"].as_string();
if (st["failure_domain"].string_value() == "")
st["failure_domain"] = "host";
st["osd_tags_fmt"] = implode(", ", st["osd_tags"]);
st["primary_affinity_tags_fmt"] = implode(", ", st["primary_affinity_tags"]);
if (st["block_size"].uint64_value())
st["block_size_fmt"] = format_size(st["block_size"].uint64_value());
if (st["bitmap_granularity"].uint64_value())
st["bitmap_granularity_fmt"] = format_size(st["bitmap_granularity"].uint64_value());
}
// All pool parameters are only displayed in the "detailed" mode
// because there are too many of them to fit in a table
auto cols = std::vector<std::pair<std::string, std::string>>{
{ "name", "Name" },
{ "id", "ID" },
{ "scheme_name", "Scheme" },
{ "used_for_fs", "Used for VitastorFS" },
{ "status", "Status" },
{ "pg_count_fmt", "PGs" },
{ "pg_minsize", "PG minsize" },
{ "failure_domain", "Failure domain" },
{ "root_node", "Root node" },
{ "osd_tags_fmt", "OSD tags" },
{ "primary_affinity_tags_fmt", "Primary affinity" },
{ "block_size_fmt", "Block size" },
{ "bitmap_granularity_fmt", "Bitmap granularity" },
{ "immediate_commit", "Immediate commit" },
{ "scrub_interval", "Scrub interval" },
{ "inode_stats_fmt", "Per-inode stats" },
{ "pg_stripe_size", "PG stripe size" },
{ "max_osd_combinations", "Max OSD combinations" },
{ "total_fmt", "Total" },
{ "used_fmt", "Used" },
{ "max_avail_fmt", "Available" },
{ "used_pct", "Used%" },
{ "eff_fmt", "Efficiency" },
{ "osd_count", "OSD count" },
{ "misplaced_fmt", "Misplaced" },
{ "degraded_fmt", "Degraded" },
{ "incomplete_fmt", "Incomplete" },
{ "read_fmt", "Read" },
{ "write_fmt", "Write" },
{ "delete_fmt", "Delete" },
};
auto list = to_list();
size_t title_len = 0;
for (auto & item: list)
{
title_len = print_detail_title_len(item, cols, title_len);
}
for (auto & item: list)
{
if (result.text != "")
result.text += "\n";
result.text += print_detail(item, cols, title_len, parent->color);
}
state = 100;
return;
}
// Table output: name, scheme_name, pg_count, total, used, max_avail, used%, efficiency
json11::Json::array cols;
cols.push_back(json11::Json::object{
{ "key", "name" },
{ "title", "NAME" },
});
cols.push_back(json11::Json::object{
{ "key", "scheme_name" },
{ "title", "SCHEME" },
});
cols.push_back(json11::Json::object{
{ "key", "status" },
{ "title", "STATUS" },
});
cols.push_back(json11::Json::object{
{ "key", "pg_count_fmt" },
{ "title", "PGS" },
});
cols.push_back(json11::Json::object{
{ "key", "total_fmt" },
{ "title", "TOTAL" },
});
cols.push_back(json11::Json::object{
{ "key", "used_fmt" },
{ "title", "USED" },
});
cols.push_back(json11::Json::object{
{ "key", "max_avail_fmt" },
{ "title", "AVAILABLE" },
});
cols.push_back(json11::Json::object{
{ "key", "used_pct" },
{ "title", "USED%" },
});
cols.push_back(json11::Json::object{
{ "key", "eff_fmt" },
{ "title", "EFFICIENCY" },
});
if (show_recovery)
{
cols.push_back(json11::Json::object{ { "key", "recovery_fmt" }, { "title", "RECOVERY" } });
}
if (show_stats)
{
cols.push_back(json11::Json::object{ { "key", "read_bw" }, { "title", "READ" } });
cols.push_back(json11::Json::object{ { "key", "read_iops" }, { "title", "IOPS" } });
cols.push_back(json11::Json::object{ { "key", "read_lat_f" }, { "title", "LAT" } });
cols.push_back(json11::Json::object{ { "key", "write_bw" }, { "title", "WRITE" } });
cols.push_back(json11::Json::object{ { "key", "write_iops" }, { "title", "IOPS" } });
cols.push_back(json11::Json::object{ { "key", "write_lat_f" }, { "title", "LAT" } });
cols.push_back(json11::Json::object{ { "key", "delete_bw" }, { "title", "DELETE" } });
cols.push_back(json11::Json::object{ { "key", "delete_iops" }, { "title", "IOPS" } });
cols.push_back(json11::Json::object{ { "key", "delete_lat_f" }, { "title", "LAT" } });
}
result.data = to_list();
result.text = print_table(result.data, cols, parent->color);
state = 100;
}
};
size_t print_detail_title_len(json11::Json item, std::vector<std::pair<std::string, std::string>> names, size_t prev_len)
{
size_t title_len = prev_len;
for (auto & kv: names)
{
if (!item[kv.first].is_null() && (!item[kv.first].is_string() || item[kv.first].string_value() != ""))
{
size_t len = utf8_length(kv.second);
title_len = title_len < len ? len : title_len;
}
}
return title_len;
}
std::string print_detail(json11::Json item, std::vector<std::pair<std::string, std::string>> names, size_t title_len, bool use_esc)
{
std::string str;
for (auto & kv: names)
{
if (!item[kv.first].is_null() && (!item[kv.first].is_string() || item[kv.first].string_value() != ""))
{
str += kv.second;
str += ": ";
size_t len = utf8_length(kv.second);
for (int j = 0; j < title_len-len; j++)
str += ' ';
if (use_esc)
str += "\033[1m";
str += item[kv.first].as_string();
if (use_esc)
str += "\033[0m";
str += "\n";
}
}
return str;
}
std::function<bool(cli_result_t &)> cli_tool_t::start_pool_ls(json11::Json cfg)
{
auto lister = new pool_lister_t();
lister->parent = this;
lister->show_recovery = cfg["show_recovery"].bool_value();
lister->show_stats = cfg["long"].bool_value();
lister->detailed = cfg["detail"].bool_value();
lister->sort_field = cfg["sort"].string_value();
if ((lister->sort_field == "osd_tags") ||
(lister->sort_field == "primary_affinity_tags" ))
lister->sort_field = lister->sort_field + "_fmt";
lister->reverse = cfg["reverse"].bool_value();
lister->max_count = cfg["count"].uint64_value();
for (auto & item: cfg["names"].array_items())
{
lister->only_names.insert(item.string_value());
}
return [lister](cli_result_t & result)
{
lister->loop();
if (lister->is_done())
{
result = lister->result;
delete lister;
return true;
}
return false;
};
}
std::string implode(const std::string & sep, json11::Json array)
{
if (array.is_number() || array.is_bool() || array.is_string())
{
return array.as_string();
}
std::string res;
bool first = true;
for (auto & item: array.array_items())
{
res += (first ? item.as_string() : sep+item.as_string());
first = false;
}
return res;
}
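The USED% column computed above converts max_available (which is in usable bytes) back to raw bytes via raw_to_usable before relating it to total_raw. A short sketch with illustrative numbers only:
#include <cstdio>
int main()
{
    // Illustrative: a 3-replica pool (raw_to_usable = 3) with 3 TiB of raw
    // space and 0.5 TiB of usable space still available.
    double total_raw = 3.0 * (1ull<<40);
    double max_available = 0.5 * (1ull<<40);
    double raw_to_usable = 3.0;
    double used_pct = total_raw
        ? 100 - 100 * max_available * raw_to_usable / total_raw
        : 100;
    printf("USED%% = %.1f%%\n", used_pct); // prints USED% = 50.0%
    return 0;
}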

Some files were not shown because too many files have changed in this diff.