Compare commits

...

60 Commits

Author SHA1 Message Date
Vitaliy Filippov 5a8f80159f Add bindiff for tests
Test / test_rebalance_verify_ec (push) Successful in 1m45s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m49s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_osd_tags (push) Successful in 10s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc_imm (push) Successful in 9s Details
Test / test_enospc_xor (push) Successful in 14s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 15s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 17s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 15s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_write_no_same (push) Successful in 10s Details
Test / test_heal_csum_32k (push) Successful in 2m25s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m27s Details
Test / test_heal_pg_size_2 (push) Successful in 2m18s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s Details
Test / test_resize (push) Successful in 14s Details
Test / test_heal_csum_4k (push) Successful in 2m33s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m33s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m49s Details
2024-11-15 02:21:37 +03:00
Vitaliy Filippov 8202ee9d74 Trigger double autosync when switching PG state to prevent leaving garbage in non-immediate_commit clusters
Test / test_dd (push) Successful in 12s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m39s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_write (push) Successful in 29s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write_xor (push) Successful in 33s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m22s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m20s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_resize (push) Successful in 12s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_imm_xor (push) Successful in 12s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 15s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m10s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s Details
2024-11-15 01:26:36 +03:00
Vitaliy Filippov 5864bd067c Add missing connection timeout for etcd websockets in OSD
Test / test_rebalance_verify_ec (push) Successful in 1m43s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m44s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m14s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m16s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_resize (push) Successful in 14s Details
Test / test_osd_tags (push) Successful in 9s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 14s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 13s Details
Test / test_heal_csum_4k (push) Successful in 2m11s Details
2024-11-12 02:28:07 +03:00
Vitaliy Filippov c312557ace Do not execute remaining operations if the client is stopped during read
Test / test_root_node (push) Successful in 10s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m46s Details
Test / test_write_no_same (push) Successful in 10s Details
Test / test_write (push) Successful in 31s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s Details
Test / test_heal_ec (push) Successful in 2m28s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m15s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_resize (push) Successful in 13s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub (push) Successful in 15s Details
Test / test_scrub_zero_osd_2 (push) Successful in 16s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 17s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m20s Details
2024-11-10 16:44:13 +03:00
Vitaliy Filippov 5ce20116d8 Postpone trigger_nearest to prevent timer callbacks called from setTimer/clearTimer
Test / test_root_node (push) Successful in 9s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m48s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_write (push) Successful in 31s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m19s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m16s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_resize (push) Successful in 12s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub (push) Successful in 15s Details
Test / test_scrub_zero_osd_2 (push) Successful in 15s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 10s Details
Test / test_heal_csum_4k (push) Successful in 2m18s Details
2024-11-10 15:51:16 +03:00
Vitaliy Filippov be66791e59 Add another note about 1.8 upgrade 2024-11-09 00:57:58 +03:00
Vitaliy Filippov 141cec2383 Add missing refcounting for flush_batch errors 2024-11-09 00:46:38 +03:00
Vitaliy Filippov 1ce4b1b417 Fix stop condition in osd_flush
Test / test_rebalance_verify_ec (push) Successful in 2m1s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m59s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_write (push) Successful in 32s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m17s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m22s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m19s Details
Test / test_heal_csum_32k (push) Successful in 2m20s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_resize (push) Successful in 13s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_enospc (push) Successful in 9s Details
Test / test_snapshot_pool2 (push) Successful in 13s Details
Test / test_enospc_xor (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 11s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m9s Details
The bug could probably lead to PGs hanging in peering states on OSD restart in EC pools,
fixable by restarting the primary OSD
2024-11-08 00:30:40 +03:00
Vitaliy Filippov ebf24bac9a Fix partition zeroing during prepare
Test / test_root_node (push) Successful in 10s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m46s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m16s Details
Test / test_heal_ec (push) Successful in 2m15s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m18s Details
Test / test_heal_csum_32k (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_resize (push) Successful in 13s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 10s Details
Test / test_heal_csum_4k (push) Successful in 2m12s Details
Previously it zeroed the area beginning at offset 0 instead of the actual metadata offset,
which led to non-zeroed metadata when the disk was very small
2024-11-08 00:14:37 +03:00
Vitaliy Filippov edd9051f81 Fix arch.en toc 2024-11-08 00:14:18 +03:00
Vitaliy Filippov 662ca86dc0 Fix libvirt 8 patch 2024-11-07 12:21:32 +03:00
Vitaliy Filippov a1ca573168 Support QEMU 9.1
Test / test_rebalance_verify_ec (push) Successful in 1m42s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m44s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 33s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m15s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m13s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m20s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m21s Details
Test / test_resize (push) Successful in 12s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_osd_tags (push) Successful in 9s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 9s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub (push) Successful in 16s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 15s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m11s Details
2024-11-07 12:21:13 +03:00
Vitaliy Filippov f69f801ffb Release 1.9.3
Test / test_rebalance_verify_ec (push) Successful in 1m54s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m56s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 31s Details
Test / test_write_xor (push) Successful in 36s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m11s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m13s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_resize (push) Successful in 14s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 12s Details
Test / test_enospc_xor (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m17s Details
- Support custom hybrid OSD creation (`vitastor-disk prepare --hybrid --fast-devices /dev/xxx,/dev/yyy`)
- Auto-change partition paths to /dev/disk/by-partuuid/ in `vitastor-disk prepare`
- Allow selecting cached I/O in vitastor-disk commands
- Fix multiple bugs in vitastor-disk resize & add tests for them
- Fix vitastor-disk write-meta/write-journal in superblock-based mode writing it to an incorrect device
- Fix vitastor-disk prepare sometimes again not seeing new partitions
- Clean up PG history and stats of deleted pools
- Fix "is already mounted" checks in CSI
2024-11-07 01:28:31 +03:00
Vitaliy Filippov af92cbdfcc Dynamic device size in test
Test / test_rebalance_verify_ec (push) Successful in 1m52s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m53s Details
Test / test_write_no_same (push) Successful in 9s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m19s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m22s Details
Test / test_heal_csum_32k (push) Successful in 2m25s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m19s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_resize (push) Successful in 13s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 14s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub (push) Successful in 13s Details
Test / test_scrub_zero_osd_2 (push) Successful in 12s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m17s Details
2024-11-06 14:16:58 +03:00
Vitaliy Filippov a775db10cc Also allow cached I/O in dsk.open_*() in disk_tool
Test / test_rebalance_verify_ec (push) Successful in 1m52s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m53s Details
Test / test_write_no_same (push) Successful in 9s Details
Test / test_write (push) Successful in 30s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m16s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_resize_auto (push) Failing after 8s Details
Test / test_resize (push) Successful in 13s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub_xor (push) Successful in 12s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m8s Details
2024-11-06 13:52:25 +03:00
Vitaliy Filippov eafce26049 Add resize and resize-auto tests
Test / test_rebalance_verify_ec (push) Successful in 1m46s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m48s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m16s Details
Test / test_heal_ec (push) Successful in 2m18s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m21s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m13s Details
Test / test_heal_csum_32k (push) Successful in 2m20s Details
Test / test_resize (push) Failing after 12s Details
Test / test_resize_auto (push) Failing after 9s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m19s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc_xor (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 13s Details
Test / test_scrub_zero_osd_2 (push) Successful in 12s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m19s Details
2024-11-06 13:30:51 +03:00
Vitaliy Filippov 625c74294f Support direct I/O 2024-11-06 13:30:12 +03:00
Vitaliy Filippov ef8c21ad6f Change %lu to %ju 2024-11-06 02:58:51 +03:00
Vitaliy Filippov 2bb8e8999e Do not check length in "data alignment mismatch" 2024-11-06 02:58:26 +03:00
Vitaliy Filippov c2e7c28672 Fix calc_lengths data size recalc during auto-resize
Test / test_dd (push) Successful in 12s Details
Test / test_root_node (push) Successful in 8s Details
Test / test_rebalance_verify_ec (push) Successful in 1m58s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m55s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 31s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m14s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m26s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m19s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 15s Details
Test / test_scrub (push) Successful in 15s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 16s Details
Test / test_nfs (push) Successful in 10s Details
Test / test_heal_csum_4k (push) Successful in 2m13s Details
2024-11-06 02:27:17 +03:00
Vitaliy Filippov bd22beefb5 Auto-extend new_data_len if new_data_offset is changed too
Test / test_dd (push) Successful in 11s Details
Test / test_root_node (push) Successful in 7s Details
Test / test_rebalance_verify_ec (push) Successful in 1m48s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m49s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 31s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m14s Details
Test / test_heal_ec (push) Successful in 2m15s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m12s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m13s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 14s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m22s Details
2024-11-06 02:13:30 +03:00
Vitaliy Filippov e7038ab99c Auto-change partition paths to /dev/disk/by-partuuid/
Test / test_dd (push) Successful in 12s Details
Test / test_root_node (push) Successful in 8s Details
Test / test_rebalance_verify_ec (push) Successful in 1m54s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m57s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Failing after 2m18s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 13s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub (push) Successful in 13s Details
Test / test_scrub_zero_osd_2 (push) Successful in 12s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_3 (push) Successful in 13s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 15s Details
Test / test_nfs (push) Successful in 13s Details
Test / test_heal_csum_4k (push) Successful in 2m18s Details
2024-11-06 01:04:05 +03:00
Vitaliy Filippov b6f75ebcfd Add missing I/O path description in English 2024-11-06 00:43:17 +03:00
Vitaliy Filippov 9def199981 Auto-reduce new_data_len in resize
Test / test_dd (push) Successful in 11s Details
Test / test_root_node (push) Successful in 8s Details
Test / test_rebalance_verify_ec (push) Successful in 1m45s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m46s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 33s Details
Test / test_heal_pg_size_2 (push) Successful in 2m14s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m15s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m16s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 16s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 9s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 12s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 14s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 13s Details
Test / test_heal_csum_4k (push) Successful in 2m18s Details
2024-11-05 02:57:11 +03:00
Vitaliy Filippov c72e8e649e Support test mode for vitastor-disk
Test / test_dd (push) Successful in 11s Details
Test / test_rebalance_verify_ec (push) Successful in 1m51s Details
Test / test_root_node (push) Successful in 10s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m54s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 33s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m21s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m23s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m18s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm_xor (push) Successful in 12s Details
Test / test_scrub (push) Successful in 12s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 14s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m19s Details
2024-11-05 02:43:55 +03:00
Vitaliy Filippov 8bdb3e8786 Write meta/journal to correct device when used in superblock mode 2024-11-05 02:43:55 +03:00
Vitaliy Filippov a87e236c70 Fix resize --data-size, particularly when expanding the device
Test / test_root_node (push) Successful in 8s Details
Test / test_dd (push) Successful in 12s Details
Test / test_rebalance_verify_ec (push) Successful in 1m47s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m47s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_write (push) Successful in 30s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m16s Details
Test / test_heal_ec (push) Successful in 2m14s Details
Test / test_heal_antietcd (push) Successful in 2m15s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m15s Details
Test / test_heal_csum_32k_dj (push) Failing after 2m24s Details
Test / test_heal_csum_32k (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m19s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 9s Details
Test / test_enospc_imm (push) Successful in 9s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 12s Details
Test / test_scrub_zero_osd_2 (push) Successful in 11s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 14s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 14s Details
Test / test_scrub_ec (push) Successful in 16s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m17s Details
2024-11-04 18:55:03 +03:00
Vitaliy Filippov 16f67cf6f1 Fix missing metadata checksums after resize
Test / test_dd (push) Successful in 12s Details
Test / test_root_node (push) Successful in 8s Details
Test / test_rebalance_verify_ec (push) Successful in 1m47s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m50s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_write (push) Successful in 30s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write_xor (push) Successful in 33s Details
Test / test_heal_pg_size_2 (push) Successful in 2m22s Details
Test / test_heal_ec (push) Successful in 2m20s Details
Test / test_heal_antietcd (push) Successful in 2m19s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m26s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m21s Details
Test / test_heal_csum_32k (push) Successful in 2m25s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m26s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_3 (push) Successful in 14s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 10s Details
Test / test_heal_csum_4k (push) Successful in 2m16s Details
2024-11-04 18:36:35 +03:00
Vitaliy Filippov 56de4a520d Support custom hybrid OSD creation (--hybrid --fast-devices /dev/xxx,/dev/yyy)
Test / test_dd (push) Successful in 13s Details
Test / test_root_node (push) Successful in 8s Details
Test / test_rebalance_verify_ec (push) Successful in 1m41s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m42s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 29s Details
Test / test_write_xor (push) Successful in 33s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m15s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m12s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m18s Details
Test / test_heal_csum_32k (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m16s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m15s Details
2024-11-04 17:52:29 +03:00
Vitaliy Filippov adca162278 Note that osd_per_disk is also incompatible
Test / test_dd (push) Successful in 12s Details
Test / test_root_node (push) Successful in 8s Details
Test / test_rebalance_verify_ec (push) Successful in 1m43s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m45s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 31s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m14s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m25s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m19s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m16s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm_xor (push) Successful in 12s Details
Test / test_scrub (push) Successful in 12s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 13s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 9s Details
Test / test_heal_csum_4k (push) Successful in 2m18s Details
2024-11-04 15:20:01 +03:00
Vitaliy Filippov 490b314d72 Rework & fix new partition waiting code
Test / test_dd (push) Successful in 11s Details
Test / test_root_node (push) Successful in 8s Details
Test / test_rebalance_verify_ec (push) Successful in 1m46s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m48s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m14s Details
Test / test_heal_ec (push) Successful in 2m15s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m14s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m11s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m24s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_snapshot_pool2 (push) Successful in 13s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 9s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm_xor (push) Successful in 12s Details
Test / test_scrub_zero_osd_2 (push) Successful in 11s Details
Test / test_scrub (push) Successful in 13s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 13s Details
Test / test_heal_csum_4k (push) Successful in 2m11s Details
2024-11-04 15:16:30 +03:00
Vitaliy Filippov 9f52074e1e Delete PG history and stats of deleted pools
Test / test_dd (push) Successful in 11s Details
Test / test_rebalance_verify_ec (push) Successful in 1m38s Details
Test / test_root_node (push) Successful in 10s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m40s Details
Test / test_write_no_same (push) Successful in 9s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m13s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m16s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 11s Details
Test / test_scrub_zero_osd_2 (push) Successful in 12s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m17s Details
2024-11-01 02:38:31 +03:00
Vitaliy Filippov 2b3e877546 Add notes about vitastor-disk in disable_data_fsync 2024-11-01 02:38:18 +03:00
Vitaliy Filippov 01d55e5420
Merge pull request #64 from 0x00ace/fio_version_fix
use fio 3.35-1 for AlmaLinux 9
2024-10-31 11:55:40 +03:00
Vitaliy Filippov f5aa5cfdfe Fix "is already mounted" checks in CSI 2024-10-26 14:06:21 +03:00
Vitaliy Filippov 2826bb9e7e Add more logging to CSI 2024-10-24 02:07:55 +03:00
Vitaliy Filippov 30d1ad0f66 Add Intel D5-P4320 2024-10-22 23:22:48 +03:00
Vitaliy Filippov 79719e44ac Release 1.9.2
Test / test_root_node (push) Successful in 8s Details
Test / test_dd (push) Successful in 12s Details
Test / test_rebalance_verify_ec (push) Successful in 1m40s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m41s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 31s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 36s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m15s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m16s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_snapshot_pool2 (push) Successful in 13s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 13s Details
Test / test_scrub_zero_osd_2 (push) Successful in 12s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_3 (push) Successful in 14s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m14s Details
New features:
- Support resizing normal vitastor-disk partitions and moving journal/metadata: [vitastor-disk resize](https://vitastor.io/docs/usage/disk.html#resize)
- Support simple forms of vitastor-disk {dump,write}-{meta,journal} for OSD partitions

Bug fixes:
- Fix block RWX volumes broken after introducing stage/unstage support
- Do not allow creating non-block RWX volumes in CSI
- Fix vitastor-disk prepare not seeing the newly created partition in rare cases
- Fix non-array tags not showing up in ls-osd/osd-tree
- Make OpenNebula oned.conf patching during installation smarter
- Fix iseek option in vitastor-cli dd not working
- Validate conv=, iflag=, oflag= options in vitastor-cli dd
- Fix vitastor-disk write-meta not writing header checksum to the disk
- Fix JSON format in vitastor-disk dump-meta
- Fix read_chain_bitmap not working for snapshot in another pool
- Fix a possible OSD crash during parallel read & write to an image with snapshots
- Several followups to the READ_CHAIN_BITMAP fix: avoid data reads, fix possible overflow in is_zero(), fix bitmap size
2024-10-20 01:49:13 +03:00
Vitaliy Filippov f5626655df Add new disk command docs 2024-10-20 01:47:46 +03:00
Vitaliy Filippov 7e2dde2702 Fix block RWX volumes broken after introducing stage/unstage support 2024-10-19 11:56:56 +03:00
Vitaliy Filippov 3b0ab317cf Validate non-block RWX in CSI 2024-10-18 01:55:38 +03:00
Vitaliy Filippov 18eb99c494 Implement resizing partitions created with vitastor-disk 2024-10-18 01:55:19 +03:00
Vitaliy Filippov 4e8a1a8895 Run partprobe in add_partition() if /dev/disk/by-partuuid symlink is not present 2024-10-12 18:07:53 +03:00
Vitaliy Filippov d27a8bdabc Make get_parent_device return full path 2024-10-12 13:44:52 +03:00
Vitaliy Filippov ebd616e42f Extract clear_osd_superblock() 2024-10-12 13:44:52 +03:00
Vitaliy Filippov b18d296e01 Extract check_existing_partition(), get_device_size() 2024-10-12 13:44:52 +03:00
Vitaliy Filippov a03508320e Move json_is_true/json_is_false to json_util.cpp
Test / test_dd (push) Successful in 12s Details
Test / test_rebalance_verify_ec (push) Successful in 1m37s Details
Test / test_root_node (push) Successful in 9s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m40s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_switch_primary (push) Successful in 31s Details
Test / test_write (push) Successful in 31s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m14s Details
Test / test_heal_ec (push) Successful in 2m15s Details
Test / test_heal_antietcd (push) Successful in 2m15s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m16s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m14s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_snapshot_pool2 (push) Successful in 13s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub_zero_osd_2 (push) Successful in 11s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_3 (push) Successful in 14s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 14s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m9s Details
2024-10-12 00:40:39 +03:00
Vitaliy Filippov c9ccc790ec Fix non-array tags not showing up in ls-osd/osd-tree
Test / test_dd (push) Successful in 13s Details
Test / test_rebalance_verify_ec (push) Successful in 1m39s Details
Test / test_root_node (push) Successful in 9s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m41s Details
Test / test_write_no_same (push) Successful in 9s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 33s Details
Test / test_heal_pg_size_2 (push) Successful in 2m16s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m18s Details
Test / test_heal_csum_32k (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m19s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_imm_xor (push) Successful in 12s Details
Test / test_scrub_zero_osd_2 (push) Successful in 12s Details
Test / test_scrub (push) Successful in 15s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m16s Details
2024-10-11 18:33:35 +03:00
Vitaliy Filippov db2d9c5b3d Fix tables in NFS doc 2024-10-08 00:20:10 +03:00
Vitaliy Filippov 09f15f44c9 Fix Toshiba MG and VDUSE Debian kernel note in docs 2024-10-08 00:17:14 +03:00
Vitaliy Filippov c5a58c2e81 Support reading parameters automatically from the superblock in vitastor-disk {dump,write}-{meta,journal}
Test / test_dd (push) Has been cancelled Details
Test / test_root_node (push) Has been cancelled Details
Test / test_switch_primary (push) Has been cancelled Details
Test / test_write (push) Has been cancelled Details
Test / test_write_xor (push) Has been cancelled Details
Test / test_write_no_same (push) Has been cancelled Details
Test / test_heal_pg_size_2 (push) Has been cancelled Details
Test / build (push) Has been cancelled Details
Test / test_heal_ec (push) Has been cancelled Details
Test / test_heal_antietcd (push) Has been cancelled Details
Test / test_heal_csum_32k_dmj (push) Has been cancelled Details
Test / test_heal_csum_32k_dj (push) Has been cancelled Details
Test / test_heal_csum_32k (push) Has been cancelled Details
Test / test_heal_csum_4k_dmj (push) Has been cancelled Details
Test / test_heal_csum_4k_dj (push) Has been cancelled Details
Test / test_heal_csum_4k (push) Has been cancelled Details
Test / test_snapshot_pool2 (push) Has been cancelled Details
Test / test_osd_tags (push) Has been cancelled Details
Test / test_enospc (push) Has been cancelled Details
Test / test_enospc_xor (push) Has been cancelled Details
Test / test_add_osd (push) Has been cancelled Details
Test / test_enospc_imm (push) Has been cancelled Details
Test / test_enospc_imm_xor (push) Has been cancelled Details
Test / test_scrub (push) Has been cancelled Details
Test / test_scrub_zero_osd_2 (push) Has been cancelled Details
Test / test_scrub_xor (push) Has been cancelled Details
Test / test_scrub_pg_size_3 (push) Has been cancelled Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled Details
Test / test_scrub_ec (push) Has been cancelled Details
Test / test_nfs (push) Has been cancelled Details
2024-10-07 02:21:58 +03:00
Vitaliy Filippov 30e7c2ad1e Add custom OpenNebula oned.conf patcher (it uses a SHITTY configuration file format) 2024-10-06 13:46:05 +03:00
Vitaliy Filippov 2e76ceabbe Fix iseek option in vitastor-cli dd 2024-10-05 18:25:38 +03:00
Vitaliy Filippov 3df088c207 Validate conv=, iflag=, oflag= options in vitastor-cli dd 2024-10-05 18:02:36 +03:00
Vitaliy Filippov d882a19eab Fix vitastor-disk write-meta not writing header checksum to the disk... 2024-10-05 17:32:55 +03:00
Vitaliy Filippov 702be3da7a Fix JSON format in vitastor-disk dump-meta 2024-10-05 16:08:34 +03:00
Vitaliy Filippov 99533e1c2f Fix .yml links 2024-10-02 00:38:07 +03:00
Vitaliy Filippov a6cceb43bf Fix read_chain_bitmap not working for snapshot in another pool
Test / test_dd (push) Successful in 13s Details
Test / test_root_node (push) Successful in 10s Details
Test / test_rebalance_verify_ec (push) Successful in 1m43s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m45s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_write (push) Successful in 31s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m20s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m14s Details
Test / test_heal_csum_32k (push) Successful in 2m21s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m20s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 14s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub (push) Successful in 16s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 17s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 16s Details
Test / test_nfs (push) Successful in 13s Details
Test / test_heal_csum_4k (push) Successful in 2m17s Details
2024-10-02 00:24:48 +03:00
Vitaliy Filippov 745d89459a Fix link, add title 2024-09-29 22:05:56 +03:00
ace b85dab8583
use fio 3.35-1 for AlmaLinux 9 2024-05-18 21:17:16 +03:00
97 changed files with 2406 additions and 630 deletions

View File

@ -22,7 +22,7 @@ RUN apt-get update
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN apt-get -y install jq lp-solve sudo nfs-common
RUN apt-get update && apt-get -y install jq lp-solve sudo nfs-common fdisk parted
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN set -ex; \

View File

@ -828,6 +828,60 @@ jobs:
echo ""
done
test_resize:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_resize.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_resize_auto:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_resize_auto.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_snapshot_pool2:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_snapshot_pool2.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_osd_tags:
runs-on: ubuntu-latest
needs: build

View File

@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VITASTOR_VERSION "1.9.1")
set(VITASTOR_VERSION "1.9.3")
add_subdirectory(src)

View File

@ -1,4 +1,4 @@
## Vitastor
# Vitastor
[Read English version](README.md)
@ -22,7 +22,7 @@ TCP и RDMA и на хорошем железе может достигать з
Vitastor поддерживает QEMU-драйвер, протоколы NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes.
Другие драйверы могут также быть легко реализованы.
Подробности смотрите в документации по ссылкам ниже.
Подробности смотрите в документации по ссылкам. Можете начать отсюда: [Быстрый старт](docs/intro/quickstart.ru.md).
## Презентации и записи докладов
@ -51,7 +51,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
- Параметры
- [Общие](docs/config/common.ru.md)
- [Сетевые](docs/config/network.ru.md)
- [Клиентский код](docs/config/client.en.md)
- [Клиентский код](docs/config/client.ru.md)
- [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
- [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
- [Прочие параметры OSD](docs/config/osd.ru.md)

View File

@ -22,7 +22,7 @@ or internal systems of public clouds.
Vitastor supports QEMU, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers.
More drivers may be created easily.
Read more details below in the documentation.
Read more details in the documentation. You can start from here: [Quick Start](docs/intro/quickstart.en.md).
## Talks and presentations

View File

@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v1.9.1
VITASTOR_VERSION ?= v1.9.3
all: build push

View File

@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.9.1
image: vitalif/vitastor-csi:v1.9.3
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.9.1
image: vitalif/vitastor-csi:v1.9.3
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@ -3,10 +3,10 @@ module vitastor.io/csi
go 1.15
require (
github.com/container-storage-interface/spec v1.4.0
github.com/container-storage-interface/spec v1.8.0
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
github.com/kubernetes-csi/csi-lib-utils v0.9.1
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
golang.org/x/net v0.7.0
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/grpc v1.33.1
google.golang.org/protobuf v1.24.0

View File

@ -41,8 +41,8 @@ github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWR
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
github.com/container-storage-interface/spec v1.2.0/go.mod h1:6URME8mwIBbpVyZV93Ce5St17xBiQJQY67NDsuohiy4=
github.com/container-storage-interface/spec v1.4.0 h1:ozAshSKxpJnYUfmkpZCTYyF/4MYeYlhdXbAvPvfGmkg=
github.com/container-storage-interface/spec v1.4.0/go.mod h1:6URME8mwIBbpVyZV93Ce5St17xBiQJQY67NDsuohiy4=
github.com/container-storage-interface/spec v1.8.0 h1:D0vhF3PLIZwlwZEf2eNbpujGCNwspwTYf2idJRJx4xI=
github.com/container-storage-interface/spec v1.8.0/go.mod h1:ROLik+GhPslwwWRNFF1KasPzroNARibH2rfz1rkg4H0=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -182,6 +182,7 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4=
github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
@ -195,6 +196,7 @@ golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8U
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
@ -213,6 +215,7 @@ golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCc
golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc=
golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@ -228,8 +231,10 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb h1:eBmm0M9fYhWpKZLjQUUKka/LtIxf46G4fxeEz5KJr9U=
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@ -240,6 +245,7 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@ -259,13 +265,22 @@ golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200622214017-ed371f2e16b4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f h1:+Nyd8tzPX9R7BWHguqsrbFdRx3WQ/1ib8I44HXV5yTA=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
@ -286,8 +301,10 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw
golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

View File

@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.9.1"
vitastorCSIDriverVersion = "1.9.3"
)
// Config struct fills the parameters of request or user input

View File

@ -8,11 +8,9 @@ import (
"encoding/json"
"fmt"
"strings"
"bytes"
"strconv"
"time"
"os"
"os/exec"
"io/ioutil"
"github.com/kubernetes-csi/csi-lib-utils/protosanitizer"
@ -114,22 +112,6 @@ func GetConnectionParams(params map[string]string) (map[string]string, error)
return ctxVars, nil
}
func system(program string, args ...string) ([]byte, []byte, error)
{
klog.Infof("Running "+program+" "+strings.Join(args, " "))
c := exec.Command(program, args...)
var stdout, stderr bytes.Buffer
c.Stdout, c.Stderr = &stdout, &stderr
err := c.Run()
if (err != nil)
{
stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", stdoutStr+stderrStr, err)
return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
}
return stdout.Bytes(), stderr.Bytes(), nil
}
func invokeCLI(ctxVars map[string]string, args []string) ([]byte, error)
{
if (ctxVars["configPath"] != "")
@ -158,6 +140,12 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
return nil, status.Error(codes.InvalidArgument, "volume capabilities is a required field")
}
err := cs.checkCaps(volumeCapabilities)
if (err != nil)
{
return nil, err
}
etcdVolumePrefix := req.Parameters["etcdVolumePrefix"]
poolId, _ := strconv.ParseUint(req.Parameters["poolId"], 10, 64)
if (poolId == 0)
@ -301,13 +289,44 @@ func (cs *ControllerServer) ValidateVolumeCapabilities(ctx context.Context, req
return nil, status.Error(codes.InvalidArgument, "volumeCapabilities is nil")
}
err := cs.checkCaps(volumeCapabilities)
if (err != nil)
{
return nil, err
}
return &csi.ValidateVolumeCapabilitiesResponse{
Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{
VolumeCapabilities: req.VolumeCapabilities,
},
}, nil
}
func (cs *ControllerServer) checkCaps(volumeCapabilities []*csi.VolumeCapability) error
{
var volumeCapabilityAccessModes []*csi.VolumeCapability_AccessMode
for _, mode := range []csi.VolumeCapability_AccessMode_Mode{
csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER,
csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY,
csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY,
csi.VolumeCapability_AccessMode_SINGLE_NODE_SINGLE_WRITER,
csi.VolumeCapability_AccessMode_SINGLE_NODE_MULTI_WRITER,
} {
volumeCapabilityAccessModes = append(volumeCapabilityAccessModes, &csi.VolumeCapability_AccessMode{Mode: mode})
}
for _, capability := range volumeCapabilities
{
if (capability.GetBlock() != nil)
{
for _, mode := range []csi.VolumeCapability_AccessMode_Mode{
csi.VolumeCapability_AccessMode_MULTI_NODE_SINGLE_WRITER,
csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER,
} {
volumeCapabilityAccessModes = append(volumeCapabilityAccessModes, &csi.VolumeCapability_AccessMode{Mode: mode})
}
break
}
}
capabilitySupport := false
for _, capability := range volumeCapabilities
@ -323,14 +342,10 @@ func (cs *ControllerServer) ValidateVolumeCapabilities(ctx context.Context, req
if (!capabilitySupport)
{
return nil, status.Errorf(codes.NotFound, "%v not supported", req.GetVolumeCapabilities())
return status.Errorf(codes.NotFound, "%v not supported", volumeCapabilities)
}
return &csi.ValidateVolumeCapabilitiesResponse{
Confirmed: &csi.ValidateVolumeCapabilitiesResponse_Confirmed{
VolumeCapabilities: req.VolumeCapabilities,
},
}, nil
return nil
}
// ListVolumes returns a list of volumes

View File

@ -227,7 +227,32 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that it's not already mounted
_, err = mount.IsNotMountPoint(ns.mounter, targetPath)
notmnt, err := mount.IsNotMountPoint(ns.mounter, targetPath)
if (err == nil)
{
if (!notmnt)
{
klog.Errorf("target path %s is already mounted", targetPath)
return nil, fmt.Errorf("target path %s is already mounted", targetPath)
}
var finfo os.FileInfo
finfo, err = os.Stat(targetPath)
if (err != nil)
{
klog.Errorf("failed to stat %s: %v", targetPath, err)
return nil, err
}
if (finfo.IsDir() != (!isBlock))
{
err = os.Remove(targetPath)
if (err != nil)
{
klog.Errorf("failed to remove %s (to recreate it with correct type): %v", targetPath, err)
return nil, err
}
err = os.ErrNotExist
}
}
if (err != nil)
{
if (os.IsNotExist(err))
@ -280,6 +305,7 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
if (isBlock)
{
klog.Infof("bind-mounting %s to %s", devicePath, targetPath)
err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
}
else
@ -309,39 +335,40 @@ func (ns *NodeServer) NodeStageVolume(ctx context.Context, req *csi.NodeStageVol
readOnly := Contains(opt, "ro")
if (existingFormat == "" && !readOnly)
{
var cmdOut []byte
switch fsType
{
case "ext4":
args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
_, err = systemCombined("mkfs.ext4", args...)
case "xfs":
cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
_, err = systemCombined("mkfs.xfs", "-K", devicePath)
}
if (err != nil)
{
klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
goto unmap
}
}
klog.Infof("formatting and mounting %s to %s with FS %s, options: %v", devicePath, targetPath, fsType, opt)
err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
if (err == nil)
{
klog.Infof("successfully mounted %s to %s", devicePath, targetPath)
}
// Try to run online resize on mount.
// FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
if (err == nil && existingFormat != "" && !readOnly)
{
var cmdOut []byte
switch (fsType)
{
case "ext4":
cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
_, err = systemCombined("resize2fs", devicePath)
case "xfs":
cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
_, err = systemCombined("xfs_growfs", devicePath)
}
if (err != nil)
{
klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
goto unmap
}
}
@ -385,7 +412,7 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
defer ns.unlockVolume(ctxVars["configPath"]+":"+volName)
targetPath := req.GetStagingTargetPath()
devicePath, refCount, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
devicePath, _, err := mount.GetDeviceNameFromMount(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(err))
@ -402,6 +429,16 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
return &csi.NodeUnstageVolumeResponse{}, nil
}
refList, err := ns.mounter.GetMountRefs(targetPath)
if (err != nil)
{
return nil, err
}
if (len(refList) > 0)
{
klog.Warningf("%s is still referenced: %v", targetPath, refList)
}
// unmount
err = mount.CleanupMountPoint(targetPath, ns.mounter, false)
if (err != nil)
@ -410,7 +447,7 @@ func (ns *NodeServer) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstag
}
// unmap device
if (refCount == 1)
if (len(refList) == 0)
{
if (!ns.useVduse)
{
@ -451,15 +488,20 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that stagingTargetPath is mounted
_, err = mount.IsNotMountPoint(ns.mounter, stagingTargetPath)
notmnt, err := mount.IsNotMountPoint(ns.mounter, stagingTargetPath)
if (err != nil)
{
klog.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
return nil, fmt.Errorf("staging path %v is not mounted: %v", stagingTargetPath, err)
klog.Errorf("staging path %v is not mounted: %w", stagingTargetPath, err)
return nil, fmt.Errorf("staging path %v is not mounted: %w", stagingTargetPath, err)
}
else if (notmnt)
{
klog.Errorf("staging path %v is not mounted", stagingTargetPath)
return nil, fmt.Errorf("staging path %v is not mounted", stagingTargetPath)
}
// Check that targetPath is not already mounted
_, err = mount.IsNotMountPoint(ns.mounter, targetPath)
notmnt, err = mount.IsNotMountPoint(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(err))
@ -494,6 +536,11 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
return nil, err
}
}
else if (!notmnt)
{
klog.Errorf("target path %s is already mounted", targetPath)
return nil, fmt.Errorf("target path %s is already mounted", targetPath)
}
execArgs := []string{"--bind", stagingTargetPath, targetPath}
if (req.GetReadonly())

View File

@ -4,6 +4,7 @@
package vitastor
import (
"bytes"
"errors"
"encoding/json"
"fmt"
@ -15,6 +16,8 @@ import (
"syscall"
"k8s.io/klog"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
func Contains(list []string, s string) bool
@ -73,6 +76,10 @@ func checkVduseSupport() bool
" For VDUSE you need at least Linux 5.15 and the following kernel modules: vdpa, virtio-vdpa, vduse.",
)
}
else
{
klog.Infof("VDUSE support enabled successfully")
}
return vduse
}
@ -97,6 +104,7 @@ func mapNbd(volName string, ctxVars map[string]string, readonly bool) (string, e
{
return "", fmt.Errorf("vitastor-nbd did not return the name of NBD device. output: %s", stderr)
}
klog.Infof("Attached volume %s via NBD as %s", volName, dev)
return dev, err
}
@ -217,6 +225,7 @@ func mapVduse(stateDir string, volName string, ctxVars map[string]string, readon
err = os.WriteFile(stateFile, stateJSON, 0600)
if (err == nil)
{
klog.Infof("Attached volume %s via VDUSE as %s (VDPA ID %s)", volName, blockdev, vdpaId)
return blockdev, vdpaId, nil
}
}
@ -299,3 +308,35 @@ func unmapVduseById(stateDir, vdpaId string)
os.Remove(pidFile)
}
}
func system(program string, args ...string) ([]byte, []byte, error)
{
klog.Infof("Running "+program+" "+strings.Join(args, " "))
c := exec.Command(program, args...)
var stdout, stderr bytes.Buffer
c.Stdout, c.Stderr = &stdout, &stderr
err := c.Run()
if (err != nil)
{
stdoutStr, stderrStr := string(stdout.Bytes()), string(stderr.Bytes())
klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s\nOutput:\n%s", err, stdoutStr+stderrStr)
return nil, nil, status.Error(codes.Internal, stdoutStr+stderrStr+" (status "+err.Error()+")")
}
return stdout.Bytes(), stderr.Bytes(), nil
}
func systemCombined(program string, args ...string) ([]byte, error)
{
klog.Infof("Running "+program+" "+strings.Join(args, " "))
c := exec.Command(program, args...)
var out bytes.Buffer
c.Stdout, c.Stderr = &out, &out
err := c.Run()
if (err != nil)
{
outStr := string(out.Bytes())
klog.Errorf(program+" "+strings.Join(args, " ")+" failed: %s, status %s\n", outStr, err)
return nil, status.Error(codes.Internal, outStr+" (status "+err.Error()+")")
}
return out.Bytes(), nil
}

2
debian/changelog vendored
View File

@ -1,4 +1,4 @@
vitastor (1.9.1-1) unstable; urgency=medium
vitastor (1.9.3-1) unstable; urgency=medium
* Bugfixes

View File

@ -106,8 +106,8 @@ SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
it (they have internal SSD cache even though it's not stated in datasheets).
Setting this parameter to "all" or "small" in OSD parameters requires enabling
[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
[disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).
vitastor-disk tries to do that by default, first checking/disabling drive cache.
If it can't disable drive cache, OSDs get initialized with "none".

View File

@ -112,6 +112,6 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
указано в спецификациях).
Указание "all" или "small" в настройках / командной строке OSD требует
включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).

View File

@ -118,12 +118,13 @@ Physical block size of the journal device. Must be a multiple of
- Type: boolean
- Default: false
Do not issue fsyncs to the data device, i.e. do not flush its cache.
Safe ONLY if your data device has write-through cache. If you disable
the cache yourself using `hdparm` or `scsi_disk/cache_type` then make sure
that the cache disable command is run every time before starting Vitastor
OSD, for example, in the systemd unit. See also `immediate_commit` option
for the instructions to disable cache and how to benefit from it.
Do not issue fsyncs to the data device, i.e. do not force it to flush cache.
Safe ONLY if your data device has write-through cache or if write-back
cache is disabled. If you disable drive cache manually with `hdparm` or
writing to `/sys/.../scsi_disk/cache_type` then make sure that you do it
every time before starting Vitastor OSD (vitastor-disk does it automatically).
See also [immediate_commit](layout-cluster.en.md#immediate_commit)
for information about how to benefit from disabled cache.
## disable_meta_fsync
@ -171,8 +172,7 @@ size, it actually has to write the whole 4 KB sector.
Because of this it can actually be beneficial to use SSDs which work well
with 512 byte sectors and use 512 byte disk_alignment, journal_block_size
and meta_block_size. But the only SSD that may fit into this category is
Intel Optane (probably, not tested yet).
and meta_block_size. But at the moment, no such SSDs are known...
Clients don't need to be aware of disk_alignment, so it's not required to
put a modified value into etcd key /vitastor/config/global.

View File

@ -122,13 +122,14 @@ SSD-диске, иначе производительность пострада
- Тип: булево (да/нет)
- Значение по умолчанию: false
Не отправлять fsync-и устройству данных, т.е. не сбрасывать его кэш.
Не отправлять fsync-и устройству данных, т.е. не заставлять его сбрасывать кэш.
Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной
записью (write-through). Если вы отключаете кэш через `hdparm` или
`scsi_disk/cache_type`, то удостоверьтесь, что команда отключения кэша
выполняется перед каждым запуском Vitastor OSD, например, в systemd unit-е.
Смотрите также опцию `immediate_commit` для инструкций по отключению кэша
и о том, как из этого извлечь выгоду.
записью (write-through) или если кэш с отложенной записью (write-back) отключён.
Если вы отключаете кэш вручную через `hdparm` или запись в `/sys/.../scsi_disk/cache_type`,
то удостоверьтесь, что вы делаете это каждый раз перед запуском Vitastor OSD
(vitastor-disk делает это автоматически). Смотрите также опцию
[immediate_commit](layout-cluster.ru.md#immediate_commit) для информации о том,
как извлечь выгоду из отключённого кэша.
## disable_meta_fsync
@ -179,9 +180,8 @@ SSD и HDD диски используют 4 КБ физические сект
Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с
меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment,
journal_block_size и meta_block_size. Однако единственные SSD, которые
теоретически могут попасть в эту категорию - это Intel Optane (но и это
пока не проверялось автором).
journal_block_size и meta_block_size. Однако на данный момент такие SSD
не известны...
Клиентам не обязательно знать про disk_alignment, так что помещать значение
этого параметра в etcd в /vitastor/config/global не нужно.

View File

@ -55,7 +55,7 @@ Examples:
OSD placement tree is set in a separate etcd key `/vitastor/config/node_placement`
in the following JSON format:
`
```
{
"<node name or OSD number>": {
"level": "<level>",
@ -63,7 +63,7 @@ in the following JSON format:
},
...
}
`
```
Here, if a node name is a number then it is assumed to refer to an OSD.
Level of the OSD is always "osd" and cannot be overridden. You may only

View File

@ -54,7 +54,7 @@
Дерево размещения OSD задаётся в отдельном ключе etcd `/vitastor/config/node_placement`
в следующем JSON-формате:
`
```
{
"<имя узла или номер OSD>": {
"level": "<уровень>",
@ -62,7 +62,7 @@
},
...
}
`
```
Здесь, если название узла - число, считается, что это OSD. Уровень OSD
всегда равен "osd" и не может быть переопределён. Для OSD вы можете только

View File

@ -97,9 +97,9 @@
it (they have internal SSD cache even though it's not stated in datasheets).
Setting this parameter to "all" or "small" in OSD parameters requires enabling
[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
[disable_journal_fsync](layout-osd.en.md#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.md#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.md#disable_data_fsync).
vitastor-disk tries to do that by default, first checking/disabling drive cache.
If it can't disable drive cache, OSDs get initialized with "none".
info_ru: |
@ -156,6 +156,6 @@
указано в спецификациях).
Указание "all" или "small" в настройках / командной строке OSD требует
включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
включения [disable_journal_fsync](layout-osd.ru.md#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.md#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.md#disable_data_fsync).

View File

@ -110,20 +110,22 @@
type: bool
default: false
info: |
Do not issue fsyncs to the data device, i.e. do not flush its cache.
Safe ONLY if your data device has write-through cache. If you disable
the cache yourself using `hdparm` or `scsi_disk/cache_type` then make sure
that the cache disable command is run every time before starting Vitastor
OSD, for example, in the systemd unit. See also `immediate_commit` option
for the instructions to disable cache and how to benefit from it.
Do not issue fsyncs to the data device, i.e. do not force it to flush cache.
Safe ONLY if your data device has write-through cache or if write-back
cache is disabled. If you disable drive cache manually with `hdparm` or
writing to `/sys/.../scsi_disk/cache_type` then make sure that you do it
every time before starting Vitastor OSD (vitastor-disk does it automatically).
See also [immediate_commit](layout-cluster.en.md#immediate_commit)
for information about how to benefit from disabled cache.
info_ru: |
Не отправлять fsync-и устройству данных, т.е. не сбрасывать его кэш.
Не отправлять fsync-и устройству данных, т.е. не заставлять его сбрасывать кэш.
Безопасно, ТОЛЬКО если ваше устройство данных имеет кэш со сквозной
записью (write-through). Если вы отключаете кэш через `hdparm` или
`scsi_disk/cache_type`, то удостоверьтесь, что команда отключения кэша
выполняется перед каждым запуском Vitastor OSD, например, в systemd unit-е.
Смотрите также опцию `immediate_commit` для инструкций по отключению кэша
и о том, как из этого извлечь выгоду.
записью (write-through) или если кэш с отложенной записью (write-back) отключён.
Если вы отключаете кэш вручную через `hdparm` или запись в `/sys/.../scsi_disk/cache_type`,
то удостоверьтесь, что вы делаете это каждый раз перед запуском Vitastor OSD
(vitastor-disk делает это автоматически). Смотрите также опцию
[immediate_commit](layout-cluster.ru.md#immediate_commit) для информации о том,
как извлечь выгоду из отключённого кэша.
- name: disable_meta_fsync
type: bool
default: false
@ -179,8 +181,7 @@
Because of this it can actually be beneficial to use SSDs which work well
with 512 byte sectors and use 512 byte disk_alignment, journal_block_size
and meta_block_size. But the only SSD that may fit into this category is
Intel Optane (probably, not tested yet).
and meta_block_size. But at the moment, no such SSDs are known...
Clients don't need to be aware of disk_alignment, so it's not required to
put a modified value into etcd key /vitastor/config/global.
@ -198,9 +199,8 @@
Поэтому, на самом деле, может быть выгодно найти SSD, хорошо работающие с
меньшими, 512-байтными, блоками и использовать 512-байтные disk_alignment,
journal_block_size и meta_block_size. Однако единственные SSD, которые
теоретически могут попасть в эту категорию - это Intel Optane (но и это
пока не проверялось автором).
journal_block_size и meta_block_size. Однако на данный момент такие SSD
не известны...
Клиентам не обязательно знать про disk_alignment, так что помещать значение
этого параметра в etcd в /vitastor/config/global не нужно.

View File

@ -4,6 +4,8 @@
[Читать на русском](opennebula.ru.md)
# OpenNebula
## Automatic Installation
OpenNebula plugin is packaged as `vitastor-opennebula` Debian and RPM package since Vitastor 1.9.0. So:

View File

@ -4,6 +4,8 @@
[Read in English](opennebula.en.md)
# OpenNebula
## Автоматическая установка
Плагин OpenNebula Vitastor распространяется как Debian и RPM пакет `vitastor-opennebula`, начиная с версии Vitastor 1.9.0. Так что:

View File

@ -6,19 +6,150 @@
# Architecture
- [Server-side components](#server-side-components)
- [Basic concepts](#basic-concepts)
- [Client-side components](#client-side-components)
- [Additional utilities](#additional-utilities)
- [Overall read/write process](#overall-read-write-process)
- [Nuances of request handling](#nuances-of-request-handling)
- [Similarities to Ceph](#similarities-to-ceph)
- [Differences from Ceph](#differences-from-ceph)
- [Implementation Principles](#implementation-principles)
## Server-side components
- **OSD** (Object Storage Daemon) is a process that directly works with the disk, stores data
and serves read/write requests. One OSD serves one disk (or one partition). OSDs talk to etcd
and to each other — they receive cluster state from etcd, and send read/write requests for
secondary copies of data to other OSDs.
- **etcd** — clustered key/value database, used as a reliable storage for configuration
and high-level cluster state. Etcd is the component that prevents splitbrain in the cluster.
Data blocks are not stored in etcd, etcd doesn't participate in data write or read path.
- **Monitor** — a separate node.js based daemon which monitors the cluster, calculates
required configuration changes and saves them to etcd, thus commanding OSDs to apply these
changes. Monitor also aggregates cluster statistics. OSDs don't talk to the monitor; the monitor
only sends data to and receives data from etcd.
## Basic concepts
- OSD (Object Storage Daemon) is a process that stores data and serves read/write requests.
- PG (Placement Group) is a "shard" of the cluster, group of data stored on one set of replicas.
- Pool is a container for data that has equal redundancy scheme and placement rules.
- Monitor is a separate daemon that watches cluster state and handles failures.
- Failure Domain is a group of OSDs that you allow to fail. It's "host" by default.
- Placement Tree groups OSDs in a hierarchy to later split them into Failure Domains.
- **Pool** is a container for data that has equal redundancy scheme and disk placement rules.
- **PG (Placement Group)** is a "shard" of the cluster, subdivision unit that has its own
set of OSDs for data storage.
- **Failure Domain** is a group of OSDs, from the simultaneous failure of which you are
protected by Vitastor. Default failure domain is "host" (server), but you can choose a
larger (for example, a rack of servers) or smaller (a single drive) failure domain
for every pool.
- **Placement Tree** (similar to Ceph CRUSH Tree) groups OSDs in a hierarchy to later
split them into Failure Domains.
## Client-side components
- **Client library** encapsulates client I/O logic. Client library connects to etcd and to all OSDs,
receives cluster state from etcd, sends read and write requests directly to all OSDs. Due
to the symmetric distributed architecture, all data blocks (each 128 KB by default) are placed
to different OSDs, but clients always know where each data block is stored and connect directly
to the right OSD.
All other client-side components are based on the client library:
- **[vitastor-cli](../usage/cli.en.md)** — command-line utility for cluster management.
Allows to view cluster state, manage pools and images, i.e. create, modify and remove
virtual disks, their snapshots and clones.
- **[QEMU driver](../usage/qemu.en.md)** — pluggable QEMU module allowing QEMU/KVM virtual
machines to work with virtual Vitastor disks directly from userspace through the client library,
without the need to attach disks as kernel block devices. However, if you want to attach
disks, you can also do that with the same driver and [VDUSE](../usage/qemu.en.md#vduse).
- **[vitastor-nbd](../usage/nbd.en.md)** — utility that allows attaching Vitastor disks as
kernel block devices using NBD (Network Block Device), which works more like "BUSE"
(Block Device In Userspace). Vitastor doesn't have Linux kernel modules for the same task
(at least for now). NBD is an older, non-recommended way to attach disks — you should use
VDUSE whenever you can.
- **[CSI driver](../installation/kubernetes.en.md)** — driver for attaching Vitastor images
as Kubernetes persistent volumes. Works through VDUSE (when available) or NBD — images are
attached as kernel block devices and mounted into containers.
- **Drivers for Proxmox, OpenStack and so on** — pluggable modules for corresponding systems,
allowing to use Vitastor as storage in them.
- **[vitastor-nfs](../usage/nfs.en.md)** — NFS 3.0 server allowing export of two file system variants:
the first is a simplified pseudo-FS for file-based access to Vitastor block images (for non-QEMU
hypervisors with NFS support), the second is **VitastorFS**, full-featured clustered POSIX FS.
Both variants support parallel access from multiple vitastor-nfs servers. In fact, you are
not required to set up separate NFS servers at all: you can instead use the vitastor-nfs mount
command on every client node — it starts the NFS server and mounts the FS locally.
- **[fio driver](../usage/fio.en.md)** — pluggable module for fio disk benchmarking tool for
running performance tests on your Vitastor cluster.
- **vitastor-kv** — client for a key-value DB working over shared block volumes (usual
vitastor images). VitastorFS metadata is stored in vitastor-kv.
## Additional utilities
- **vitastor-disk** — a Vitastor OSD disk management tool. You can create, remove,
resize and move OSD partitions with it.
## Overall read/write process
- Vitastor stores virtual disks, also named "images" or "inodes".
- Each image is stored in some pool. Pool specifies storage parameters such as redundancy
scheme (replication or EC — erasure codes, i.e. error correction codes), failure domain
and restrictions on OSD selection for image data placement. See [Pool configuration](../config/pool.en.md) for details.
- Each image is split into objects/blocks of fixed size, equal to [block_size](../config/layout-cluster.en.md#block_size)
(128 KB by default), multiplied by data part count for EC or 1 for replicas. That is,
if a pool uses EC 4+2 coding scheme (4 data parts + 2 parity parts), then, with the
default block_size, images are split into 512 KB objects.
- Client read/write requests are split into parts at object boundaries.
- Each object is mapped to a PG number it belongs to, by simply taking a remainder of
division of its offset by PG count of the image's pool.
- Client reads primary OSD for all PGs from etcd. Primary OSD for each PG is assigned
by the monitor during cluster operation, along with the full PG OSD set.
- If not already connected, client connects to primary OSDs of all PGs involved in a
read/write request and sends parts of the request to them.
- If a primary OSD is unavailable, client retries connection attempts indefinitely
either until it becomes available or until the monitor assigns another OSD as primary
for that PG.
- Client also retries requests if the primary OSD replies with error code EPIPE, meaning
that the PG is inactive at this OSD at the moment - for example, when the primary OSD
is switched, or if the primary OSD itself loses connection to replicas during request
handling.
- Primary OSD determines where the parts of the object are stored. By default, all objects
are assumed to be stored at the target OSD set of a PG, but some of them may be present
at a different OSD set if they are degraded or moved, or if the data rebalancing process
is active. The primary OSD doesn't do any network requests to check this: it calculates
locations of all objects during PG activation and stores them in memory.
- Primary OSD handles the request locally when it can - for example, when it's a read
from a replicated pool or when it's a read from an EC pool involving only one data part
stored on the OSD's local disk.
- When a request requires reads or writes to additional OSDs, primary OSD uses already
established connections to secondary OSDs of the PG to execute these requests. This happens
in parallel to local disk operations. All such connections are guaranteed to be already
established when the PG is active, and if any of them is dropped, PG is restarted and
all current read/write operations to it fail with EPIPE error and are retried by clients.
- After completing all secondary read/write requests, primary OSD sends the response to
the client.
### Nuances of request handling
- If a pool uses erasure codes and some of the OSDs are unavailable, primary OSDs recover
data from the remaining parts during read.
- Each object has a version number. During write, primary OSD first determines the current
version of the object. As primary OSD usually stores the object or its part itself, most
of the time version is read from the memory of the OSD itself. However, if primary OSD
doesn't contain parts of the object, it requests the version number from a secondary OSD
which has that part. Such request still doesn't involve reading from the disk though,
because object metadata, including version number, is always stored in OSD memory.
- If a pool uses erasure codes, partial writes of an object require reading other parts of
it from secondary OSDs or from the local disk of the primary OSD itself. This is called
"read-modify-write" process.
- If a pool uses erasure codes, two-phase write process is used to get rid of the Write Hole
problem: first a new version of object parts is written to all secondary OSDs without
removing the previous version, and then, after receiving successful write confirmations
from all OSDs, new version is committed and the old one is allowed to be removed.
- If a pool doesn't use immediate_commit mode, then write requests sent by clients aren't
treated as committed to physical media instantly. Clients have to send a separate type of
request (SYNC) to commit changes, and until it is sent, new versions of data are
allowed to be lost if some OSDs die. Thus, when immediate_commit is disabled, clients
store copies of all write requests in memory and repeat them from there when the
connection to primary OSD is lost. This in-memory copy is removed after a successful
SYNC, and to prevent excessive memory usage, clients also do an automatic SYNC
every [client_dirty_limit](../config/network.en.md#client_dirty_limit) written bytes.
## Similarities to Ceph

View File

@ -11,6 +11,7 @@
- [Серверные компоненты](#серверные-компоненты)
- [Базовые понятия](#базовые-понятия)
- [Клиентские компоненты](#клиентские-компоненты)
- [Дополнительные утилиты](#дополнительные-утилиты)
- [Общий процесс записи и чтения](#общий-процесс-записи-и-чтения)
- [Особенности обработки запросов](#особенности-обработки-запросов)
- [Схожесть с Ceph](#схожесть-с-ceph)
@ -34,8 +35,9 @@
- **Пул (Pool)** — контейнер для данных, имеющих одну и ту же схему избыточности и правила распределения по OSD.
- **PG (Placement Group)** — "шард", единица деления пулов в кластере, которой назначается свой набор
OSD для хранения данных (копий или частей объектов).
- **Домен отказа (Failure Domain)** — группа OSD, одновременное падение которых рассматривается
как вероятное. По умолчанию это "host" (сервер).
- **Домен отказа (Failure Domain)** — группа OSD, от одновременного падения которых должен защищать
Vitastor. По умолчанию домен отказа — "host" (сервер), но вы можете установить для пула как больший
домен отказа (например, стойку серверов), так и меньший (например, отдельный диск).
- **Дерево распределения** (Placement Tree, в Ceph CRUSH Tree) — иерархическая группировка OSD
в узлы, которые далее можно использовать как домены отказа.
@ -49,25 +51,39 @@
На базе клиентской библиотеки реализованы все остальные клиенты:
- **vitastor-cli** — утилита командной строки для управления кластером. В данный момент позволяет
просматривать общее состояние кластера и управлять образами — т.е. создавать, менять и удалять
виртуальные диски, их снимки и клоны.
- **Драйвер QEMU** — подключаемый модуль QEMU, позволяющий QEMU/KVM виртуальным машинам работать
с виртуальными дисками Vitastor напрямую из пространства пользователя с помощью клиентской
библиотеки, без необходимости отображения дисков в виде блочных устройств. Тот же драйвер
позволяет подключать диски в систему через [VDUSE](../usage/qemu.ru.md#vduse).
- **vitastor-nbd** — утилита, позволяющая монтировать образы Vitastor в виде блочных устройств
с помощью NBD (Network Block Device), на самом деле скорее работающего как "BUSE"
(Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в Vitastor нет
(по крайней мере, пока).
- **CSI драйвер** — драйвер для подключения Vitastor-образов в виде персистентных томов (PV) Kubernetes.
Работает через vitastor-nbd — образы отражаются в виде блочных устройств и монтируются
в контейнеры.
- **[vitastor-cli](../usage/cli.ru.md)** — утилита командной строки для управления кластером.
Позволяет просматривать общее состояние кластера, управлять пулами и образами — то есть
создавать, менять и удалять виртуальные диски, их снимки и клоны.
- **[Драйвер QEMU](../usage/qemu.ru.md)** — подключаемый модуль QEMU, позволяющий QEMU/KVM
виртуальным машинам работать с виртуальными дисками Vitastor напрямую из пространства пользователя
с помощью клиентской библиотеки, без необходимости подключения дисков в виде блочных устройств
Linux. Если, однако, вы хотите подключать диски в виде блочных устройств, то вы тоже можете
сделать это с помощью того же самого драйвера и [VDUSE](../usage/qemu.ru.md#vduse).
- **[vitastor-nbd](../usage/nbd.ru.md)** — утилита, позволяющая монтировать образы Vitastor
в виде блочных устройств с помощью NBD (Network Block Device), на самом деле скорее работающего
как "BUSE" (Block Device In Userspace). Модуля ядра Linux для выполнения той же задачи в
Vitastor нет (по крайней мере, пока). NBD — более старый и нерекомендуемый способ подключения
дисков — вам следует использовать VDUSE всегда, когда это возможно.
- **[CSI драйвер](../installation/kubernetes.ru.md)** — драйвер для подключения Vitastor-образов
в виде персистентных томов (PV) Kubernetes. Работает через VDUSE (если доступно) или через
NBD — образы отражаются в виде блочных устройств и монтируются в контейнеры.
- **Драйвера Proxmox, OpenStack и т.п.** — подключаемые модули для соответствующих систем,
позволяющие использовать Vitastor как хранилище в оных.
- **vitastor-nfs** — утилита, предоставляющая файловый доступ к образам в кластере Vitastor
по протоколу NFS 3.0. Предназначена для гипервизоров, не основанных на QEMU и Linux, но при
этом поддерживающих NFS.
- **[vitastor-nfs](../usage/nfs.ru.md)** — NFS 3.0 сервер, предоставляющий два варианта файловой системы:
первая — упрощённая для файлового доступа к блочным образам (для не-QEMU гипервизоров, поддерживающих NFS),
вторая — VitastorFS, полноценная кластерная POSIX ФС. Оба варианта поддерживают параллельный
доступ с нескольких vitastor-nfs серверов. На самом деле можно вообще не выделять
отдельные NFS-серверы, а вместо этого использовать команду vitastor-nfs mount, запускающую
NFS-сервер прямо на клиентской машине и монтирующую ФС локально.
- **[Драйвер fio](../usage/fio.ru.md)** — подключаемый модуль для утилиты тестирования
производительности дисков fio, позволяющий тестировать Vitastor-кластеры.
- **vitastor-kv** — клиент для key-value базы данных, работающей поверх разделяемого блочного
образа (обычного блочного образа vitastor). Метаданные VitastorFS хранятся именно в vitastor-kv.
## Дополнительные утилиты
- **vitastor-disk** — утилита для разметки дисков под Vitastor OSD. С её помощью можно
создавать, удалять, менять размеры или перемещать разделы OSD.
## Общий процесс записи и чтения
@ -98,16 +114,22 @@
находиться на других OSD, если эти объекты деградированы или перемещены, или идёт процесс
ребаланса. Запросы для проверки по сети не отправляются, информация о местоположении всех
объектов рассчитывается первичным OSD при активации PG и хранится в памяти.
- Первичный OSD соединяется (если ещё не соединён) с вторичными OSD, на которых располагаются
части объекта, и отправляет им запросы чтения/записи, а также читает/пишет из/в своё локальное
хранилище, если сам входит в набор.
- Когда это возможно, первичный OSD обрабатывает запрос локально. Например, так происходит
при чтениях объектов из пулов с репликацией или при чтении из EC пула, затрагивающего
только часть, хранимую на диске самого первичного OSD.
- Когда запрос требует записи или чтения с вторичных OSD, первичный OSD использует заранее
установленные соединения с ними для выполнения этих запросов. Это происходит параллельно
локальным операциям чтения/записи с диска самого OSD. Так как соединения к вторичным OSD PG
устанавливаются при её запуске, то они уже гарантированно установлены, когда PG активна,
и если любое из этих соединений отключается, PG перезапускается, а все текущие запросы чтения
и записи в неё завершаются с ошибкой EPIPE, после чего повторяются клиентами.
- После завершения всех вторичных операций чтения/записи первичный OSD отправляет ответ клиенту.
### Особенности обработки запросов
- Если в пуле используются коды коррекции ошибок и при этом часть OSD недоступна, первичный
OSD при чтении восстанавливает данные из оставшихся частей.
- Каждый объект имеет номер версии. При записи объекта первичный OSD сначала читает из номер
- Каждый объект имеет номер версии. При записи объекта первичный OSD сначала получает номер
версии объекта. Так как первичный OSD обычно сам хранит копию или часть объекта, номер
версии обычно читается из памяти самого OSD. Однако, если ни одна часть обновляемого объекта
не находится на первичном OSD, для получения номера версии он обращается к одному из вторичных
@ -115,20 +137,20 @@
так как метаданные объектов, включая номер версии, все OSD хранят в памяти.
- Если в пуле используются коды коррекции ошибок, перед частичной записью объекта для вычисления
чётности зачастую требуется чтение частей объекта с вторичных OSD или с локального диска
самого первичного OSD.
- Также, если в пуле используются коды коррекции ошибок, для закрытия Write Hole применяется
самого первичного OSD. Это называется процессом "чтение-модификация-запись" (read-modify-write).
- Если в пуле используются коды коррекции ошибок, для закрытия Write Hole применяется
двухфазный алгоритм записи: сначала на все вторичные OSD записывается новая версия частей
объекта, но при этом старая версия не удаляется, а потом, после получения подтверждения
успешной записи от всех вторичных OSD, новая версия фиксируется и разрешается удаление старой.
- Если в кластере не включён режим immediate_commit, то запросы записи, отправляемые клиентами,
- Если в пуле не включён режим immediate_commit, то запросы записи, отправляемые клиентами,
не считаются зафиксированными на физических накопителях сразу. Для фиксации данных клиенты
должны отдельно отправлять запросы SYNC (отдельный от чтения и записи вид запроса),
а пока такой запрос не отправлен, считается, что записанные данные могут исчезнуть,
если соответствующий OSD упадёт. Поэтому, когда режим immediate_commit отключён, все
запросы записи клиенты копируют в памяти и при потере соединения и повторном соединении
с OSD повторяют из памяти. Скопированные в память данные удаляются при успешном fsync,
с OSD повторяют из памяти. Скопированные в память данные удаляются при успешном SYNC,
а чтобы хранение этих данных не приводило к чрезмерному потреблению памяти, клиенты
автоматически выполняют fsync каждые [client_dirty_limit](../config/network.ru.md#client_dirty_limit)
автоматически выполняют SYNC каждые [client_dirty_limit](../config/network.ru.md#client_dirty_limit)
записанных байт.
## Схожесть с Ceph

View File

@ -32,7 +32,7 @@
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
Intel DC-P3700/P4500/P4600, Intel D5-P4320, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
## Configure monitors

View File

@ -22,7 +22,7 @@
использовать и десктопные SSD, включив режим отложенного fsync, но производительность будет хуже.
О конденсаторах читайте [здесь](../config/layout-cluster.ru.md#immediate_commit).
- Если хотите использовать HDD, берите современные модели с Media или SSD кэшем - HGST Ultrastar,
Toshiba MG08, Seagate EXOS или что-то похожее. Если такого кэша у ваших дисков нет,
Toshiba MG, Seagate EXOS или что-то похожее. Если такого кэша у ваших дисков нет,
обязательно возьмите SSD под метаданные и журнал (маленькие, буквально 2 ГБ на 1 ТБ HDD-места).
- Возьмите быструю сеть, минимум 10 гбит/с. Идеал - что-то вроде Mellanox ConnectX-4 с RoCEv2.
- Для лучшей производительности отключите энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
@ -32,8 +32,8 @@
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
Intel DC-P3700/P4500/P4600, Intel D5-P4320, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
## Настройте мониторы

View File

@ -171,7 +171,14 @@ to make them use the new version of the client library.
### 1.7.x to 1.8.0
After upgrading version <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients
It's recommended to upgrade from version <= 1.7.x to version >= 1.8.0 with full downtime,
i.e. you should first stop clients and then the cluster (OSDs and monitor), because 1.8.0
includes a fix for etcd event stream inconsistency which could lead to "incomplete" objects
appearing in EC pools, and in rare cases, probably, even to data corruption during mass OSD
restarts. It doesn't mean that you WILL hit this problem if you upgrade without full downtime,
but it's better to secure yourself against it.
Also, if you upgrade from version <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients
(VMs and so on), otherwise they will hang when monitor clears old PG configuration key,
which happens 24 hours after upgrade.

View File

@ -168,7 +168,14 @@ done
### 1.7.x -> 1.8.0
После обновления с версий <= 1.7.x до версий >= 1.8.0, НО <= 1.9.0: перезапустите всех
Обновляться с версий <= 1.7.x до версий >= 1.8.0 рекомендуется с полной остановкой
сначала клиентов, а затем кластера, так как в 1.8.0 исправлена проблема (неконсистентность
потоков событий от etcd), способная приводить к появлению incomplete объектов в EC-пулах
и, хоть и редко, но даже к повреждению данных при массовых перезапусках OSD. Если вы
обновляетесь без полной остановки - это не значит, что вы обязательно столкнётесь с этой
проблемой, но лучше подстраховаться.
Также, если вы обновляетесь с версии <= 1.7.x до версии >= 1.8.0, НО <= 1.9.0: перезапустите всех
клиентов (процессы виртуальных машин можно перезапустить путём миграции на другой сервер),
иначе они зависнут, когда монитор удалит старый ключ конфигурации PG, что происходит через
24 часа после обновления.

View File

@ -13,6 +13,7 @@ It supports the following commands:
- [prepare](#prepare)
- [upgrade-simple](#upgrade-simple)
- [resize](#resize)
- [raw-resize](#raw-resize)
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
- [purge](#purge)
- [read-sb](#read-sb)
@ -50,12 +51,16 @@ Options (automatic mode):
--osd_per_disk <N>
Create <N> OSDs on each disk (default 1)
--hybrid
Prepare hybrid (HDD+SSD) OSDs using provided devices. SSDs will be used for
journals and metadata, HDDs will be used for data. Partitions for journals and
metadata will be created automatically. Whether disks are SSD or HDD is decided
by the `/sys/block/.../queue/rotational` flag. In hybrid mode, default object
size is 1 MB instead of 128 KB, default journal size is 1 GB instead of 32 MB,
and throttle_small_writes is enabled by default.
Prepare hybrid (HDD+SSD, NVMe+SATA, etc.) OSDs using provided devices. By default,
any passed SSDs will be used for journals and metadata, HDDs will be used for data,
but you can override this behaviour with --fast-devices option. Journal and metadata
partitions will be created automatically. In the default mode, SSD and HDD disks
are distinguished by the `/sys/block/.../queue/rotational` flag. When HDDs are used
for data in hybrid mode, default block_size is 1 MB instead of 128 KB, default journal
size is 1 GB instead of 32 MB, and throttle_small_writes is enabled by default.
--fast-devices /dev/nvmeX,/dev/nvmeY
In --hybrid mode, use these devices for journal and metadata instead of auto-detecting
and extracting them from the main [devices...] list.
--disable_data_fsync auto
Disable data device cache and fsync (1/yes/true = on, default auto)
--disable_meta_fsync auto
@ -127,25 +132,49 @@ Requires the `sfdisk` utility.
## resize
`vitastor-disk resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]`
`vitastor-disk resize <osd_num>|<osd_device> [OPTIONS]`
Resize data area and/or rewrite/move journal and metadata.
Resize data area and/or move journal and metadata:
| <!-- --> | <!-- --> |
|---------------------------|----------------------------------------|
| `--move-journal TARGET` | move journal to `TARGET` |
| `--move-meta TARGET` | move metadata to `TARGET` |
| `--journal-size NEW_SIZE` | resize journal to `NEW_SIZE` |
| `--data-size NEW_SIZE` | resize data device to `NEW_SIZE` |
| `--dry-run` | only show new layout, do not apply it |
`NEW_SIZE` may include k/m/g/t suffixes.
`TARGET` may be one of:
| <!-- --> | <!-- --> |
|----------------|--------------------------------------------------------------------------|
| `<partition>` | move journal/metadata to an existing GPT partition |
| `<raw_device>` | create a GPT partition on `<raw_device>` and move journal/metadata to it |
| `""` | (empty string) move journal/metadata back to the data device |
## raw-resize
`vitastor-disk raw-resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]`
Resize data area and/or rewrite/move journal and metadata (manual format).
`ALL_OSD_PARAMETERS` must include all (at least all disk-related)
parameters from OSD command line (i.e. from systemd unit or superblock).
`NEW_LAYOUT` may include new disk layout parameters:
```
--new_data_offset SIZE resize data area so it starts at SIZE
--new_data_len SIZE resize data area to SIZE bytes
--new_meta_device PATH use PATH for new metadata
--new_meta_offset SIZE make new metadata area start at SIZE
--new_meta_len SIZE make new metadata area SIZE bytes long
--new_journal_device PATH use PATH for new journal
--new_journal_offset SIZE make new journal area start at SIZE
--new_journal_len SIZE make new journal area SIZE bytes long
```
| <!-- --> | <!-- --> |
|-----------------------------|-------------------------------------------|
| `--new_data_offset SIZE` | resize data area so it starts at `SIZE` |
| `--new_data_len SIZE` | resize data area to `SIZE` bytes |
| `--new_meta_device PATH` | use `PATH` for new metadata |
| `--new_meta_offset SIZE` | make new metadata area start at `SIZE` |
| `--new_meta_len SIZE` | make new metadata area `SIZE` bytes long |
| `--new_journal_device PATH` | use `PATH` for new journal |
| `--new_journal_offset SIZE` | make new journal area start at `SIZE` |
| `--new_journal_len SIZE` | make new journal area `SIZE` bytes long |
SIZE may include k/m/g/t suffixes. If any of the new layout parameter
options are not specified, old values will be used.
@ -217,10 +246,14 @@ Intended for use from startup scripts (i.e. from systemd units).
## dump-journal
`vitastor-disk dump-journal [OPTIONS] <osd_device>`
`vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>`
Dump journal in human-readable or JSON (if `--json` is specified) format.
You can specify any OSD device (data, metadata or journal), or the layout manually.
Options:
```
@ -233,23 +266,35 @@ Options:
## write-journal
`vitastor-disk write-journal <osd_device>`
`vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>`
Write journal from JSON taken from standard input in the same format as produced by
`dump-journal --json --format data`.
You can specify any OSD device (data, metadata or journal), or the layout manually.
## dump-meta
`vitastor-disk dump-meta <osd_device>`
`vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>`
Dump metadata in JSON format.
You can specify any OSD device (data, metadata or journal), or the layout manually.
## write-meta
`vitastor-disk write-meta <osd_device>`
`vitastor-disk write-meta <meta_file> <offset> <size>`
Write metadata from JSON taken from standard input in the same format as produced by `dump-meta`.
You can specify any OSD device (data, metadata or journal), or the layout manually.
## simple-offsets
`vitastor-disk simple-offsets <device>`

View File

@ -13,6 +13,7 @@ vitastor-disk - инструмент командной строки для уп
- [prepare](#prepare)
- [upgrade-simple](#upgrade-simple)
- [resize](#resize)
- [raw-resize](#raw-resize)
- [start/stop/restart/enable/disable](#start/stop/restart/enable/disable)
- [purge](#purge)
- [read-sb](#read-sb)
@ -50,12 +51,17 @@ vitastor-disk - инструмент командной строки для уп
--osd_per_disk <N>
Создавать по несколько (<N>) OSD на каждом диске (по умолчанию 1)
--hybrid
Инициализировать гибридные (HDD+SSD) OSD на указанных дисках. SSD будут
использованы для журналов и метаданных, а HDD - для данных. Разделы для журналов
и метаданных будут созданы автоматически. Является ли диск SSD или HDD, определяется
по флагу `/sys/block/.../queue/rotational`. В гибридном режиме по умолчанию
используется размер объекта 1 МБ вместо 128 КБ, размер журнала 1 ГБ вместо 32 МБ
и включённый throttle_small_writes.
Инициализировать гибридные (HDD+SSD, NVMe+SATA и т.п.) OSD на указанных дисках.
По умолчанию, SSD будут использованы для журналов и метаданных, а HDD - для данных,
но вы можете поменять это поведение опцией --fast-devices. Разделы для журналов
и метаданных будут созданы автоматически. В режиме по умолчанию SSD и HDD-диски
различаются по флагу `/sys/block/.../queue/rotational`. Когда в гибридном режиме
для данных используются HDD, по умолчанию размер блока устанавливается 1 МБ вместо
128 КБ, размер журнала 1 ГБ вместо 32 МБ, и throttle_small_writes включается по
умолчанию.
--fast-devices /dev/nvmeX,/dev/nvmeY
Использовать данные диски для журналов и метаданных в гибридном режиме вместо их
автоопределения и извлечения из основного списка [devices...].
--disable_data_fsync auto
Отключать кэш и fsync-и для устройств данных. (1/yes/true = да, по умолчанию автоопределение)
--disable_meta_fsync auto
@ -129,27 +135,51 @@ throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
## resize
`vitastor-disk resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]`
`vitastor-disk resize <osd_num>|<osd_device> [OPTIONS]`
Изменить размер области данных и/или переместить журнал и метаданные.
Изменить размер области данных и/или переместить журнал и метаданные:
В `ALL_OSD_PARAMETERS` нужно указать все относящиеся к диску параметры OSD
| <!-- --> | <!-- --> |
|-------------------------------|------------------------------------------------|
| `--move-journal ЦЕЛЬ` | переместить журнал на `ЦЕЛЬ` |
| `--move-meta ЦЕЛЬ` | переместить метаданные на `ЦЕЛЬ` |
| `--journal-size НОВЫЙ_РАЗМЕР` | изменить размер журнала на `НОВЫЙ_РАЗМЕР` |
| `--data-size НОВЫЙ_РАЗМЕР` | изменить размер диска данных на `НОВЫЙ_РАЗМЕР` |
| `--dry-run` | показать новые параметры, но не применять их |
`НОВЫЙ_РАЗМЕР` может быть указан с суффиксами k/m/g/t (кило/мега/гига/терабайт).
`ЦЕЛЬ` может быть одним из:
| <!-- --> | <!-- --> |
|-----------------|-------------------------------------------------------------------------------------|
| `<раздел>` | переместить журнал/метаданные на существующий GPT-раздел |
| `<полный_диск>` | создать GPT-раздел на диске `<полный_диск>` и переместить журнал/метаданные на него |
| `""` | (пустая строка) переместить журнал/метаданные обратно на диск данных |
## raw-resize
`vitastor-disk raw-resize <ВСЕ_ПАРАМЕТРЫ_OSD> <НОВЫЕ_РАЗМЕРЫ> [--iodepth 32]`
Изменить размер области данных и/или переместить журнал и метаданные (ручной формат).
В `ВСЕ_ПАРАМЕТРЫ_OSD` нужно указать все относящиеся к диску параметры OSD
из суперблока OSD или из файла сервиса systemd (в старых версиях).
В `NEW_LAYOUT` нужно указать новые параметры расположения данных:
В `НОВЫЕ_РАЗМЕРЫ` нужно указать новые параметры расположения данных:
```
--new_data_offset РАЗМЕР сдвинуть начало области данных на РАЗМЕР байт
--new_data_len РАЗМЕР изменить размер области данных до РАЗМЕР байт
--new_meta_device ПУТЬ использовать ПУТЬ как новое устройство метаданных
--new_meta_offset РАЗМЕР разместить новые метаданные по смещению РАЗМЕР байт
--new_meta_len РАЗМЕР сделать новые метаданные размером РАЗМЕР байт
--new_journal_device ПУТЬ использовать ПУТЬ как новое устройство журнала
--new_journal_offset РАЗМЕР разместить новый журнал по смещению РАЗМЕР байт
--new_journal_len РАЗМЕР сделать новый журнал размером РАЗМЕР байт
```
| <!-- --> | <!-- --> |
|-------------------------------|-------------------------------------------------------|
| `--new_data_offset РАЗМЕР` | сдвинуть начало области данных на `РАЗМЕР` байт |
| `--new_data_len РАЗМЕР` | изменить размер области данных до `РАЗМЕР` байт |
| `--new_meta_device ПУТЬ` | использовать `ПУТЬ` как новое устройство метаданных |
| `--new_meta_offset РАЗМЕР` | разместить новые метаданные по смещению `РАЗМЕР` байт |
| `--new_meta_len РАЗМЕР` | сделать новые метаданные размером `РАЗМЕР` байт |
| `--new_journal_device ПУТЬ` | использовать `ПУТЬ` как новое устройство журнала |
| `--new_journal_offset РАЗМЕР` | разместить новый журнал по смещению `РАЗМЕР` байт |
| `--new_journal_len РАЗМЕР` | сделать новый журнал размером `РАЗМЕР` байт |
РАЗМЕР может быть указан с суффиксами k/m/g/t. Если любой из новых параметров
`РАЗМЕР` может быть указан с суффиксами k/m/g/t. Если любой из новых параметров
расположения не указан, он принимается равным старому значению.
## start/stop/restart/enable/disable
@ -224,10 +254,15 @@ OSD отключены fsync-и.
## dump-journal
`vitastor-disk dump-journal <osd_device>`
`vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>`
Вывести журнал в человекочитаемом или в JSON (с опцией `--json`) виде.
Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
параметры расположения вручную.
Опции:
```
@ -240,22 +275,37 @@ OSD отключены fsync-и.
## write-journal
`vitastor-disk write-journal <osd_device>`
`vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>`
Записать журнал из JSON со стандартного ввода в формате, аналогичном `dump-journal --json --format data`.
Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
параметры расположения вручную.
## dump-meta
`vitastor-disk dump-meta <osd_device>`
`vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>`
Вывести метаданные в формате JSON.
Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
параметры расположения вручную.
## write-meta
`vitastor-disk write-meta <osd_device>`
`vitastor-disk write-meta <meta_file> <offset> <size>`
Записать метаданные из JSON со стандартного ввода в формате, аналогичном `dump-meta`.
Вы можете указать любой раздел OSD - данных, журнала или метаданных - либо указать все
параметры расположения вручную.
## simple-offsets
`vitastor-disk simple-offsets <device>`

View File

@ -156,17 +156,17 @@ behind. Defragmentation removes garbage and moves data still in use to new volum
Options:
| <!-- --> | <!-- --> |
|--------------------------|------------------------------------------------------------------------ |
| --volume_untouched 86400 | Defragment volumes last appended to at least this number of seconds ago |
| --defrag_percent 50 | Defragment volumes with at least this % of removed data |
| --defrag_block_count 16 | Read this number of pool blocks at once during defrag |
| --defrag_iodepth 16 | Move up to this number of files in parallel during defrag |
| --trace | Print verbose defragmentation status |
| --dry-run | Skip modifications, only print status |
| --recalc-stats | Recalculate all volume statistics |
| --include-empty | Include old and empty volumes; make sure to restart NFS servers before using it |
| --no-rm | Move, but do not delete data |
| <!-- --> | <!-- --> |
|----------------------------|------------------------------------------------------------------------ |
| `--volume_untouched 86400` | Defragment volumes last appended to at least this number of seconds ago |
| `--defrag_percent 50` | Defragment volumes with at least this % of removed data |
| `--defrag_block_count 16` | Read this number of pool blocks at once during defrag |
| `--defrag_iodepth 16` | Move up to this number of files in parallel during defrag |
| `--trace` | Print verbose defragmentation status |
| `--dry-run` | Skip modifications, only print status |
| `--recalc-stats` | Recalculate all volume statistics |
| `--include-empty` | Include old and empty volumes; make sure to restart NFS servers before using it |
| `--no-rm` | Move, but do not delete data |
## Common options

View File

@ -164,17 +164,17 @@ JSON-формате :-). Для инспекции содержимого БД
Опции:
| <!-- --> | <!-- --> |
|--------------------------|------------------------------------------------------------------------ |
| --volume_untouched 86400 | Дефрагментировать только тома, в которые уже не писали это число секунд |
| --defrag_percent 50 | Дефрагментировать только тома, в которых этот % данных удалён |
| --defrag_block_count 16 | Читать это количество блоков пула за один раз |
| --defrag_iodepth 16 | Перемещать одновременно до этого числа файлов |
| --trace | Печатать детальную статистику дефрагментации |
| --dry-run | Не производить никаких изменений, только описать выполняемые действия |
| --recalc-stats | Пересчитать и сохранить статистику всех томов |
| --include-empty | Дефрагментировать старые и пустые тома; обязательно перезапустите NFS-сервера после использования этой опции |
| --no-rm | Перемещать, но не удалять данные |
| <!-- --> | <!-- --> |
|----------------------------|------------------------------------------------------------------------ |
| `--volume_untouched 86400` | Дефрагментировать только тома, в которые уже не писали это число секунд |
| `--defrag_percent 50` | Дефрагментировать только тома, в которых этот % данных удалён |
| `--defrag_block_count 16` | Читать это количество блоков пула за один раз |
| `--defrag_iodepth 16` | Перемещать одновременно до этого числа файлов |
| `--trace` | Печатать детальную статистику дефрагментации |
| `--dry-run` | Не производить никаких изменений, только описать выполняемые действия |
| `--recalc-stats` | Пересчитать и сохранить статистику всех томов |
| `--include-empty` | Дефрагментировать старые и пустые тома; обязательно перезапустите NFS-сервера после использования этой опции |
| `--no-rm` | Перемещать, но не удалять данные |
## Общие опции

View File

@ -151,9 +151,9 @@ Example performance comparison:
To try VDUSE you need at least Linux 5.15, built with VDUSE support
(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
Debian Linux kernels have these options disabled by now, so if you want to try it on Debian,
use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
or build modules for Debian kernel manually:
Debian Linux kernels had these options disabled until 6.6, so make sure you install a newer kernel
(from bookworm-backports, trixie or newer Debian version) if you want to try VDUSE. You can also
build modules for an existing kernel manually:
```
mkdir build

View File

@ -154,9 +154,9 @@ VDUSE - на данный момент лучший интерфейс для п
Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
В ядрах в Debian Linux поддержка пока отключена по умолчанию, так что чтобы попробовать VDUSE
на Debian, поставьте ядро из Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
из Proxmox или соберите модули для ядра Debian вручную:
В ядрах в Debian Linux эти опции включены, только начиная с 6.6, так что установите свежее ядро
из bookworm-backports, trixie или из более новой версии Debian, если хотите попробовать VDUSE.
Либо же вы можете самостоятельно собрать модули для установленного ядра:
```
mkdir build

View File

@ -567,6 +567,7 @@ class Mon
async apply_pool_pgs(results, up_osds, osd_tree, tree_hash)
{
const etcd_request = { compare: [], success: [] };
for (const pool_id in (this.state.pg.config||{}).items||{})
{
// We should stop all PGs when deleting a pool or changing its PG count
@ -579,9 +580,24 @@ class Mon
return false;
}
}
if (!this.state.config.pools[pool_id])
{
// Delete PG history and stats of the deleted pool
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.config.etcd_prefix+'/pg/history/'+pool_id+'/'),
range_end: b64(this.config.etcd_prefix+'/pg/history/'+pool_id+'0'),
} });
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.config.etcd_prefix+'/pg/stats/'+pool_id+'/'),
range_end: b64(this.config.etcd_prefix+'/pg/stats/'+pool_id+'0'),
} });
etcd_request.success.push({ requestDeleteRange: {
key: b64(this.config.etcd_prefix+'/pgstats/'+pool_id+'/'),
range_end: b64(this.config.etcd_prefix+'/pgstats/'+pool_id+'0'),
} });
}
}
const new_pg_config = JSON.parse(JSON.stringify(this.state.pg.config));
const etcd_request = { compare: [], success: [] };
for (const pool_id in (new_pg_config||{}).items||{})
{
if (!this.state.config.pools[pool_id])

View File

@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "1.9.1",
"version": "1.9.3",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

View File

@ -3,7 +3,9 @@
set -e
reapply_patch() {
if ! patch -f --dry-run -F 0 -R $1 < $2 >/dev/null; then
if ! [[ -e $1 ]]; then
echo "$1 does not exist, OpenNebula is not installed"
elif ! patch -f --dry-run -F 0 -R $1 < $2 >/dev/null; then
already_applied=0
if ! patch --no-backup-if-mismatch -r - -F 0 -f $1 < $2; then
applied_ok=0
@ -15,8 +17,13 @@ echo "Reapplying Vitastor patches to OpenNebula's oned.conf, vmm_execrc and down
already_applied=1
applied_ok=1
reapply_patch /var/lib/one/remotes/datastore/downloader.sh /var/lib/one/remotes/datastore/vitastor/downloader-vitastor.sh.diff
reapply_patch /etc/one/oned.conf /var/lib/one/remotes/datastore/vitastor/oned.conf.diff
reapply_patch /etc/one/vmm_exec/vmm_execrc /var/lib/one/remotes/datastore/vitastor/vmm_execrc.diff
if [[ -e /etc/one/oned.conf ]]; then
if ! /var/lib/one/remotes/datastore/vitastor/patch-oned-conf.py /etc/one/oned.conf; then
applied_ok=0
already_applied=0
fi
fi
if [[ "$already_applied" = 1 ]]; then
echo "OK: Vitastor OpenNebula patches are already applied"
elif [[ "$applied_ok" = 1 ]]; then

View File

@ -0,0 +1,115 @@
#!/usr/bin/env python3
# Patch /etc/one/oned.conf for Vitastor support
# -s = also enable save.vitastor/restore.vitastor overrides
import re
import os
import sys
class Fixer:
    """Rewrites the text of OpenNebula's oned.conf to enable Vitastor drivers.

    Call fix() with the file contents; it returns the patched text and leaves
    a list of human-readable problems in self.errors (empty on full success).
    """

    # When truthy, save=save.vitastor and restore=restore.vitastor overrides
    # are added to the kvm VM_MAD arguments in addition to deploy=deploy.vitastor
    save_restore = 0

    def require_sub_cb(self, m, cb):
        """Mark the current substitution as matched and delegate to cb(m).

        Kept for backward compatibility; require_sub() no longer depends on it.
        """
        self.found = 1
        return cb(m)

    def require_sub(self, regexp, cb, text, error):
        """Apply re.sub(regexp, cb, text), recording `error` if nothing matched.

        The match count is taken from re.subn() instead of a shared instance
        flag, because callbacks call require_sub() recursively and would
        otherwise overwrite the outer call's "found" state, producing spurious
        "... not found" errors for sections that were in fact present.

        An empty `error` disables error reporting for this substitution.
        Returns the text after substitution.
        """
        new_text, count = re.subn(regexp, cb, text)
        # Still expose the result of the latest call through self.found
        self.found = 1 if count else 0
        if not count and error:
            self.errors.append(error)
        return new_text

    def fix(self, oned_conf):
        """Return oned_conf patched for Vitastor; problems go to self.errors.

        Patches the VM_MAD, TM_MAD and DATASTORE_MAD sections and appends the
        INHERIT_DATASTORE_ATTR, TM_MAD_CONF and DS_MAD_CONF entries when they
        are not present yet. Applying it to already patched text changes nothing.
        """
        self.errors = []
        self.kvm_found = 0
        oned_conf = self.require_sub(r'((?:^|\n)[ \t]*VM_MAD\s*=\s*\[)([^\]]+)\]', lambda m: m.group(1)+self.fix_vm_mad(m.group(2))+']', oned_conf, 'VM_MAD not found')
        if not self.kvm_found:
            self.errors.append("VM_MAD[NAME=kvm].ARGUMENTS not found")
        oned_conf = self.require_sub(r'((?:^|\n)[ \t]*TM_MAD\s*=\s*\[)([^\]]+)\]', lambda m: m.group(1)+self.fix_tm_mad(m.group(2))+']', oned_conf, 'TM_MAD not found')
        oned_conf = self.require_sub(r'((?:^|\n)[ \t]*DATASTORE_MAD\s*=\s*\[)([^\]]+)\]', lambda m: m.group(1)+self.fix_datastore_mad(m.group(2))+']', oned_conf, 'DATASTORE_MAD not found')
        # Make sure appended sections start on their own line
        if oned_conf[-1:] != '\n':
            oned_conf += '\n'
        if not re.compile(r'(^|\n)[ \t]*INHERIT_DATASTORE_ATTR\s*=\s*"VITASTOR_CONF"').search(oned_conf):
            oned_conf += '\nINHERIT_DATASTORE_ATTR="VITASTOR_CONF"\n'
        if not re.compile(r'(^|\n)[ \t]*INHERIT_DATASTORE_ATTR\s*=\s*"IMAGE_PREFIX"').search(oned_conf):
            oned_conf += '\nINHERIT_DATASTORE_ATTR="IMAGE_PREFIX"\n'
        if not re.compile(r'(^|\n)[ \t]*TM_MAD_CONF\s*=\s*\[[^\]]*NAME\s*=\s*"vitastor"').search(oned_conf):
            oned_conf += ('\nTM_MAD_CONF = [\n'+
                ' NAME = "vitastor", LN_TARGET = "NONE", CLONE_TARGET = "SELF", SHARED = "YES",\n'+
                ' DS_MIGRATE = "NO", DRIVER = "raw", ALLOW_ORPHANS="format",\n'+
                ' TM_MAD_SYSTEM = "ssh,shared", LN_TARGET_SSH = "SYSTEM", CLONE_TARGET_SSH = "SYSTEM",\n'+
                ' DISK_TYPE_SSH = "FILE", LN_TARGET_SHARED = "NONE",\n'+
                ' CLONE_TARGET_SHARED = "SELF", DISK_TYPE_SHARED = "FILE"\n'+
                ']\n')
        if not re.compile(r'(^|\n)[ \t]*DS_MAD_CONF\s*=\s*\[[^\]]*NAME\s*=\s*"vitastor"').search(oned_conf):
            oned_conf += ('\nDS_MAD_CONF = [\n'+
                ' NAME = "vitastor",\n'+
                ' REQUIRED_ATTRS = "DISK_TYPE,BRIDGE_LIST",\n'+
                ' PERSISTENT_ONLY = "NO",\n'+
                ' MARKETPLACE_ACTIONS = "export"\n'+
                ']\n')
        return oned_conf

    def fix_vm_mad(self, vm_mad_params):
        """Patch the body of one VM_MAD section; only NAME="kvm" is touched.

        self.kvm_found is set only when an ARGUMENTS value was actually
        rewritten, so a kvm section without ARGUMENTS is reported by fix()
        as "VM_MAD[NAME=kvm].ARGUMENTS not found" instead of passing silently.
        """
        if re.compile(r'\bNAME\s*=\s*"kvm"').search(vm_mad_params):
            vm_mad_params, count = re.subn(r'\b(ARGUMENTS\s*=\s*")([^"]+)"', lambda m: m.group(1)+self.fix_vm_mad_args(m.group(2))+'"', vm_mad_params)
            if count:
                self.kvm_found = 1
        return vm_mad_params

    def fix_vm_mad_args(self, args):
        """Add the Vitastor action overrides to a kvm ARGUMENTS string."""
        args = self.fix_vm_mad_override(args, 'deploy')
        if self.save_restore:
            args = self.fix_vm_mad_override(args, 'save')
            args = self.fix_vm_mad_override(args, 'restore')
        return args

    def fix_vm_mad_override(self, args, override):
        """Ensure the -l option contains `<override>=<override>.vitastor`.

        Leaves args untouched when the override is already present, records an
        error when the action is overridden with something else, extends an
        existing -l list, or appends a new -l option when there is none.
        """
        m = re.compile(r'-l (\S+)').search(args)
        # Plain substring checks: the '.' in '<override>.vitastor' is literal
        if m and (override+'='+override+'.vitastor') in m.group(1):
            return args
        elif m and (override+'=') in m.group(1):
            self.errors.append(override+"= is already overridden in -l option in VM_MAD[NAME=kvm].ARGUMENTS")
            return args
        elif m:
            return self.require_sub(r'-l (\S+)', lambda m: '-l '+m.group(1)+','+override+'='+override+'.vitastor', args, '-l option not found in VM_MAD[NAME=kvm].ARGUMENTS')
        else:
            return args+' -l '+override+'='+override+'.vitastor'

    def fix_tm_mad(self, params):
        """Add `vitastor` to the -d list in the ARGUMENTS of a TM_MAD section."""
        return self.require_sub(r'\b(ARGUMENTS\s*=\s*")([^"]+)"', lambda m: m.group(1)+self.fix_tm_mad_args('d', m.group(2), "TM_MAD")+'"', params, "TM_MAD.ARGUMENTS not found")

    def fix_tm_mad_args(self, opt, args, v):
        """Add `vitastor` to the comma-separated value of option -<opt> in args.

        `v` is the section name used in the error message when -<opt> is absent.
        """
        return self.require_sub('(-'+opt+r') (\S+)', lambda m: self.fix_tm_mad_arg(m), args, "-"+opt+" option not found in "+v+".ARGUMENTS")

    def fix_tm_mad_arg(self, m):
        """Return the matched `-x a,b,c` option with `vitastor` appended if missing."""
        a = m.group(2).split(',')
        if 'vitastor' not in a:
            a.append('vitastor')
        return m.group(1)+' '+(','.join(a))

    def fix_datastore_mad(self, params):
        """Add `vitastor` to both the -d and -s lists of DATASTORE_MAD ARGUMENTS."""
        params = self.require_sub(r'\b(ARGUMENTS\s*=\s*")([^"]+)"', lambda m: m.group(1)+self.fix_tm_mad_args('d', m.group(2), "DATASTORE_MAD")+'"', params, "DATASTORE_MAD.ARGUMENTS not found")
        # Missing ARGUMENTS was already reported by the first pass, hence the empty error
        return self.require_sub(r'\b(ARGUMENTS\s*=\s*")([^"]+)"', lambda m: m.group(1)+self.fix_tm_mad_args('s', m.group(2), "DATASTORE_MAD")+'"', params, "")
# Command-line entry point: parse arguments, patch the given oned.conf
# in place and exit with status 1 if anything could not be patched.
import shutil

fixer = Fixer()
oned_conf_file = ''
for arg in sys.argv[1:]:
    if arg == '-s':
        fixer.save_restore = 1
    else:
        # The first non-option argument is the file to patch
        oned_conf_file = arg
        break
if not oned_conf_file:
    sys.stderr.write("USAGE: ./patch-oned-conf.py [-s] /etc/one/oned.conf\n-s means also enable save.vitastor/restore.vitastor overrides\n")
    sys.exit(1)
with open(oned_conf_file, 'r') as fd:
    oned_conf = fd.read()
new_conf = fixer.fix(oned_conf)
if new_conf != oned_conf:
    # Back up by copying and then overwrite the original file in place.
    # Renaming the original away and creating a new file would silently
    # reset its owner, group and permission bits to the caller's defaults.
    shutil.copy2(oned_conf_file, oned_conf_file+'.bak')
    with open(oned_conf_file, 'w') as fd:
        fd.write(new_conf)
if len(fixer.errors) > 0:
    sys.stderr.write("ERROR: Failed to patch "+oned_conf_file+", patch it manually. Errors:\n- "+('\n- '.join(fixer.errors))+'\n')
    sys.exit(1)

View File

@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VITASTOR_VERSION = '1.9.1'
VITASTOR_VERSION = '1.9.3'
LOG = logging.getLogger(__name__)

View File

@ -306,12 +306,12 @@ index e5ff653a60..884ecc79ea 100644
+ etcd = virBufferContentAndReset(&buf);
+ }
+
+ if (virJSONValueObjectCreate(&ret,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ if (virJSONValueObjectAdd(&ret,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ return NULL;
+
+ return ret;

View File

@ -0,0 +1,172 @@
diff --git a/block/meson.build b/block/meson.build
index f1262ec2ba..3cf3e23f16 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -114,6 +114,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index fbda17c987..3edac22aff 100644
--- a/meson.build
+++ b/meson.build
@@ -1510,6 +1510,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2351,6 +2371,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4510,6 +4531,7 @@ summary_info += {'fdt support': fdt_opt == 'internal' ? 'internal' : fdt}
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 0269fa0f16..4740ffdc27 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index aa40d44f1d..bbee6a0e9c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3203,7 +3203,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4286,6 +4286,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4742,6 +4764,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5183,6 +5206,20 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5405,6 +5442,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index c97079a38c..4623f552ec 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' qpl Query Processing Library support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -444,6 +445,8 @@ _meson_option_parse() {
--disable-qpl) printf "%s" -Dqpl=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@ -1,11 +1,11 @@
Name: vitastor
Version: 1.9.1
Version: 1.9.3
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.9.1.el7.tar.gz
Source0: vitastor-1.9.3.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@ -1,11 +1,11 @@
Name: vitastor
Version: 1.9.1
Version: 1.9.3
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.9.1.el8.tar.gz
Source0: vitastor-1.9.3.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@ -1,11 +1,11 @@
Name: vitastor
Version: 1.9.1
Version: 1.9.3
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.9.1.el9.tar.gz
Source0: vitastor-1.9.3.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
@ -74,7 +74,7 @@ Vitastor library headers for development.
Summary: Vitastor - fio drivers
Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release}
Requires: fio = 3.27-8.el9
Requires: fio = 3.35-1.el9
%description -n vitastor-fio

View File

@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVITASTOR_VERSION="1.9.1")
add_definitions(-DVITASTOR_VERSION="1.9.3")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})

View File

@ -10,7 +10,7 @@ endif (IBVERBS_LIBRARIES)
add_library(vitastor_common STATIC
../util/epoll_manager.cpp etcd_state_client.cpp messenger.cpp ../util/addr_util.cpp
msgr_stop.cpp msgr_op.cpp msgr_send.cpp msgr_receive.cpp ../util/ringloop.cpp ../../json11/json11.cpp
http_client.cpp osd_ops.cpp pg_states.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ${MSGR_RDMA}
http_client.cpp osd_ops.cpp pg_states.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ../util/json_util.cpp ${MSGR_RDMA}
)
target_link_libraries(vitastor_common pthread)
target_compile_options(vitastor_common PUBLIC -fPIC)
@ -88,7 +88,7 @@ add_executable(test_cluster_client
EXCLUDE_FROM_ALL
../test/test_cluster_client.cpp
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp ../test/mock/messenger.cpp msgr_stop.cpp
etcd_state_client.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ../../json11/json11.cpp
etcd_state_client.cpp ../util/timerfd_manager.cpp ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp
)
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)
target_include_directories(test_cluster_client BEFORE PUBLIC ${CMAKE_SOURCE_DIR}/src/test/mock)

View File

@ -4,7 +4,7 @@
#include <stdexcept>
#include <assert.h>
#include "cluster_client_impl.h"
#include "http_client.h" // json_is_true
#include "json_util.h"
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config)
{
@ -955,7 +955,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
? (stripe + pg_block_size) : (op->offset + op->len);
op->parts[i].iov.reset();
op->parts[i].flags = 0;
if (op->cur_inode != op->inode || op->opcode == OSD_OP_READ && dirty_copied)
if (op->opcode != OSD_OP_READ_CHAIN_BITMAP && op->cur_inode != op->inode || op->opcode == OSD_OP_READ && dirty_copied)
{
// Read remaining parts from upper layers
uint64_t prev = begin, cur = begin;

View File

@ -15,6 +15,7 @@
#include "addr_util.h"
#include "str_util.h"
#include "json_util.h"
#include "json11/json11.hpp"
#include "http_client.h"
#include "timerfd_manager.h"
@ -61,6 +62,7 @@ struct http_co_t
inline void end() { ended = true; if (!onstack) { delete this; } }
void run_cb_and_clear();
void start_connection();
void start_ws_connection();
void close_connection();
void next_request();
void handle_events();
@ -111,7 +113,7 @@ http_co_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, cons
handler->keepalive = false;
handler->request = request;
handler->response_callback = response_callback;
handler->start_connection();
handler->start_ws_connection();
return handler;
}
@ -281,6 +283,27 @@ void http_co_t::close_connection()
epoll_events = 0;
}
// Start the connection for a websocket request and, when request_timeout
// is positive, schedule a timer which fails the request if the handler has
// not reached the HTTP_CO_WEBSOCKET state by the time it fires
void http_co_t::start_ws_connection()
{
    stackin();
    start_connection();
    if (request_timeout > 0)
    {
        auto on_timeout = [this](int timer_id)
        {
            stackin();
            if (state != HTTP_CO_WEBSOCKET)
            {
                // Still not a websocket - drop the connection and report the error
                close_connection();
                parsed = { .error = "Websocket connection timed out" };
                run_cb_and_clear();
            }
            stackout();
        };
        timeout_id = tfd->set_timer(request_timeout, false, on_timeout);
    }
    stackout();
}
void http_co_t::start_connection()
{
stackin();
@ -724,22 +747,3 @@ static bool ws_parse_frame(std::string & buf, int & type, std::string & res)
buf = buf.substr(hdr+len);
return true;
}
// FIXME: move to utils
// Returns true for the strings "true", "yes" and "1";
// for non-string values returns val.bool_value()
bool json_is_true(const json11::Json & val)
{
    if (!val.is_string())
        return val.bool_value();
    const std::string & text = val.string_value();
    return text == "true" || text == "yes" || text == "1";
}
// Returns true for boolean false, for a number equal to 0 and for the
// strings "false", "no" and "0"; every other value yields false
bool json_is_false(const json11::Json & val)
{
    if (val.is_bool())
        return !val.bool_value();
    if (val.is_number())
        return val.number_value() == 0;
    if (!val.is_string())
        return false;
    const std::string & text = val.string_value();
    return text == "false" || text == "no" || text == "0";
}

View File

@ -48,9 +48,3 @@ void http_request(http_co_t *handler, const std::string & host, const std::strin
const http_options_t & options, std::function<void(const http_response_t *response)> response_callback);
void http_post_message(http_co_t *handler, int type, const std::string & msg);
void http_close(http_co_t *co);
// Utils
std::string strtolower(const std::string & in);
// FIXME: move to json11
bool json_is_true(const json11::Json & val);
bool json_is_false(const json11::Json & val);

View File

@ -177,7 +177,7 @@ protected:
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
std::vector<std::function<void()>> set_immediate;
std::vector<osd_op_t*> set_immediate_ops;
public:
timerfd_manager_t *tfd;
@ -237,6 +237,8 @@ protected:
void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl);
void handle_reply_ready(osd_op_t *op);
void handle_immediate_ops();
void clear_immediate_ops(int peer_fd);
#ifdef WITH_RDMA
void try_send_rdma(osd_client_t *cl);

View File

@ -598,6 +598,7 @@ void osd_messenger_t::handle_rdma_events()
}
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
stop_client(client_id);
clear_immediate_ops(client_id);
continue;
}
if (!is_send)
@ -606,6 +607,7 @@ void osd_messenger_t::handle_rdma_events()
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
{
// handle_read_buffer may stop the client
clear_immediate_ops(client_id);
continue;
}
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
@ -666,9 +668,5 @@ void osd_messenger_t::handle_rdma_events()
}
}
} while (event_count > 0);
for (auto cb: set_immediate)
{
cb();
}
set_immediate.clear();
handle_immediate_ops();
}

View File

@ -65,6 +65,7 @@ void osd_messenger_t::read_requests()
bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{
bool ret = false;
int peer_fd = cl->peer_fd;
cl->read_msg.msg_iovlen = 0;
cl->refs--;
if (cl->peer_state == PEER_STOPPED)
@ -101,7 +102,8 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{
if (!handle_read_buffer(cl, cl->in_buf, result))
{
goto fin;
clear_immediate_ops(peer_fd);
return false;
}
}
else
@ -113,7 +115,8 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{
if (!handle_finished_read(cl))
{
goto fin;
clear_immediate_ops(peer_fd);
return false;
}
}
}
@ -122,15 +125,47 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
ret = true;
}
}
fin:
for (auto cb: set_immediate)
{
cb();
}
set_immediate.clear();
handle_immediate_ops();
return ret;
}
void osd_messenger_t::clear_immediate_ops(int peer_fd)
{
size_t i = 0, j = 0;
while (i < set_immediate_ops.size())
{
if (set_immediate_ops[i]->peer_fd == peer_fd)
{
delete set_immediate_ops[i];
}
else
{
if (i != j)
set_immediate_ops[j] = set_immediate_ops[i];
j++;
}
i++;
}
set_immediate_ops.resize(j);
}
void osd_messenger_t::handle_immediate_ops()
{
for (auto op: set_immediate_ops)
{
if (op->op_type == OSD_OP_IN)
{
exec_op(op);
}
else
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
}
}
set_immediate_ops.clear();
}
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
{
// Compose operation(s) from the buffer
@ -199,7 +234,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
{
// Operation is ready
cl->received_ops.push_back(cl->read_op);
set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
set_immediate_ops.push_back(cl->read_op);
cl->read_op = NULL;
cl->read_state = 0;
}
@ -295,7 +330,7 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{
// Operation is ready
cl->received_ops.push_back(cur_op);
set_immediate.push_back([this, cur_op]() { exec_op(cur_op); });
set_immediate_ops.push_back(cur_op);
cl->read_op = NULL;
cl->read_state = 0;
}
@ -416,9 +451,5 @@ void osd_messenger_t::handle_reply_ready(osd_op_t *op)
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
);
set_immediate.push_back([op]()
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
});
set_immediate_ops.push_back(op);
}

View File

@ -16,7 +16,6 @@
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qerror.h"
#include "qemu/uri.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/option.h"
@ -1021,7 +1020,11 @@ static BlockDriver bdrv_vitastor = {
// FIXME: Implement it along with per-inode statistics
//.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size,
#if QEMU_VERSION_MAJOR > 9 || QEMU_VERSION_MAJOR == 9 && QEMU_VERSION_MINOR > 0
.bdrv_open = vitastor_file_open,
#else
.bdrv_file_open = vitastor_file_open,
#endif
.bdrv_close = vitastor_close,
// Option list for the create operation

View File

@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 1.9.1
Version: 1.9.3
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@ -369,6 +369,7 @@ struct cli_dd_t
{
cli_tool_t *parent;
std::vector<std::string> conv, iflag, oflag;
dd_in_info_t iinfo;
dd_out_info_t oinfo;
@ -430,7 +431,7 @@ struct cli_dd_t
if (read_op->retval < 0)
{
fprintf(
stderr, "Failed to read bitmap for %lu bytes from image %s at offset %lu: %s (code %d)\n",
stderr, "Failed to read bitmap for %ju bytes from image %s at offset %ju: %s (code %d)\n",
read_op->len, iinfo.iimg.c_str(), read_op->offset,
strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval
);
@ -475,7 +476,7 @@ struct cli_dd_t
if (read_op->retval != read_op->len)
{
fprintf(
stderr, "Failed to read %lu bytes from image %s at offset %lu: %s (code %d)\n",
stderr, "Failed to read %ju bytes from image %s at offset %ju: %s (code %d)\n",
read_op->len, iinfo.iimg.c_str(), read_op->offset,
strerror(read_op->retval < 0 ? -read_op->retval : EIO), read_op->retval
);
@ -546,7 +547,7 @@ struct cli_dd_t
if (data->res < 0)
{
fprintf(
stderr, "Failed to read %lu bytes from %s at offset %lu: %s (code %d)\n",
stderr, "Failed to read %ju bytes from %s at offset %ju: %s (code %d)\n",
data->iov.iov_len, iinfo.ifile == "" ? "stdin" : iinfo.ifile.c_str(), cur_read->offset,
strerror(-data->res), data->res
);
@ -643,7 +644,7 @@ struct cli_dd_t
if (write_op->retval != write_op->len)
{
fprintf(
stderr, "Failed to write %lu bytes to image %s at offset %lu: %s (code %d)\n",
stderr, "Failed to write %ju bytes to image %s at offset %ju: %s (code %d)\n",
write_op->len, oinfo.oimg.c_str(), write_op->offset,
strerror(write_op->retval < 0 ? -write_op->retval : EIO), write_op->retval
);
@ -679,7 +680,7 @@ struct cli_dd_t
if (data->res < 0)
{
fprintf(
stderr, "Failed to write %lu bytes to %s at offset %lu: %s (code %d)\n",
stderr, "Failed to write %ju bytes to %s at offset %ju: %s (code %d)\n",
data->iov.iov_len, oinfo.ofile == "" ? "stdout" : oinfo.ofile.c_str(),
oinfo.out_seekable ? cur_read->offset+cur_read->len+oseek : 0,
strerror(-data->res), data->res
@ -726,7 +727,7 @@ struct cli_dd_t
{
char buf[256];
snprintf(
buf, sizeof(buf), "%lu bytes (%s) copied, %.1f s, %sB/s",
buf, sizeof(buf), "%ju bytes (%s) copied, %.1f s, %sB/s",
written_size, format_size(written_size).c_str(), sec_total,
format_size((uint64_t)(written_size/sec_total), true).c_str()
);
@ -748,7 +749,7 @@ struct cli_dd_t
else
{
fprintf(
stderr, "\r%lu bytes (%s) copied, %.1f s, %sB/s, avg %sB/s\033[K",
stderr, "\r%ju bytes (%s) copied, %.1f s, %sB/s, avg %sB/s\033[K",
written_size, format_size(written_size).c_str(), sec_total,
format_size((uint64_t)(delta/sec_delta), true).c_str(),
format_size((uint64_t)(written_size/sec_total), true).c_str()
@ -766,6 +767,49 @@ struct cli_dd_t
goto resume_3;
else if (state == 4)
goto resume_4;
for (int i = 0; i < conv.size(); i++)
{
if (conv[i] == "nofsync")
oinfo.end_fsync = false;
else if (conv[i] == "trunc")
oinfo.out_trunc = true;
else if (conv[i] == "nocreat")
oinfo.out_create = false;
else if (conv[i] == "noerror")
ignore_errors = true;
else if (conv[i] == "nosparse")
write_zero = true;
else
{
result = (cli_result_t){ .err = EINVAL, .text = "Unknown option conv="+conv[i] };
state = 100;
return;
}
}
for (int i = 0; i < iflag.size(); i++)
{
if (iflag[i] == "direct")
iinfo.in_direct = true;
else
{
result = (cli_result_t){ .err = EINVAL, .text = "Unknown option iflag="+iflag[i] };
state = 100;
return;
}
}
for (int i = 0; i < oflag.size(); i++)
{
if (oflag[i] == "direct")
oinfo.out_direct = true;
else if (oflag[i] == "append")
oinfo.out_append = true;
else
{
result = (cli_result_t){ .err = EINVAL, .text = "Unknown option oflag="+oflag[i] };
state = 100;
return;
}
}
if ((oinfo.oimg != "" && oinfo.ofile != "") || (iinfo.iimg != "" && iinfo.ifile != ""))
{
result = (cli_result_t){ .err = EINVAL, .text = "Image and file can't be specified at the same time" };
@ -908,6 +952,18 @@ static uint64_t parse_blocks(json11::Json v, uint64_t bs, uint64_t def)
return res;
}
// Turn a JSON option value into a list of strings.
// A JSON array yields one string per item (each converted with as_string()).
// Any other value is converted with as_string() and split with explode() using <sep>.
static std::vector<std::string> explode_json(const std::string & sep, json11::Json opt)
{
    if (!opt.is_array())
    {
        // Scalar form, e.g. a comma-separated string
        return explode(sep, opt.as_string(), true);
    }
    std::vector<std::string> parts;
    const auto & items = opt.array_items();
    for (auto it = items.begin(); it != items.end(); it++)
    {
        parts.push_back(it->as_string());
    }
    return parts;
}
std::function<bool(cli_result_t &)> cli_tool_t::start_dd(json11::Json cfg)
{
auto dd = new cli_dd_t();
@ -923,7 +979,7 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_dd(json11::Json cfg)
dd->oseek = parse_blocks(cfg["oseek"], dd->blocksize, 0);
if (!dd->oseek)
dd->oseek = parse_blocks(cfg["seek"], dd->blocksize, 0);
dd->iseek = parse_blocks(cfg["oseek"], dd->blocksize, 0);
dd->iseek = parse_blocks(cfg["iseek"], dd->blocksize, 0);
if (!dd->iseek)
dd->iseek = parse_blocks(cfg["skip"], dd->blocksize, 0);
dd->iodepth = cfg["iodepth"].uint64_value();
@ -935,25 +991,9 @@ std::function<bool(cli_result_t &)> cli_tool_t::start_dd(json11::Json cfg)
progress = true;
dd->iinfo.detect_size = cfg["size"].is_null();
dd->oinfo.out_size = parse_size(cfg["size"].as_string());
std::vector<std::string> conv = explode(",", cfg["conv"].string_value(), true);
if (std::find(conv.begin(), conv.end(), "nofsync") != conv.end())
dd->oinfo.end_fsync = false;
if (std::find(conv.begin(), conv.end(), "trunc") != conv.end())
dd->oinfo.out_trunc = true;
if (std::find(conv.begin(), conv.end(), "nocreat") != conv.end())
dd->oinfo.out_create = false;
if (std::find(conv.begin(), conv.end(), "noerror") != conv.end())
dd->ignore_errors = true;
if (std::find(conv.begin(), conv.end(), "nosparse") != conv.end())
dd->write_zero = true;
conv = explode(",", cfg["iflag"].string_value(), true);
if (std::find(conv.begin(), conv.end(), "direct") != conv.end())
dd->iinfo.in_direct = true;
conv = explode(",", cfg["oflag"].string_value(), true);
if (std::find(conv.begin(), conv.end(), "direct") != conv.end())
dd->oinfo.out_direct = true;
if (std::find(conv.begin(), conv.end(), "append") != conv.end())
dd->oinfo.out_append = true;
dd->conv = explode_json(",", cfg["conv"]);
dd->iflag = explode_json(",", cfg["iflag"]);
dd->oflag = explode_json(",", cfg["oflag"]);
return [dd](cli_result_t & result)
{
dd->loop();

View File

@ -5,6 +5,7 @@
#include "cluster_client.h"
#include "pg_states.h"
#include "str_util.h"
#include "json_util.h"
struct cli_fix_t
{

View File

@ -21,6 +21,3 @@ template<class T> void remove_duplicates(std::vector<T> & ret)
}
ret.resize(j+1);
}
// from http_client.cpp...
bool json_is_false(const json11::Json & val);

View File

@ -4,6 +4,7 @@
#include "cli.h"
#include "cluster_client.h"
#include "str_util.h"
#include "json_util.h"
#include "http_client.h"
// Reweight OSD, change tags or set noout flag

View File

@ -156,6 +156,8 @@ resume_1:
for (auto & jtag: osd_cfg["tags"].array_items())
osd.tags.push_back(jtag.string_value());
}
else if (osd_cfg["tags"].is_string())
osd.tags.push_back(osd_cfg["tags"].string_value());
osd.noout = osd_cfg["noout"].bool_value();
}
auto np_it = node_placement.find(std::to_string(osd.num));

View File

@ -4,6 +4,7 @@
#include "cli.h"
#include "cluster_client.h"
#include "str_util.h"
#include "json_util.h"
#include "pg_states.h"
#include "http_client.h"

View File

@ -5,8 +5,9 @@ project(vitastor)
# vitastor-disk
add_executable(vitastor-disk
disk_tool.cpp disk_simple_offsets.cpp
disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp disk_tool_udev.cpp disk_tool_utils.cpp disk_tool_upgrade.cpp
../util/crc32c.c ../util/str_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp ../blockstore/blockstore_disk.cpp
disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp
disk_tool_resize_auto.cpp disk_tool_udev.cpp disk_tool_utils.cpp disk_tool_upgrade.cpp
../util/crc32c.c ../util/str_util.cpp ../util/json_util.cpp ../../json11/json11.cpp ../util/rw_blocking.cpp ../util/allocator.cpp ../util/ringloop.cpp ../blockstore/blockstore_disk.cpp
)
target_link_libraries(vitastor-disk
tcmalloc_minimal

View File

@ -27,12 +27,16 @@ static const char *help_text =
" --osd_per_disk <N>\n"
" Create <N> OSDs on each disk (default 1)\n"
" --hybrid\n"
" Prepare hybrid (HDD+SSD) OSDs using provided devices. SSDs will be used for\n"
" journals and metadata, HDDs will be used for data. Partitions for journals and\n"
" metadata will be created automatically. Whether disks are SSD or HDD is decided\n"
" by the `/sys/block/.../queue/rotational` flag. In hybrid mode, default object\n"
" size is 1 MB instead of 128 KB, default journal size is 1 GB instead of 32 MB,\n"
" and throttle_small_writes is enabled by default.\n"
" Prepare hybrid (HDD+SSD, NVMe+SATA or etc) OSDs using provided devices. By default,\n"
" any passed SSDs will be used for journals and metadata, HDDs will be used for data,\n"
" but you can override this behaviour with --fast-devices option. Journal and metadata\n"
" partitions will be created automatically. In the default mode, SSD and HDD disks\n"
" are distinguished by the `/sys/block/.../queue/rotational` flag. When HDDs are used\n"
" for data in hybrid mode, default block_size is 1 MB instead of 128 KB, default journal\n"
" size is 1 GB instead of 32 MB, and throttle_small_writes is enabled by default.\n"
" --fast-devices /dev/nvmeX,/dev/nvmeY\n"
" In --hybrid mode, use these devices for journal and metadata instead of auto-detecting\n"
" and extracting them from the main [devices...] list.\n"
" --disable_data_fsync auto\n"
" Disable data device cache and fsync (1/yes/true = on, default auto)\n"
" --disable_meta_fsync auto\n"
@ -92,8 +96,22 @@ static const char *help_text =
" \n"
" Requires the `sfdisk` utility.\n"
"\n"
"vitastor-disk resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]\n"
" Resize data area and/or rewrite/move journal and metadata\n"
"vitastor-disk resize <osd_num>|<osd_device> [OPTIONS]\n"
" Resize data area and/or move journal and metadata:\n"
" --move-journal TARGET move journal to TARGET\n"
" --move-meta TARGET move metadata to TARGET\n"
" --journal-size NEW_SIZE resize journal to NEW_SIZE\n"
" --data-size NEW_SIZE resize data device to NEW_SIZE\n"
" --dry-run only show new layout, do not apply it\n"
" \n"
" NEW_SIZE may include k/m/g/t suffixes.\n"
" TARGET may be one of:\n"
" <partition> move journal/metadata to an existing GPT partition\n"
" <raw_device> create a GPT partition on <raw_device> and move journal/metadata to it\n"
" \"\" (empty string) move journal/metadata back to the data device\n"
"\n"
"vitastor-disk raw-resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]\n"
" Resize data area and/or rewrite/move journal and metadata (manual format).\n"
" ALL_OSD_PARAMETERS must include all (at least all disk-related)\n"
" parameters from OSD command line (i.e. from systemd unit or superblock).\n"
" NEW_LAYOUT may include new disk layout parameters:\n"
@ -143,8 +161,10 @@ static const char *help_text =
" For now, this only checks that device cache is in write-through mode if fsync is disabled.\n"
" Intended for use from startup scripts (i.e. from systemd units).\n"
"\n"
"vitastor-disk dump-journal [OPTIONS] <osd_device>\n"
"vitastor-disk dump-journal [OPTIONS] <journal_file> <journal_block_size> <offset> <size>\n"
" Dump journal in human-readable or JSON (if --json is specified) format.\n"
" Dump journal in text or JSON (if --json is specified) format.\n"
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
" Options:\n"
" --all Scan the whole journal area for entries and dump them, even outdated ones\n"
" --json Dump journal in JSON format\n"
@ -152,16 +172,21 @@ static const char *help_text =
" --format data Same as \"entries\", but also include small write data\n"
" --format blocks Dump as an array of journal blocks each containing array of entries\n"
"\n"
"vitastor-disk write-journal <osd_device>\n"
"vitastor-disk write-journal <journal_file> <journal_block_size> <bitmap_size> <offset> <size>\n"
" Write journal from JSON taken from standard input in the same format as produced by\n"
" `dump-journal --json --format data`.\n"
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
"\n"
"vitastor-disk dump-meta <osd_device>\n"
"vitastor-disk dump-meta <meta_file> <meta_block_size> <offset> <size>\n"
" Dump metadata in JSON format.\n"
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
"\n"
"vitastor-disk write-meta <osd_device>\n"
"vitastor-disk write-meta <meta_file> <offset> <size>\n"
" Write metadata from JSON taken from standard input in the same format as produced by\n"
" `dump-meta`. Intended for debugging.\n"
" Write metadata from JSON taken from standard input in the same format as produced by `dump-meta`.\n"
" You can specify any OSD device (data, metadata or journal), or the layout manually.\n"
"\n"
"vitastor-disk simple-offsets <device>\n"
" Calculate offsets for old simple&stupid (no superblock) OSD deployment. Options:\n"
@ -175,6 +200,7 @@ static const char *help_text =
" --device_size 0 Set device size\n"
" --format text Result format: json, options, env, or text\n"
"\n"
"Default I/O mode for commands involving disk I/O is O_DIRECT. If you don't want it, add --io cached.\n"
"Use vitastor-disk --help <command> for command details or vitastor-disk --help --all for all details.\n"
;
@ -199,6 +225,10 @@ int main(int argc, char *argv[])
cmd.push_back((char*)"dump-journal");
aliased = true;
}
else if (!strcmp(exe_name, "vitastor-disk-test"))
{
self.test_mode = true;
}
for (int i = 1; i < argc; i++)
{
if (!strcmp(argv[i], "--all"))
@ -229,6 +259,10 @@ int main(int argc, char *argv[])
{
self.options["force"] = "1";
}
else if (!strcmp(argv[i], "--dry-run") || !strcmp(argv[i], "--dry_run"))
{
self.options["dry_run"] = "1";
}
else if (!strcmp(argv[i], "--allow-data-loss"))
{
self.options["allow_data_loss"] = "1";
@ -236,7 +270,7 @@ int main(int argc, char *argv[])
else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
{
char *key = argv[i]+2;
self.options[key] = argv[++i];
self.options[str_replace(key, "-", "_")] = argv[++i];
}
else
{
@ -249,29 +283,50 @@ int main(int argc, char *argv[])
}
if (!strcmp(cmd[0], "dump-journal"))
{
if (cmd.size() < 5)
if (cmd.size() != 2 && cmd.size() < 5)
{
print_help(help_text, aliased ? "vitastor-dump-journal" : "vitastor-disk", cmd[0], false);
return 1;
}
self.dsk.journal_device = cmd[1];
self.dsk.journal_block_size = strtoul(cmd[2], NULL, 10);
self.dsk.journal_offset = strtoull(cmd[3], NULL, 10);
self.dsk.journal_len = strtoull(cmd[4], NULL, 10);
if (cmd.size() > 2)
{
self.dsk.journal_block_size = strtoul(cmd[2], NULL, 10);
self.dsk.journal_offset = strtoull(cmd[3], NULL, 10);
self.dsk.journal_len = strtoull(cmd[4], NULL, 10);
}
else
{
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.dsk.journal_device))
return 1;
}
return self.dump_journal();
}
else if (!strcmp(cmd[0], "write-journal"))
{
if (cmd.size() < 6)
if (cmd.size() != 2 && cmd.size() < 6)
{
print_help(help_text, "vitastor-disk", cmd[0], false);
return 1;
}
self.new_journal_device = cmd[1];
self.dsk.journal_block_size = strtoul(cmd[2], NULL, 10);
self.dsk.clean_entry_bitmap_size = strtoul(cmd[3], NULL, 10);
self.new_journal_offset = strtoull(cmd[4], NULL, 10);
self.new_journal_len = strtoull(cmd[5], NULL, 10);
if (cmd.size() > 2)
{
self.dsk.journal_block_size = strtoul(cmd[2], NULL, 10);
self.dsk.clean_entry_bitmap_size = strtoul(cmd[3], NULL, 10);
self.new_journal_offset = strtoull(cmd[4], NULL, 10);
self.new_journal_len = strtoull(cmd[5], NULL, 10);
}
else
{
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.new_journal_device))
return 1;
self.new_journal_device = self.dsk.journal_device;
self.new_journal_offset = self.dsk.journal_offset;
self.new_journal_len = self.dsk.journal_len;
}
std::string json_err;
json11::Json entries = json11::Json::parse(read_all_fd(0), json_err);
if (json_err != "")
@ -296,27 +351,48 @@ int main(int argc, char *argv[])
}
else if (!strcmp(cmd[0], "dump-meta"))
{
if (cmd.size() < 5)
if (cmd.size() != 2 && cmd.size() < 5)
{
print_help(help_text, "vitastor-disk", cmd[0], false);
return 1;
}
self.dsk.meta_device = cmd[1];
self.dsk.meta_block_size = strtoul(cmd[2], NULL, 10);
self.dsk.meta_offset = strtoull(cmd[3], NULL, 10);
self.dsk.meta_len = strtoull(cmd[4], NULL, 10);
if (cmd.size() > 2)
{
self.dsk.meta_block_size = strtoul(cmd[2], NULL, 10);
self.dsk.meta_offset = strtoull(cmd[3], NULL, 10);
self.dsk.meta_len = strtoull(cmd[4], NULL, 10);
}
else
{
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.dsk.meta_device))
return 1;
}
return self.dump_meta();
}
else if (!strcmp(cmd[0], "write-meta"))
{
if (cmd.size() < 4)
if (cmd.size() != 2 && cmd.size() < 4)
{
print_help(help_text, "vitastor-disk", cmd[0], false);
return 1;
}
self.new_meta_device = cmd[1];
self.new_meta_offset = strtoull(cmd[2], NULL, 10);
self.new_meta_len = strtoull(cmd[3], NULL, 10);
if (cmd.size() > 2)
{
self.new_meta_offset = strtoull(cmd[2], NULL, 10);
self.new_meta_len = strtoull(cmd[3], NULL, 10);
}
else
{
// First argument is an OSD device - take metadata layout parameters from it
if (self.dump_load_check_superblock(self.new_meta_device))
return 1;
self.new_meta_device = self.dsk.meta_device;
self.new_meta_offset = self.dsk.meta_offset;
self.new_meta_len = self.dsk.meta_len;
}
std::string json_err;
json11::Json meta = json11::Json::parse(read_all_fd(0), json_err);
if (json_err != "")
@ -328,7 +404,16 @@ int main(int argc, char *argv[])
}
else if (!strcmp(cmd[0], "resize"))
{
return self.resize_data();
if (cmd.size() != 2)
{
fprintf(stderr, "Exactly 1 OSD number or OSD device path argument is required\n");
return 1;
}
return self.resize_data(cmd[1]);
}
else if (!strcmp(cmd[0], "raw-resize"))
{
return self.raw_resize();
}
else if (!strcmp(cmd[0], "simple-offsets"))
{

View File

@ -22,6 +22,7 @@
#define VITASTOR_DISK_MAX_SB_SIZE 128*1024
#define VITASTOR_PART_TYPE "e7009fac-a5a1-4d72-af72-53de13059903"
#define DEFAULT_HYBRID_JOURNAL "1G"
#define DEFAULT_HYBRID_SSD_JOURNAL "128M"
struct resizer_data_moving_t;
@ -40,6 +41,7 @@ struct disk_tool_t
/**** Parameters ****/
std::map<std::string, std::string> options;
bool test_mode = false;
bool all, json, now;
bool dump_with_blocks, dump_with_data;
blockstore_disk_t dsk;
@ -93,10 +95,16 @@ struct disk_tool_t
void dump_meta_header(blockstore_meta_header_v2_t *hdr);
void dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap);
int dump_load_check_superblock(const std::string & device);
int write_json_journal(json11::Json entries);
int write_json_meta(json11::Json meta);
int resize_data();
int resize_data(std::string device);
int resize_parse_move_journal(std::map<std::string, std::string> & move_options, bool dry_run);
int resize_parse_move_meta(std::map<std::string, std::string> & move_options, bool dry_run);
int raw_resize();
int resize_parse_params();
void resize_init(blockstore_meta_header_v2_t *hdr);
int resize_remap_blocks();
@ -114,11 +122,14 @@ struct disk_tool_t
int systemd_start_stop_osds(const std::vector<std::string> & cmd, const std::vector<std::string> & devices);
int pre_exec_osd(std::string device);
int purge_devices(const std::vector<std::string> & devices);
int clear_osd_superblock(const std::string & dev);
json11::Json read_osd_superblock(std::string device, bool expect_exist = true, bool ignore_nonref = false);
uint32_t write_osd_superblock(std::string device, json11::Json params);
int prepare_one(std::map<std::string, std::string> options, int is_hdd = -1);
int check_existing_partition(std::string & dev_by_uuid);
int fix_partition_type(std::string & dev_by_uuid);
int prepare(std::vector<std::string> devices);
std::vector<vitastor_dev_info_t> collect_devices(const std::vector<std::string> & devices);
json11::Json add_partitions(vitastor_dev_info_t & devinfo, std::vector<std::string> sizes);
@ -133,13 +144,13 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output);
uint64_t sscanf_json(const char *fmt, const json11::Json & str);
void fromhexstr(const std::string & from, int bytes, uint8_t *to);
int disable_cache(std::string dev);
uint64_t get_device_size(const std::string & dev, bool should_exist = false);
std::string get_parent_device(std::string dev);
bool json_is_true(const json11::Json & val);
int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std::string *out, std::string *err);
int write_zero(int fd, uint64_t offset, uint64_t size);
json11::Json read_parttable(std::string dev);
uint64_t dev_size_from_parttable(json11::Json pt);
uint64_t free_from_parttable(json11::Json pt);
int fix_partition_type(std::string dev_by_uuid);
int fix_partition_type_uuid(std::string & dev_by_uuid, const std::string & type_uuid);
std::string csum_type_str(uint32_t data_csum_type);
uint32_t csum_type_from_str(std::string data_csum_type);

View File

@ -18,7 +18,7 @@ int disk_tool_t::dump_journal()
printf("[\n");
if (all)
{
dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY);
dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.journal_fd < 0)
{
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
@ -121,7 +121,7 @@ int disk_tool_t::dump_journal()
int disk_tool_t::process_journal(std::function<int(void*)> block_fn)
{
dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY);
dsk.journal_fd = open(dsk.journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.journal_fd < 0)
{
fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
@ -517,6 +517,12 @@ int disk_tool_t::write_json_journal(json11::Json entries)
uint32_t data_csum_size = !dsk.data_csum_type ? 0 : ne->small_write.len/dsk.csum_block_size*(dsk.data_csum_type & 0xFF);
fromhexstr(rec["bitmap"].string_value(), dsk.clean_entry_bitmap_size, ((uint8_t*)ne) + sizeof(journal_entry_small_write) + data_csum_size);
fromhexstr(rec["data"].string_value(), ne->small_write.len, new_journal_data);
if (ne->small_write.len > 0 && !rec["data"].is_string())
{
fprintf(stderr, "Error: entry data is missing, please generate the dump with --json --format data\n");
free(new_journal_buf);
return 1;
}
if (dsk.data_csum_type)
fromhexstr(rec["block_csums"].string_value(), data_csum_size, ((uint8_t*)ne) + sizeof(journal_entry_small_write));
if (rec["data"].is_string())

View File

@ -4,6 +4,7 @@
#include "disk_tool.h"
#include "rw_blocking.h"
#include "osd_id.h"
#include "json_util.h"
int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)> hdr_fn,
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn)
@ -13,7 +14,7 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT);
return 1;
}
dsk.meta_fd = open(dsk.meta_device.c_str(), O_DIRECT|O_RDONLY);
dsk.meta_fd = open(dsk.meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDONLY);
if (dsk.meta_fd < 0)
{
fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno));
@ -149,6 +150,31 @@ int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v2_t *)>
return 0;
}
int disk_tool_t::dump_load_check_superblock(const std::string & device)
{
json11::Json sb = read_osd_superblock(device, true, false);
if (sb.is_null())
return 1;
try
{
auto cfg = json_to_string_map(sb["params"].object_items());
dsk.parse_config(cfg);
dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths(true);
}
catch (std::exception & e)
{
dsk.close_all();
fprintf(stderr, "%s\n", e.what());
return 1;
}
dsk.close_all();
return 0;
}
int disk_tool_t::dump_meta()
{
int r = process_meta(
@ -176,7 +202,7 @@ void disk_tool_t::dump_meta_header(blockstore_meta_header_v2_t *hdr)
{
printf(
"{\"version\":\"0.9\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,"
"\"data_csum_type\":%s,\"csum_block_size\":%u,\"entries\":[\n",
"\"data_csum_type\":\"%s\",\"csum_block_size\":%u,\"entries\":[\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
csum_type_str(hdr->data_csum_type).c_str(), hdr->csum_block_size
);
@ -243,12 +269,16 @@ int disk_tool_t::write_json_meta(json11::Json meta)
? meta["data_block_size"].uint64_value() : 131072;
new_hdr->bitmap_granularity = meta["bitmap_granularity"].uint64_value()
? meta["bitmap_granularity"].uint64_value() : 4096;
new_hdr->data_csum_type = meta["data_csum_type"].is_number()
? meta["data_csum_type"].uint64_value()
: (meta["data_csum_type"].string_value() == "crc32c"
? BLOCKSTORE_CSUM_CRC32C
: BLOCKSTORE_CSUM_NONE);
new_hdr->csum_block_size = meta["csum_block_size"].uint64_value();
if (new_hdr->version >= BLOCKSTORE_META_FORMAT_V2)
{
new_hdr->data_csum_type = meta["data_csum_type"].is_number()
? meta["data_csum_type"].uint64_value()
: (meta["data_csum_type"].string_value() == "crc32c"
? BLOCKSTORE_CSUM_CRC32C
: BLOCKSTORE_CSUM_NONE);
new_hdr->csum_block_size = meta["csum_block_size"].uint64_value();
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(*new_hdr));
}
uint32_t new_clean_entry_header_size = (new_hdr->version == BLOCKSTORE_META_FORMAT_V1
? sizeof(clean_disk_entry) : sizeof(clean_disk_entry) + 4 /*entry_csum*/);
new_clean_entry_bitmap_size = (new_hdr->data_block_size / new_hdr->bitmap_granularity + 7) / 8;
@ -285,8 +315,7 @@ int disk_tool_t::write_json_meta(json11::Json meta)
fromhexstr(e["data_csum"].string_value(), new_data_csum_size,
((uint8_t*)new_entry) + sizeof(clean_disk_entry) + 2*new_clean_entry_bitmap_size);
}
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + sizeof(clean_disk_entry) +
2*new_clean_entry_bitmap_size + new_data_csum_size);
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
}
}

View File

@ -3,6 +3,7 @@
#include "disk_tool.h"
#include "str_util.h"
#include "json_util.h"
#include "osd_id.h"
int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd)
@ -28,18 +29,12 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
};
if (options.find("force") == options.end())
{
std::vector<std::string> all_devs = { options["data_device"], options["meta_device"], options["journal_device"] };
for (int i = 0; i < all_devs.size(); i++)
std::string* all_devs[] = { &options["data_device"], &options["meta_device"], &options["journal_device"] };
for (int i = 0; i < 3; i++)
{
const auto & dev = all_devs[i];
auto & dev = *all_devs[i];
if (dev == "")
continue;
if (dev.substr(0, 22) != "/dev/disk/by-partuuid/")
{
// Partitions should be identified by GPT partition UUID
fprintf(stderr, "%s does not start with /dev/disk/by-partuuid/. Partitions should be identified by GPT partition UUIDs\n", dev.c_str());
return 1;
}
std::string real_dev = realpath_str(dev, false);
if (real_dev == "")
return 1;
@ -52,24 +47,9 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
return 1;
}
if (i == 0 && is_hdd == -1)
is_hdd = trim(read_file("/sys/block/"+parent_dev+"/queue/rotational")) == "1";
std::string out;
if (shell_exec({ "wipefs", dev }, "", &out, NULL) != 0 || out != "")
{
fprintf(stderr, "%s contains data, not creating OSD without --force. wipefs shows:\n%s", dev.c_str(), out.c_str());
is_hdd = trim(read_file("/sys/block/"+parent_dev.substr(5)+"/queue/rotational")) == "1";
if (check_existing_partition(dev) != 0)
return 1;
}
json11::Json sb = read_osd_superblock(dev, false);
if (!sb.is_null())
{
fprintf(stderr, "%s already contains Vitastor OSD superblock, not creating OSD without --force\n", dev.c_str());
return 1;
}
if (fix_partition_type(dev) != 0)
{
fprintf(stderr, "%s has incorrect type and we failed to change it to Vitastor type\n", dev.c_str());
return 1;
}
}
}
for (auto dev: std::vector<std::string>{"data", "meta", "journal"})
@ -128,7 +108,11 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
try
{
dsk.parse_config(options);
dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
// Set all offsets to 4096 to calculate metadata size with excess
dsk.journal_offset = 4096;
dsk.meta_offset = 4096;
dsk.data_offset = 4096;
dsk.data_io = dsk.meta_io = dsk.journal_io = (options["io"] == "cached" ? "cached" : "direct");
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
@ -173,7 +157,11 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
return 1;
}
std::string osd_num_str;
if (shell_exec({ "vitastor-cli", "alloc-osd" }, "", &osd_num_str, NULL) != 0)
if (test_mode && options.find("osd_num") != options.end())
{
osd_num_str = options["osd_num"];
}
else if (shell_exec({ "vitastor-cli", "alloc-osd" }, "", &osd_num_str, NULL) != 0)
{
dsk.close_all();
return 1;
@ -187,8 +175,8 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
}
sb["osd_num"] = osd_num;
// Zero out metadata and journal
if (write_zero(dsk.meta_fd, dsk.meta_offset, dsk.meta_len) != 0 ||
write_zero(dsk.journal_fd, dsk.journal_offset, dsk.journal_len) != 0)
if (write_zero(dsk.meta_fd, sb["meta_offset"].uint64_value(), dsk.meta_len) != 0 ||
write_zero(dsk.journal_fd, sb["journal_offset"].uint64_value(), dsk.journal_len) != 0)
{
fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno));
dsk.close_all();
@ -213,52 +201,76 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
if (sep_j)
desc += (sep_m ? " and journal on " : " with journal on ") + realpath_str(options["journal_device"]);
fprintf(stderr, "Initialized OSD %ju on %s\n", osd_num, desc.c_str());
if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0)
if (!test_mode || options.find("no_init") == options.end())
{
fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%ju\n", osd_num);
if (shell_exec({ "systemctl", "enable", "--now", "vitastor-osd@"+std::to_string(osd_num) }, "", NULL, NULL) != 0)
{
fprintf(stderr, "Failed to enable systemd unit vitastor-osd@%ju\n", osd_num);
return 1;
}
}
return 0;
}
int disk_tool_t::check_existing_partition(std::string & dev)
{
std::string out;
if (shell_exec({ "wipefs", dev }, "", &out, NULL) != 0 || out != "")
{
fprintf(stderr, "%s contains data, not creating OSD without --force. wipefs shows:\n%s", dev.c_str(), out.c_str());
return 1;
}
json11::Json sb = read_osd_superblock(dev, false);
if (!sb.is_null())
{
fprintf(stderr, "%s already contains Vitastor OSD superblock, not creating OSD without --force\n", dev.c_str());
return 1;
}
if (fix_partition_type(dev) != 0)
{
fprintf(stderr, "%s has incorrect type and we failed to change it to Vitastor type\n", dev.c_str());
return 1;
}
return 0;
}
// Set the partition type of <dev> through fix_partition_type_uuid().
// The type UUID is VITASTOR_PART_TYPE, except in test mode, where the
// "part_type_uuid" option (if present) replaces it.
// Returns whatever fix_partition_type_uuid() returns.
int disk_tool_t::fix_partition_type(std::string & dev)
{
    auto override_it = options.find("part_type_uuid");
    const bool use_override = test_mode && override_it != options.end();
    std::string type_uuid = use_override ? override_it->second : std::string(VITASTOR_PART_TYPE);
    return fix_partition_type_uuid(dev, type_uuid);
}
std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<std::string> & devices)
{
std::vector<vitastor_dev_info_t> devinfo;
std::set<std::string> seen;
for (auto & dev: devices)
{
if (seen.find(dev) != seen.end())
{
fprintf(stderr, "%s is specified multiple times, ignoring\n", dev.c_str());
continue;
}
// Check if the device is a whole disk
if (dev.substr(0, 5) != "/dev/")
{
fprintf(stderr, "%s does not start with /dev/, ignoring\n", dev.c_str());
continue;
}
struct stat dev_st, sys_st;
if (stat(dev.c_str(), &dev_st) < 0)
struct stat sys_st;
uint64_t dev_size = get_device_size(dev, false);
if (dev_size == UINT64_MAX)
{
if (errno == ENOENT)
{
fprintf(stderr, "%s does not exist, skipping\n", dev.c_str());
continue;
}
fprintf(stderr, "Error checking %s: %s\n", dev.c_str(), strerror(errno));
return {};
}
uint64_t dev_size = dev_st.st_size;
if (S_ISBLK(dev_st.st_mode))
else if (!dev_size)
{
int fd = open(dev.c_str(), O_DIRECT|O_RDWR);
if (fd < 0)
{
fprintf(stderr, "Failed to open %s: %s\n", dev.c_str(), strerror(errno));
return {};
}
if (ioctl(fd, BLKGETSIZE64, &dev_size) < 0)
{
fprintf(stderr, "Failed to get %s size: %s\n", dev.c_str(), strerror(errno));
close(fd);
return {};
}
close(fd);
fprintf(stderr, "%s does not exist, skipping\n", dev.c_str());
continue;
}
if (stat(("/sys/block/"+dev.substr(5)).c_str(), &sys_st) < 0)
{
@ -303,10 +315,6 @@ std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<
.free = !pt.is_null() ? free_from_parttable(pt) : dev_size,
});
}
if (!devinfo.size())
{
fprintf(stderr, "No suitable devices found\n");
}
return devinfo;
}
@ -337,7 +345,7 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
script += "+ "+size+" "+std::string(VITASTOR_PART_TYPE)+"\n";
}
std::string out;
if (shell_exec({ "sfdisk", "--no-reread", "--force", devinfo.path }, script, &out, NULL) != 0)
if (shell_exec({ "sfdisk", "--no-reread", "--no-tell-kernel", "--force", devinfo.path }, script, &out, NULL) != 0)
{
fprintf(stderr, "Failed to add %zu partition(s) with sfdisk\n", sizes.size());
return {};
@ -357,68 +365,61 @@ json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vec
fprintf(stderr, "Failed to add %zu partition(s) with sfdisk: new partitions not found in table\n", sizes.size());
return {};
}
// Check if new nodes exist and run partprobe if not
// Check if new devices exist, run partprobe if not, then wait until they appear
// FIXME: We could use parted instead of sfdisk because partprobe is already a part of parted
int iter = 0, r;
while (true)
{
for (const auto & part: new_parts)
{
struct stat st;
if (stat(part["node"].string_value().c_str(), &st) < 0)
{
if (errno == ENOENT)
{
iter++;
// Run partprobe
std::string out;
if (iter > 1 || (r = shell_exec({ "partprobe", devinfo.path }, "", &out, NULL)) != 0)
{
fprintf(
stderr, iter == 1 && r == 255
? "partprobe utility is required to reread partition table while disk %s is in use\n"
: "partprobe failed to re-read partition table while disk %s is in use\n",
devinfo.path.c_str()
);
return {};
}
break;
}
else
{
fprintf(stderr, "Failed to lstat %s: %s\n", part["node"].string_value().c_str(), strerror(errno));
return {};
}
}
}
break;
}
// Wait until device symlinks in /dev/disk/by-partuuid/ appear
bool exists = false;
iter = 0;
while (!exists && iter < 300) // max 30 sec
const int max_iter = 300; // max 30 sec
int iter = 0;
int r = 0;
while (!exists && iter < max_iter)
{
exists = true;
for (const auto & part: new_parts)
{
std::string link_path = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
struct stat st;
if (lstat(link_path.c_str(), &st) < 0)
if (stat(part["node"].string_value().c_str(), &st) < 0 ||
lstat(link_path.c_str(), &st) < 0)
{
if (errno == ENOENT)
{
exists = false;
if (iter == 4)
{
// Print message after 400ms
fprintf(stderr, "Waiting for %s to appear for up to %d sec...\n", link_path.c_str(), max_iter/10);
}
}
else
{
fprintf(stderr, "Failed to lstat %s: %s\n", link_path.c_str(), strerror(errno));
fprintf(stderr, "Failed to stat %s or lstat %s: %s\n", part["node"].string_value().c_str(),
link_path.c_str(), strerror(errno));
return {};
}
}
}
if (!exists)
if (exists)
{
struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; // 100ms
iter += (nanosleep(&ts, NULL) == 0);
break;
}
if (!exists && iter == 0)
{
// Run partprobe
std::string out;
r = shell_exec({ "partprobe", devinfo.path }, "", &out, NULL);
if (r != 0)
{
fprintf(
stderr, r == 255
? "partprobe utility is required to reread partition table while disk %s is in use\n"
: "partprobe failed to re-read partition table while disk %s is in use\n",
devinfo.path.c_str()
);
return {};
}
}
struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; // 100ms
iter += (nanosleep(&ts, NULL) == 0 || !iter);
}
devinfo.pt = newpt;
devinfo.osd_part_count += sizes.size();
@ -501,7 +502,10 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
{
blockstore_disk_t dsk;
dsk.parse_config(options);
dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.journal_offset = 4096;
dsk.meta_offset = 4096;
dsk.data_offset = 4096;
dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
@ -511,6 +515,7 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
}
catch (std::exception & e)
{
dsk.close_all();
fprintf(stderr, "%s\n", e.what());
return 1;
}
@ -565,9 +570,12 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
{
if (options.find("data_device") != options.end() && options["data_device"] != "")
{
if (options.find("hybrid") != options.end() || options.find("osd_per_disk") != options.end() || devices.size())
if (options.find("hybrid") != options.end() ||
options.find("fast_devices") != options.end() ||
options.find("osd_per_disk") != options.end() ||
devices.size())
{
fprintf(stderr, "Device list (positional arguments) and --hybrid are incompatible with --data_device\n");
fprintf(stderr, "Device list (positional arguments), --osd_per_disk, --hybrid and --fast-devices are incompatible with --data_device\n");
return 1;
}
return prepare_one(options, options.find("hdd") != options.end() ? 1 : 0);
@ -584,8 +592,10 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
auto devinfo = collect_devices(devices);
if (!devinfo.size())
{
fprintf(stderr, "No suitable devices found\n");
return 1;
}
bool explicit_fast = options.find("fast_devices") != options.end();
uint64_t osd_per_disk = stoull_full(options["osd_per_disk"]);
if (!osd_per_disk)
osd_per_disk = 1;
@ -604,21 +614,55 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
if (options.find("disable_meta_fsync") == options.end())
options["disable_meta_fsync"] = "auto";
options["disable_journal_fsync"] = options["disable_meta_fsync"];
for (auto & dev: devinfo)
if (!dev.is_hdd)
ssds.push_back(dev);
if (!ssds.size())
if (explicit_fast)
{
fprintf(stderr, "No SSDs found\n");
return 1;
auto fast = explode(",", options["fast_devices"], true);
ssds = collect_devices(fast);
if (!ssds.size())
{
fprintf(stderr, "No fast devices found\n");
return 1;
}
if (options["journal_size"] == "")
{
auto auto_journal_size = DEFAULT_HYBRID_SSD_JOURNAL;
for (auto & dev: devinfo)
{
if (dev.is_hdd)
{
auto_journal_size = DEFAULT_HYBRID_JOURNAL;
break;
}
}
options["journal_size"] = auto_journal_size;
}
}
else if (ssds.size() == devinfo.size())
else
{
fprintf(stderr, "No HDDs found\n");
return 1;
std::vector<vitastor_dev_info_t> hdds;
for (auto & dev: devinfo)
{
if (!dev.is_hdd)
ssds.push_back(dev);
else
hdds.push_back(dev);
}
if (!ssds.size())
{
fprintf(stderr, "No SSDs found\n");
return 1;
}
if (!hdds.size())
{
fprintf(stderr, "No HDDs found\n");
return 1;
}
devinfo = hdds;
if (options["journal_size"] == "")
{
options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
}
}
if (options["journal_size"] == "")
options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
}
else
{
@ -628,31 +672,28 @@ int disk_tool_t::prepare(std::vector<std::string> devices)
auto journal_size = options["journal_size"];
for (auto & dev: devinfo)
{
if (!hybrid || dev.is_hdd)
// Select new partitions and create an OSD on each of them
for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
{
// Select new partitions and create an OSD on each of them
for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
options["force"] = true;
options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid);
if (hybrid)
{
options["force"] = true;
options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid);
if (hybrid)
// Select/create journal and metadata partitions
int r = get_meta_partition(ssds, options);
if (r != 0)
{
// Select/create journal and metadata partitions
int r = get_meta_partition(ssds, options);
if (r != 0)
{
return 1;
}
options.erase("journal_size");
}
// Treat all disks as SSDs if not in the hybrid mode
prepare_one(options, dev.is_hdd ? 1 : 0);
if (hybrid)
{
options["journal_size"] = journal_size;
options.erase("journal_device");
options.erase("meta_device");
return 1;
}
options.erase("journal_size");
}
// Treat all disks as SSDs if not in the hybrid mode
prepare_one(options, dev.is_hdd ? 1 : 0);
if (hybrid)
{
options["journal_size"] = journal_size;
options.erase("journal_device");
options.erase("meta_device");
}
}
}

View File

@ -18,7 +18,7 @@ struct resizer_data_moving_t
uint64_t old_loc, new_loc;
};
int disk_tool_t::resize_data()
int disk_tool_t::raw_resize()
{
int r;
// Parse parameters
@ -91,7 +91,7 @@ int disk_tool_t::resize_parse_params()
try
{
dsk.parse_config(options);
dsk.data_io = dsk.meta_io = dsk.journal_io = "direct";
dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
@ -114,7 +114,10 @@ int disk_tool_t::resize_parse_params()
new_data_offset = options.find("new_data_offset") != options.end()
? parse_size(options["new_data_offset"]) : dsk.data_offset;
new_data_len = options.find("new_data_len") != options.end()
? parse_size(options["new_data_len"]) : dsk.data_len;
? parse_size(options["new_data_len"])
: (options.find("new_data_offset") != options.end()
? dsk.data_device_size-new_data_offset
: dsk.data_len);
new_meta_offset = options.find("new_meta_offset") != options.end()
? parse_size(options["new_meta_offset"]) : dsk.meta_offset;
new_meta_len = options.find("new_meta_len") != options.end()
@ -123,6 +126,14 @@ int disk_tool_t::resize_parse_params()
? parse_size(options["new_journal_offset"]) : dsk.journal_offset;
new_journal_len = options.find("new_journal_len") != options.end()
? parse_size(options["new_journal_len"]) : dsk.journal_len;
if (new_data_len+new_data_offset > dsk.data_device_size)
new_data_len = dsk.data_device_size-new_data_offset;
if (new_meta_device == dsk.data_device && new_data_offset < new_meta_offset &&
new_data_len+new_data_offset > new_meta_offset)
new_data_len = new_meta_offset-new_data_offset;
if (new_journal_device == dsk.data_device && new_data_offset < new_journal_offset &&
new_data_len+new_data_offset > new_journal_offset)
new_data_len = new_journal_offset-new_data_offset;
if (new_meta_device == dsk.meta_device &&
new_journal_device == dsk.journal_device &&
new_data_offset == dsk.data_offset &&
@ -159,10 +170,10 @@ void disk_tool_t::resize_init(blockstore_meta_header_v2_t *hdr)
dsk.data_csum_type = hdr->data_csum_type;
dsk.csum_block_size = hdr->csum_block_size;
}
if (((new_data_len-dsk.data_len) % dsk.data_block_size) ||
((new_data_offset-dsk.data_offset) % dsk.data_block_size))
if (((new_data_offset-dsk.data_offset) % dsk.data_block_size))
{
fprintf(stderr, "Data alignment mismatch\n");
fprintf(stderr, "Data alignment mismatch: old data offset is 0x%jx, new is 0x%jx, but alignment on %x should be equal\n",
dsk.data_offset, new_data_offset, dsk.data_block_size);
exit(1);
}
data_idx_diff = ((int64_t)(dsk.data_offset-new_data_offset)) / dsk.data_block_size;
@ -220,10 +231,10 @@ int disk_tool_t::resize_remap_blocks()
}
for (uint64_t i = 0; i < free_last; i++)
{
if (data_alloc->get(total_blocks-i))
data_remap[total_blocks-i] = 0;
if (data_alloc->get(total_blocks-i-1))
data_remap[total_blocks-i-1] = 0;
else
data_alloc->set(total_blocks-i, true);
data_alloc->set(total_blocks-i-1, true);
}
for (auto & p: data_remap)
{
@ -246,7 +257,7 @@ int disk_tool_t::resize_copy_data()
iodepth = 32;
}
ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
dsk.data_fd = open(dsk.data_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
if (dsk.data_fd < 0)
{
fprintf(stderr, "Failed to open data device %s: %s\n", dsk.data_device.c_str(), strerror(errno));
@ -441,7 +452,7 @@ int disk_tool_t::resize_rewrite_journal()
int disk_tool_t::resize_write_new_journal()
{
new_journal_fd = open(new_journal_device.c_str(), O_DIRECT|O_RDWR);
new_journal_fd = open(new_journal_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
if (new_journal_fd < 0)
{
fprintf(stderr, "Failed to open new journal device %s: %s\n", new_journal_device.c_str(), strerror(errno));
@ -467,12 +478,13 @@ int disk_tool_t::resize_rewrite_meta()
blockstore_meta_header_v2_t *new_hdr = (blockstore_meta_header_v2_t *)new_meta_buf;
new_hdr->zero = 0;
new_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
new_hdr->version = BLOCKSTORE_META_FORMAT_V1;
new_hdr->version = BLOCKSTORE_META_FORMAT_V2;
new_hdr->meta_block_size = dsk.meta_block_size;
new_hdr->data_block_size = dsk.data_block_size;
new_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096;
new_hdr->data_csum_type = dsk.data_csum_type;
new_hdr->csum_block_size = dsk.csum_block_size;
new_hdr->header_csum = crc32c(0, new_hdr, sizeof(*new_hdr));
},
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
@ -481,7 +493,7 @@ int disk_tool_t::resize_rewrite_meta()
block_num = remap_it->second;
if (block_num < free_first || block_num >= total_blocks-free_last)
{
fprintf(stderr, "BUG: remapped block not in range\n");
fprintf(stderr, "BUG: remapped block %ju not in range %ju..%ju\n", block_num, free_first, total_blocks-free_last);
exit(1);
}
block_num += data_idx_diff;
@ -494,6 +506,8 @@ int disk_tool_t::resize_rewrite_meta()
memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size + new_data_csum_size);
else
memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size);
uint32_t *new_entry_csum = (uint32_t*)(((uint8_t*)new_entry) + new_clean_entry_size - 4);
*new_entry_csum = crc32c(0, new_entry, new_clean_entry_size - 4);
}
);
if (r != 0)
@ -507,7 +521,7 @@ int disk_tool_t::resize_rewrite_meta()
int disk_tool_t::resize_write_new_meta()
{
new_meta_fd = open(new_meta_device.c_str(), O_DIRECT|O_RDWR);
new_meta_fd = open(new_meta_device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
if (new_meta_fd < 0)
{
fprintf(stderr, "Failed to open new metadata device %s: %s\n", new_meta_device.c_str(), strerror(errno));

View File

@ -0,0 +1,298 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "disk_tool.h"
#include "rw_blocking.h"
#include "str_util.h"
#include "json_util.h"
// High-level OSD resize: move the journal and/or metadata to another device
// and/or change the journal and data area sizes of an existing OSD.
//
// <device> is either a path to the OSD data device or a non-zero OSD number,
// in which case /dev/vitastor/osd<N>-data is used.
//
// Options read from this->options: move_journal, move_meta, journal_size,
// data_size, dry_run (plus whatever the move helpers read, e.g. force).
// The function builds the low-level "new_*" parameters, runs raw_resize()
// with them (skipped in dry-run mode) and then rewrites the OSD superblocks.
//
// Returns 0 on success and 1 on any error.
int disk_tool_t::resize_data(std::string device)
{
    // At least one of the options actually handled below must be present.
    // --move-meta is handled (see resize_parse_move_meta() call), so it has to
    // count as "something to do" on its own.
    if (options.find("move_journal") == options.end() &&
        options.find("move_meta") == options.end() &&
        options.find("journal_size") == options.end() &&
        options.find("data_size") == options.end())
    {
        fprintf(stderr, "None of --move-journal, --move-meta, --journal-size, --data-size options are specified - nothing to do!\n");
        return 1;
    }
    // A bare non-zero number is treated as an OSD number
    if (stoull_full(device))
        device = "/dev/vitastor/osd"+device+"-data";
    json11::Json sb = read_osd_superblock(device, true, false);
    if (sb.is_null())
        return 1;
    auto sb_params = json_to_string_map(sb["params"].object_items());
    try
    {
        dsk.parse_config(sb_params);
        dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
        dsk.open_data();
        dsk.open_meta();
        dsk.open_journal();
        dsk.calc_lengths(true);
    }
    catch (std::exception & e)
    {
        dsk.close_all();
        fprintf(stderr, "%s\n", e.what());
        return 1;
    }
    // Save FD numbers because calc_lengths() relies on them
    int old_journal_fd = dsk.journal_fd, old_meta_fd = dsk.meta_fd, old_data_fd = dsk.data_fd;
    dsk.close_all();
    bool dry_run = options.find("dry_run") != options.end();
    auto old_journal_device = dsk.journal_device;
    auto old_meta_device = dsk.meta_device;
    new_journal_len = dsk.journal_len;
    if (options.find("journal_size") != options.end())
    {
        new_journal_len = parse_size(options["journal_size"]);
        // Changing the journal size is implemented as a "move" to the same place:
        // empty target means "on the data device"
        if (options.find("move_journal") == options.end())
            options["move_journal"] = dsk.journal_device == dsk.data_device ? "" : dsk.journal_device;
    }
    uint64_t new_data_dev_size = 0;
    if (options.find("data_size") != options.end())
    {
        new_data_dev_size = parse_size(options["data_size"]);
        // "max" or anything larger than the device is clamped to the device size
        new_data_dev_size = options["data_size"] == "max" || new_data_dev_size > dsk.data_device_size
            ? dsk.data_device_size : new_data_dev_size;
        dsk.data_device_size = new_data_dev_size;
        dsk.cfg_data_size = 0;
        // Temporarily restore the (already closed) FD numbers for calc_lengths()
        dsk.journal_fd = old_journal_fd;
        dsk.meta_fd = old_meta_fd;
        dsk.data_fd = old_data_fd;
        dsk.calc_lengths(true);
        dsk.journal_fd = -1;
        dsk.meta_fd = -1;
        dsk.data_fd = -1;
    }
    std::map<std::string, std::string> move_options;
    if (options.find("move_journal") != options.end())
    {
        if (resize_parse_move_journal(move_options, dry_run) != 0)
            return 1;
    }
    if (options.find("move_meta") != options.end())
    {
        if (resize_parse_move_meta(move_options, dry_run) != 0)
            return 1;
    }
    auto new_journal_device = move_options.find("new_journal_device") != move_options.end()
        ? move_options["new_journal_device"] : dsk.journal_device;
    auto new_meta_device = move_options.find("new_meta_device") != move_options.end()
        ? move_options["new_meta_device"] : dsk.meta_device;
    // Calculate new data & meta offsets
    new_data_offset = 4096 + (new_journal_device == dsk.data_device ? new_journal_len : 0) +
        (new_meta_device == dsk.data_device ? dsk.meta_len : 0);
    // Shift the new offset so that it differs from the old one by a multiple of the data block size
    new_data_offset += ((dsk.data_offset-new_data_offset) % dsk.data_block_size);
    if (new_data_offset != dsk.data_offset)
        move_options["new_data_offset"] = std::to_string(new_data_offset);
    if (new_data_dev_size != 0)
        move_options["new_data_len"] = std::to_string(new_data_dev_size-new_data_offset);
    new_meta_offset = 4096 + (new_meta_device == new_journal_device ? new_journal_len : 0);
    if (new_meta_offset != dsk.meta_offset)
        move_options["new_meta_offset"] = std::to_string(new_meta_offset);
    // Run resize
    // NOTE(review): from here on <options> holds only superblock parameters plus
    // the new_* values; the original command-line options live in orig_options
    auto orig_options = std::move(options);
    options = sb_params;
    for (auto & kv: move_options)
        options[kv.first] = kv.second;
    if (!json)
    {
        std::string cmd;
        for (auto & kv: move_options)
            cmd += "  "+kv.first+" = "+kv.second+"\n";
        fprintf(stderr, "Running resize:\n%s", cmd.c_str());
    }
    if (!dry_run && raw_resize() != 0)
        return 1;
    // Write new superblocks
    json11::Json::object new_sb_params = sb["params"].object_items();
    if (move_options.find("new_journal_device") != move_options.end())
        new_sb_params["journal_device"] = move_options["new_journal_device"];
    if (move_options.find("new_meta_device") != move_options.end())
        new_sb_params["meta_device"] = move_options["new_meta_device"];
    new_sb_params["data_offset"] = new_data_offset;
    new_sb_params["meta_offset"] = new_meta_offset;
    if (move_options.find("new_data_len") != move_options.end())
        new_sb_params["data_size"] = stoull_full(move_options["new_data_len"]);
    // Every device used by the OSD after the resize gets the new superblock;
    // devices that are no longer used get their superblock cleared
    std::set<std::string> clear_superblocks, write_superblocks;
    write_superblocks.insert(dsk.data_device);
    write_superblocks.insert(new_journal_device);
    write_superblocks.insert(new_meta_device);
    if (write_superblocks.find(old_journal_device) == write_superblocks.end())
        clear_superblocks.insert(old_journal_device);
    if (write_superblocks.find(old_meta_device) == write_superblocks.end())
        clear_superblocks.insert(old_meta_device);
    for (auto & dev: clear_superblocks)
    {
        if (!json)
            fprintf(stderr, "Clearing OSD superblock on %s\n", dev.c_str());
        if (!dry_run && clear_osd_superblock(dev) != 0)
            return 1;
    }
    for (auto & dev: write_superblocks)
    {
        if (!json)
            fprintf(stderr, "Writing new OSD superblock to %s\n", dev.c_str());
        if (!dry_run && !write_osd_superblock(dev, new_sb_params))
            return 1;
    }
    if (json)
    {
        printf("%s\n", json11::Json(json11::Json::object {
            { "new_sb_params", new_sb_params },
        }).dump().c_str());
    }
    return 0;
}
// Translate the --move-journal option into low-level resize parameters.
//
// options["move_journal"] is one of:
//   ""            - put the journal on the data device;
//   a whole disk  - create a new partition of new_journal_len (rounded up to
//                   1 MiB) on it, unless dry_run is set;
//   a partition   - use it as is; its full size (minus 4096 bytes) becomes
//                   the new journal length.
//
// On success fills move_options["new_journal_device" / "new_journal_offset" /
// "new_journal_len"] and updates this->new_journal_len. When the journal is
// already where it is requested to be, prints a note, leaves move_options
// untouched and still returns 0.
//
// Returns 0 on success, 1 on error.
int disk_tool_t::resize_parse_move_journal(std::map<std::string, std::string> & move_options, bool dry_run)
{
    if (options["move_journal"] == "")
    {
        // move back to the data device
        // but first check if not already there :)
        if (dsk.journal_device == dsk.data_device && new_journal_len == dsk.journal_len)
        {
            // already there
            fprintf(stderr, "journal is already on data device and has the same size\n");
            return 0;
        }
        move_options["new_journal_device"] = dsk.data_device;
        move_options["new_journal_offset"] = "4096";
        move_options["new_journal_len"] = std::to_string(new_journal_len);
    }
    else
    {
        std::string real_dev = realpath_str(options["move_journal"], false);
        if (real_dev == "")
            return 1;
        std::string parent_dev = get_parent_device(real_dev);
        if (parent_dev == "")
            return 1;
        if (parent_dev == real_dev)
        {
            // whole disk - create partition
            std::string old_real_dev = realpath_str(dsk.journal_device);
            if (old_real_dev == "")
                return 1;
            if (options.find("force") == options.end() &&
                get_parent_device(old_real_dev) == parent_dev)
            {
                // already there
                fprintf(stderr, "journal is already on a partition of %s, add --force to create a new partition\n", options["move_journal"].c_str());
                return 0;
            }
            // Partitions are requested from sfdisk in whole MiB
            new_journal_len = ((new_journal_len+1024*1024-1)/1024/1024)*1024*1024;
            if (!dry_run)
            {
                auto devinfos = collect_devices({ real_dev });
                if (devinfos.size() == 0)
                    return 1;
                std::vector<std::string> sizes;
                sizes.push_back(std::to_string(new_journal_len/1024/1024)+"MiB");
                auto new_parts = add_partitions(devinfos[0], sizes);
                if (!new_parts.array_items().size())
                    return 1;
                options["move_journal"] = "/dev/disk/by-partuuid/"+strtolower(new_parts[0]["uuid"].string_value());
            }
            else
                options["move_journal"] = "<new journal partition on "+parent_dev+">";
        }
        else
        {
            // already a partition - check that it's a GPT partition with correct type
            if ((options.find("force") == options.end()
                ? check_existing_partition(options["move_journal"])
                : fix_partition_type(options["move_journal"])) != 0)
            {
                return 1;
            }
            new_journal_len = get_device_size(options["move_journal"], true);
            if (new_journal_len == UINT64_MAX)
                return 1;
        }
        // The first 4096 bytes of the target are skipped (journal starts at offset 4096).
        // Refuse targets smaller than that instead of letting the unsigned length wrap around.
        if (new_journal_len < 4096)
        {
            fprintf(stderr, "%s is too small for a journal: %ju bytes\n",
                options["move_journal"].c_str(), (uintmax_t)new_journal_len);
            return 1;
        }
        new_journal_len -= 4096;
        move_options["new_journal_device"] = options["move_journal"];
        move_options["new_journal_offset"] = "4096";
        move_options["new_journal_len"] = std::to_string(new_journal_len);
    }
    return 0;
}
// Translate the --move-meta option into low-level resize parameters.
//
// options["move_meta"] is one of:
//   ""            - put the metadata on the data device;
//   a whole disk  - create a new partition of dsk.meta_len (rounded up to
//                   1 MiB) on it, unless dry_run is set;
//   a partition   - use it as is; its full size (minus 4096 bytes) becomes
//                   the new metadata length.
//
// On success fills move_options["new_meta_device"] and ["new_meta_len"]
// (and ["new_meta_offset"] when moving to a separate device). When the
// metadata is already where it is requested to be, prints a note, leaves
// move_options untouched and still returns 0.
//
// Returns 0 on success, 1 on error.
int disk_tool_t::resize_parse_move_meta(std::map<std::string, std::string> & move_options, bool dry_run)
{
    if (options["move_meta"] == "")
    {
        // move back to the data device
        // but first check if not already there :)
        if (dsk.meta_device == dsk.data_device)
        {
            // already there
            fprintf(stderr, "metadata is already on data device\n");
            return 0;
        }
        move_options["new_meta_device"] = dsk.data_device;
        move_options["new_meta_len"] = std::to_string(dsk.meta_len);
    }
    else
    {
        std::string real_dev = realpath_str(options["move_meta"], false);
        if (real_dev == "")
            return 1;
        std::string parent_dev = get_parent_device(real_dev);
        if (parent_dev == "")
            return 1;
        // Local on purpose: only the string form is passed on through move_options
        uint64_t target_meta_len = 0;
        if (parent_dev == real_dev)
        {
            // whole disk - create partition
            std::string old_real_dev = realpath_str(dsk.meta_device);
            if (old_real_dev == "")
                return 1;
            if (options.find("force") == options.end() &&
                get_parent_device(old_real_dev) == parent_dev)
            {
                // already there
                fprintf(stderr, "metadata is already on a partition of %s\n", options["move_meta"].c_str());
                return 0;
            }
            // Partitions are requested from sfdisk in whole MiB
            target_meta_len = ((dsk.meta_len+1024*1024-1)/1024/1024)*1024*1024;
            if (!dry_run)
            {
                auto devinfos = collect_devices({ real_dev });
                if (devinfos.size() == 0)
                    return 1;
                std::vector<std::string> sizes;
                sizes.push_back(std::to_string(target_meta_len/1024/1024)+"MiB");
                auto new_parts = add_partitions(devinfos[0], sizes);
                if (!new_parts.array_items().size())
                    return 1;
                options["move_meta"] = "/dev/disk/by-partuuid/"+strtolower(new_parts[0]["uuid"].string_value());
            }
            else
                options["move_meta"] = "<new metadata partition on "+parent_dev+">";
        }
        else
        {
            // already a partition - check that it's a GPT partition with correct type
            if ((options.find("force") == options.end()
                ? check_existing_partition(options["move_meta"])
                : fix_partition_type(options["move_meta"])) != 0)
            {
                return 1;
            }
            target_meta_len = get_device_size(options["move_meta"], true);
            if (target_meta_len == UINT64_MAX)
                return 1;
        }
        // The first 4096 bytes of the target are skipped (metadata starts at offset 4096).
        // Refuse targets smaller than that instead of letting the unsigned length wrap around.
        if (target_meta_len < 4096)
        {
            fprintf(stderr, "%s is too small for metadata: %ju bytes\n",
                options["move_meta"].c_str(), (uintmax_t)target_meta_len);
            return 1;
        }
        target_meta_len -= 4096;
        move_options["new_meta_len"] = std::to_string(target_meta_len);
        move_options["new_meta_device"] = options["move_meta"];
        move_options["new_meta_offset"] = "4096";
    }
    return 0;
}

View File

@ -6,6 +6,7 @@
#include "disk_tool.h"
#include "rw_blocking.h"
#include "str_util.h"
#include "json_util.h"
struct __attribute__((__packed__)) vitastor_disk_superblock_t
{
@ -121,7 +122,7 @@ uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json para
sb->size = sb_size;
memcpy(sb->json_data, json_data.c_str(), json_data.size());
sb->crc32c = crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf));
int fd = open(device.c_str(), O_DIRECT|O_RDWR);
int fd = open(device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
if (fd < 0)
{
fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno));
@ -149,7 +150,7 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
json11::Json osd_params;
std::string json_err;
std::string real_device, device_type, real_data, real_meta, real_journal;
int r, fd = open(device.c_str(), O_DIRECT|O_RDWR);
int r, fd = open(device.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
if (fd < 0)
{
fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno));
@ -381,6 +382,34 @@ int disk_tool_t::pre_exec_osd(std::string device)
return 0;
}
// Invalidate the OSD superblock stored in the first 4096 bytes of <dev>:
// the block is read, its first 12 bytes (magic and CRC) are zeroed and the
// block is written back in place.
// The device is opened with O_DIRECT unless the "io" option equals "cached".
// Returns 0 on success; otherwise the result of the first step that failed
// (open, read, seek or write) is returned as is.
int disk_tool_t::clear_osd_superblock(const std::string & dev)
{
    uint8_t *sb_block = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
    // Read-modify-write of the first block on an already opened descriptor
    auto wipe_header = [sb_block](int dev_fd) -> int
    {
        int n = read_blocking(dev_fd, sb_block, 4096);
        if (n != 4096)
            return n;
        // Clear magic and CRC
        memset(sb_block, 0, 12);
        int pos = lseek64(dev_fd, 0, SEEK_SET);
        if (pos != 0)
            return pos;
        n = write_blocking(dev_fd, sb_block, 4096);
        return n == 4096 ? 0 : n;
    };
    int result = open(dev.c_str(), (options["io"] == "cached" ? 0 : O_DIRECT) | O_RDWR);
    if (result >= 0)
    {
        const int dev_fd = result;
        result = wipe_header(dev_fd);
        close(dev_fd);
    }
    free(sb_block);
    return result;
}
int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
{
std::set<uint64_t> osd_numbers;
@ -439,7 +468,6 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
return 1;
}
// Destroy OSD superblocks
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
for (auto & sb: superblocks)
{
for (auto dev_type: std::vector<std::string>{ "data", "meta", "journal" })
@ -447,26 +475,7 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
auto dev = sb["real_"+dev_type+"_device"].string_value();
if (dev != "")
{
int fd = -1, r = open(dev.c_str(), O_DIRECT|O_RDWR);
if (r >= 0)
{
fd = r;
r = read_blocking(fd, buf, 4096);
if (r == 4096)
{
// Clear magic and CRC
memset(buf, 0, 12);
r = lseek64(fd, 0, 0);
if (r == 0)
{
r = write_blocking(fd, buf, 4096);
if (r == 4096)
r = 0;
}
}
}
if (fd >= 0)
close(fd);
int r = clear_osd_superblock(dev);
if (r != 0)
{
fprintf(stderr, "Failed to clear OSD %ju %s device %s superblock: %s\n",
@ -487,7 +496,7 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
fprintf(stderr, "Failed to delete partition %s: failed to find parent device\n", dev.c_str());
continue;
}
auto pt = read_parttable("/dev/"+parent_dev);
auto pt = read_parttable(parent_dev);
if (!pt.is_object())
continue;
json11::Json::array newpt = pt["partitions"].array_items();
@ -498,7 +507,7 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
auto old_part = newpt[i];
newpt.erase(newpt.begin()+i, newpt.begin()+i+1);
vitastor_dev_info_t devinfo = {
.path = "/dev/"+parent_dev,
.path = parent_dev,
.pt = json11::Json::object{ { "partitions", newpt } },
};
add_partitions(devinfo, {});
@ -507,7 +516,7 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
errno != ENOENT)
{
std::string out;
shell_exec({ "partprobe", "/dev/"+parent_dev }, "", &out, NULL);
shell_exec({ "partprobe", parent_dev }, "", &out, NULL);
}
break;
}
@ -516,7 +525,5 @@ int disk_tool_t::purge_devices(const std::vector<std::string> & devices)
}
}
}
free(buf);
buf = NULL;
return 0;
}

View File

@ -101,7 +101,7 @@ int disk_tool_t::upgrade_simple_unit(std::string unit)
resizer.options = options;
for (auto & kv: resize)
resizer.options[kv.first] = std::to_string(kv.second);
if (resizer.resize_data() != 0)
if (resizer.raw_resize() != 0)
{
// FIXME: Resize with backup or journal
fprintf(

View File

@ -60,14 +60,14 @@ int disable_cache(std::string dev)
auto parent_dev = get_parent_device(dev);
if (parent_dev == "")
return 1;
auto scsi_disk = "/sys/block/"+parent_dev+"/device/scsi_disk";
auto scsi_disk = "/sys/block/"+parent_dev.substr(5)+"/device/scsi_disk";
DIR *dir = opendir(scsi_disk.c_str());
if (!dir)
{
if (errno == ENOENT)
{
// Not a SCSI/SATA device, just check /sys/block/.../queue/write_cache
return check_queue_cache(dev.substr(5), parent_dev);
return check_queue_cache(dev.substr(5), parent_dev.substr(5));
}
else
{
@ -84,7 +84,7 @@ int disable_cache(std::string dev)
{
// Not a SCSI/SATA device, just check /sys/block/.../queue/write_cache
closedir(dir);
return check_queue_cache(dev.substr(5), parent_dev);
return check_queue_cache(dev.substr(5), parent_dev.substr(5));
}
scsi_disk += "/";
scsi_disk += de->d_name;
@ -117,6 +117,38 @@ int disable_cache(std::string dev)
return 0;
}
// Return the size of <dev> in bytes.
//
// For block devices the size is queried with the BLKGETSIZE64 ioctl, for
// anything else st_size from stat() is returned.
//
// Return values:
//   size in bytes - on success;
//   0             - when <dev> does not exist and should_exist is false;
//   UINT64_MAX    - on any other error (a message is printed to stderr).
uint64_t get_device_size(const std::string & dev, bool should_exist)
{
    struct stat dev_st;
    if (stat(dev.c_str(), &dev_st) < 0)
    {
        if (errno == ENOENT && !should_exist)
        {
            return 0;
        }
        fprintf(stderr, "Error checking %s: %s\n", dev.c_str(), strerror(errno));
        return UINT64_MAX;
    }
    uint64_t dev_size = dev_st.st_size;
    if (S_ISBLK(dev_st.st_mode))
    {
        // This is a pure query, so open the device read-only: it then works
        // without write permission, and closing a descriptor that was opened
        // for writing may make udev re-probe the device (its "watch" rule).
        int fd = open(dev.c_str(), O_RDONLY);
        if (fd < 0)
        {
            fprintf(stderr, "Failed to open %s: %s\n", dev.c_str(), strerror(errno));
            return UINT64_MAX;
        }
        if (ioctl(fd, BLKGETSIZE64, &dev_size) < 0)
        {
            fprintf(stderr, "Failed to get %s size: %s\n", dev.c_str(), strerror(errno));
            close(fd);
            return UINT64_MAX;
        }
        close(fd);
    }
    return dev_size;
}
std::string get_parent_device(std::string dev)
{
if (dev.substr(0, 5) != "/dev/")
@ -125,16 +157,26 @@ std::string get_parent_device(std::string dev)
return "";
}
dev = dev.substr(5);
// check if it's a partition - partitions aren't present in /sys/block/
struct stat st;
auto chk = "/sys/block/"+dev;
if (stat(chk.c_str(), &st) == 0)
{
// present in /sys/block/ - not a partition
return "/dev/"+dev;
}
else if (errno != ENOENT)
{
fprintf(stderr, "Failed to stat %s: %s\n", chk.c_str(), strerror(errno));
return "";
}
int i = dev.size();
while (i > 0 && isdigit(dev[i-1]))
i--;
if (i >= 1 && dev[i-1] == '-') // dm-0, dm-1
return dev;
else if (i >= 2 && dev[i-1] == 'p' && isdigit(dev[i-2])) // nvme0n1p1
if (i >= 2 && dev[i-1] == 'p' && isdigit(dev[i-2])) // nvme0n1p1
i--;
// Check that such block device exists
struct stat st;
auto chk = "/sys/block/"+dev.substr(0, i);
chk = "/sys/block/"+dev.substr(0, i);
if (stat(chk.c_str(), &st) < 0)
{
if (errno != ENOENT)
@ -142,16 +184,9 @@ std::string get_parent_device(std::string dev)
fprintf(stderr, "Failed to stat %s: %s\n", chk.c_str(), strerror(errno));
return "";
}
return dev;
return "/dev/"+dev;
}
return dev.substr(0, i);
}
bool json_is_true(const json11::Json & val)
{
if (val.is_string())
return val == "true" || val == "yes" || val == "1";
return val.bool_value();
return "/dev/"+dev.substr(0, i);
}
int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std::string *out, std::string *err)
@ -308,23 +343,42 @@ uint64_t free_from_parttable(json11::Json pt)
return free;
}
int fix_partition_type(std::string dev_by_uuid)
int fix_partition_type_uuid(std::string & dev_by_uuid, const std::string & type_uuid)
{
auto uuid = strtolower(dev_by_uuid.substr(dev_by_uuid.rfind('/')+1));
std::string parent_dev = get_parent_device(realpath_str(dev_by_uuid, false));
bool is_partuuid = dev_by_uuid.substr(0, 22) == "/dev/disk/by-partuuid/";
auto uuid = is_partuuid ? strtolower(dev_by_uuid.substr(22)) : "";
auto node = realpath_str(dev_by_uuid, false);
std::string parent_dev = get_parent_device(node);
if (parent_dev == "")
return 1;
auto pt = read_parttable("/dev/"+parent_dev);
auto pt = read_parttable(parent_dev);
if (pt.is_null() || pt.is_bool())
return 1;
bool found = false;
std::string script = "label: gpt\n\n";
for (const auto & part: pt["partitions"].array_items())
{
bool this_part = (strtolower(part["uuid"].string_value()) == uuid);
if (this_part && strtolower(part["type"].string_value()) == "e7009fac-a5a1-4d72-af72-53de13059903")
bool this_part = (part["node"].string_value() == node) &&
(!is_partuuid || strtolower(part["uuid"].string_value()) == uuid);
if (this_part)
{
// Already correct type
return 0;
found = true;
if (!is_partuuid)
{
if (part["uuid"] == "")
{
fprintf(stderr, "Could not determine partition UUID for %s. Please use GPT partitions\n", dev_by_uuid.c_str());
return 1;
}
auto new_dev = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
fprintf(stderr, "Using %s instead of %s\n", new_dev.c_str(), dev_by_uuid.c_str());
dev_by_uuid = new_dev;
}
if (strtolower(part["type"].string_value()) == type_uuid)
{
// Already correct type
return 0;
}
}
script += part["node"].string_value()+": ";
bool first = true;
@ -334,15 +388,20 @@ int fix_partition_type(std::string dev_by_uuid)
{
script += (first ? "" : ", ")+kv.first+"="+
(kv.first == "type" && this_part
? "e7009fac-a5a1-4d72-af72-53de13059903"
? type_uuid
: (kv.second.is_string() ? kv.second.string_value() : kv.second.dump()));
first = false;
}
}
script += "\n";
}
if (!found)
{
fprintf(stderr, "Could not find partition table entry for %s\n", dev_by_uuid.c_str());
return 1;
}
std::string out;
return shell_exec({ "sfdisk", "--no-reread", "--force", "/dev/"+parent_dev }, script, &out, NULL);
return shell_exec({ "sfdisk", "--no-reread", "--no-tell-kernel", "--force", parent_dev }, script, &out, NULL);
}
std::string csum_type_str(uint32_t data_csum_type)

View File

@ -19,6 +19,7 @@
#include "addr_util.h"
#include "str_util.h"
#include "json_util.h"
#include "nfs_proxy.h"
#include "nfs_kv.h"
#include "nfs_block.h"

View File

@ -14,19 +14,7 @@
#include "osd.h"
#include "http_client.h"
#include "str_util.h"
static blockstore_config_t json_to_bs(const json11::Json::object & config)
{
blockstore_config_t bs;
for (auto kv: config)
{
if (kv.second.is_string())
bs[kv.first] = kv.second.string_value();
else if (!kv.second.is_null())
bs[kv.first] = kv.second.dump();
}
return bs;
}
#include "json_util.h"
osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
{
@ -46,7 +34,7 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
if (!json_is_true(this->config["disable_blockstore"]))
{
auto bs_cfg = json_to_bs(this->config);
auto bs_cfg = json_to_string_map(this->config);
this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
// Wait for blockstore initialisation before actually starting OSD logic
// to prevent peering timeouts during restart with filled databases
@ -151,7 +139,7 @@ void osd_t::parse_config(bool init)
}
if (bs)
{
auto bs_cfg = json_to_bs(config);
auto bs_cfg = json_to_string_map(config);
bs->parse_config(bs_cfg);
}
st_cli.parse_config(config);

View File

@ -150,7 +150,7 @@ class osd_t
bool pg_config_applied = false;
bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false;
int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1;
int print_stats_timer_id = -1, slow_log_timer_id = -1;
// peers and PGs
@ -168,6 +168,8 @@ class osd_t
object_id recovery_last_oid;
int recovery_pg_done = 0, recovery_done = 0;
osd_op_t *autosync_op = NULL;
int autosync_copies_to_delete = 0;
int autosync_timer_id = -1;
// Scrubbing
uint64_t scrub_nearest_ts = 0;

View File

@ -13,10 +13,11 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
bool first = true;
while (it != pg.flush_actions.end())
{
if (!first && (it->first.oid.inode != prev_it->first.oid.inode ||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH)
if (!first &&
(it->first.oid.inode != prev_it->first.oid.inode ||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
(fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH))
{
// Stop only at the object boundary
break;
@ -75,6 +76,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
// Throw the result away
return;
}
fb->flush_done++;
if (retval != 0)
{
if (peer_osd == this->osd_num)
@ -92,12 +94,11 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
auto fd_it = msgr.osd_peer_fds.find(peer_osd);
if (fd_it != msgr.osd_peer_fds.end())
{
// Will repeer/stop this PG
msgr.stop_client(fd_it->second);
}
return;
}
}
fb->flush_done++;
if (fb->flush_done == fb->flush_ops)
{
// This flush batch is done

View File

@ -645,6 +645,18 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
{
throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
}
if (changed && immediate_commit != IMMEDIATE_ALL)
{
// Trigger double automatic sync after changing PG state when we're running with fsyncs.
// First autosync commits all written objects and applies copies_to_delete_after_sync;
// Second autosync commits all deletions run by the first sync.
// Without it, rebalancing in a cluster without load may result in some small amount of
// garbage left on "extra" OSDs of the PG, because last deletions are not synced at all.
// FIXME: 1000% correct way is to switch PG state only after copies_to_delete_after_sync.
// But it's much more complicated.
unstable_write_count += autosync_writes;
autosync_copies_to_delete = 2;
}
if (changed && report)
{
report_pg_state(pg);

View File

@ -9,6 +9,10 @@ void osd_t::autosync()
{
if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
{
if (autosync_copies_to_delete > 0)
{
autosync_copies_to_delete--;
}
autosync_op = new osd_op_t();
autosync_op->op_type = OSD_OP_IN;
autosync_op->peer_fd = SELF_FD;
@ -29,6 +33,11 @@ void osd_t::autosync()
}
delete autosync_op;
autosync_op = NULL;
if (autosync_copies_to_delete > 0)
{
// Trigger the second "copies_to_delete" autosync
autosync();
}
};
exec_op(autosync_op);
}

View File

@ -213,6 +213,15 @@ resume_8:
{
goto resume_6;
}
if (immediate_commit == IMMEDIATE_NONE)
{
// Mark OSDs as dirty because deletions have to be synced too!
for (int i = 0; i < op_data->copies_to_delete_count; i++)
{
auto & chunk = op_data->copies_to_delete[i];
this->dirty_osds.insert(chunk.osd_num);
}
}
}
for (int i = 0; i < op_data->dirty_pg_count; i++)
{
@ -227,7 +236,7 @@ resume_8:
start_pg_peering(pg);
}
}
// FIXME: Free those in the destructor?
// FIXME: Free those in the destructor (not here)?
free(op_data->dirty_pgs);
op_data->dirty_pgs = NULL;
op_data->dirty_osds = NULL;

View File

@ -7,6 +7,12 @@
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
{
osd_primary_op_data_t *op_data = cur_op->op_data;
// First check if PG is not active anymore
if (!(pg.state & PG_ACTIVE))
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
return false;
}
// Check if actions are pending for this object
auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
.oid = op_data->oid,

View File

@ -12,6 +12,11 @@ target_link_libraries(stub_bench tcmalloc_minimal)
add_executable(osd_test osd_test.cpp ../util/rw_blocking.cpp ../util/addr_util.cpp)
target_link_libraries(osd_test tcmalloc_minimal)
# bindiff
add_executable(bindiff
bindiff.c
)
# stub_uring_osd
add_executable(stub_uring_osd
stub_uring_osd.cpp

177
src/test/bindiff.c Normal file
View File

@ -0,0 +1,177 @@
// Copyright (c) Vitaliy Filippov, 2004+
// License: VNPL-1.1 (see README.md for details)
#ifndef _LARGEFILE64_SOURCE
#define _LARGEFILE64_SOURCE
#endif
#include <string.h>
#include <sys/stat.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#define BUFSIZE 0x100000
// Returns the size in bytes that fstat() reports for the open descriptor <fd>.
// Returns 0 when fstat() fails (the error is printed to stderr) and also
// when the reported size is negative.
uint64_t filelength(int fd)
{
    struct stat info;
    if (fstat(fd, &info) >= 0)
    {
        return info.st_size > 0 ? (uint64_t)info.st_size : 0;
    }
    fprintf(stderr, "fstat failed: %s\n", strerror(errno));
    return 0;
}
// Reads up to <remaining> bytes from <fd> into <read_buf>, retrying short reads.
// Returns the number of bytes actually read; a value smaller than <remaining>
// means that end of file was reached.
// Interrupted/would-block reads (EINTR, EAGAIN, EPIPE) are retried, any other
// read error is reported with perror() and terminates the process with exit(1).
size_t read_blocking(int fd, void *read_buf, size_t remaining)
{
    size_t done = 0;
    while (done < remaining)
    {
        ssize_t r = read(fd, read_buf, remaining-done);
        if (r == 0)
        {
            // EOF. read() does not reset errno on success, so the return value
            // (and not errno) is the only reliable end-of-file indicator
            return done;
        }
        if (r < 0)
        {
            if (errno != EINTR && errno != EAGAIN && errno != EPIPE)
            {
                perror("read");
                exit(1);
            }
            continue;
        }
        done += (size_t)r;
        read_buf = (uint8_t*)read_buf + r;
    }
    return done;
}
// Writes exactly <remaining> bytes from <write_buf> to <fd>, continuing after
// partial writes. Failures with EINTR, EAGAIN or EPIPE are retried; any other
// write error is reported with perror() and terminates the process with exit(1).
// Returns the number of bytes written.
size_t write_blocking(int fd, void *write_buf, size_t remaining)
{
    uint8_t *pos = (uint8_t*)write_buf;
    size_t left = remaining;
    while (left > 0)
    {
        ssize_t n = write(fd, pos, left);
        if (n >= 0)
        {
            pos += n;
            left -= (size_t)n;
            continue;
        }
        if (errno == EINTR || errno == EAGAIN || errno == EPIPE)
        {
            continue;
        }
        perror("write");
        exit(1);
    }
    return remaining - left;
}
// bindiff [-n] <file1> <file2>
//
// Compares two files byte by byte and writes a hex patch file1->file2 to stdout:
//   "<offset>: <file1 bytes> <file2 bytes>" for every run of differing bytes
//       (the file1 bytes are omitted when -n is given);
//   "<offset>: <bytes>" with the extra tail when file2 is longer than file1;
//   "SIZE <length>" when file1 is longer than file2.
// A summary is printed to stderr.
// Returns -1 on usage, open or allocation errors and 0 otherwise - also when the
// files differ, so the exit code can't be used to detect differences.
int main(int narg, char *args[])
{
    int fd1 = -1, fd2 = -1;
    uint8_t *buf1 = NULL, *buf2 = NULL;
    // addr is the file offset of the start of the current chunk;
    // after the comparison loop it equals the number of compared (common) bytes
    uint64_t addr = 0, l1 = 0, l2 = 0, diffl = 0;
    size_t buf1_len = 0, buf2_len = 0, common = 0, i = 0, j = 0, dl = 0;
    int argoff = 1;
    int nosource = 0;
    int len_differs = 0;
    fprintf(stderr, "VMX HexDiff v2.1\nLicense: GPLv3.0+, (c) 2005+, Vitaliy Filippov\n");
    if (narg > argoff && strcmp(args[argoff], "-n") == 0)
    {
        nosource = 1;
        argoff++;
    }
    if (narg < argoff+2)
    {
        fprintf(stderr, "USAGE: bindiff [-n] <file1> <file2>\n"
            "This will create hex patch file1->file2 and write it to stdout.\n"
            "[-n] = do not write file1 data in patch, only file2.\n");
        return -1;
    }
    fd1 = open(args[argoff], O_RDONLY);
    if (fd1 < 0)
    {
        fprintf(stderr, "Couldn't open %s: %s\n", args[argoff], strerror(errno));
        return -1;
    }
    fd2 = open(args[argoff+1], O_RDONLY);
    if (fd2 < 0)
    {
        fprintf(stderr, "Couldn't open %s: %s\n", args[argoff+1], strerror(errno));
        close(fd1);
        return -1;
    }
    // Sizes reported by fstat() are only used for the summary
    l1 = filelength(fd1);
    l2 = filelength(fd2);
    buf1 = malloc(BUFSIZE);
    buf2 = malloc(BUFSIZE);
    if (!buf1 || !buf2)
    {
        fprintf(stderr, "Couldn't allocate read buffers\n");
        free(buf1);
        free(buf2);
        close(fd1);
        close(fd2);
        return -1;
    }
    do
    {
        // read_blocking() returns less than BUFSIZE only at end of file
        buf1_len = read_blocking(fd1, buf1, BUFSIZE);
        buf2_len = read_blocking(fd2, buf2, BUFSIZE);
        common = buf1_len < buf2_len ? buf1_len : buf2_len;
        // i == common is an extra step which only flushes a run of differing
        // bytes that lasts until the end of the common part of the chunk
        for (dl = 0, i = 0; i <= common; i++)
        {
            if (i < common && buf1[i] != buf2[i])
            {
                dl++;
            }
            else if (dl)
            {
                printf("%08llX: ", (unsigned long long)(addr+i-dl));
                if (!nosource)
                {
                    for (j = i-dl; j < i; j++)
                        printf("%02X", buf1[j]);
                    printf(" ");
                }
                for (j = i-dl; j < i; j++)
                    printf("%02X", buf2[j]);
                printf("\n");
                diffl += dl;
                dl = 0;
            }
        }
        addr += common;
    } while (buf1_len == BUFSIZE && buf2_len == BUFSIZE);
    len_differs = (buf1_len != buf2_len);
    if (buf2_len > common)
    {
        // file2 is longer: dump its tail, starting with the rest of the chunk
        // which is already in the buffer
        printf("%08llX: ", (unsigned long long)addr);
        for (j = common; j < buf2_len; j++)
            printf("%02X", buf2[j]);
        while ((buf2_len = read_blocking(fd2, buf2, BUFSIZE)) > 0)
        {
            for (j = 0; j < buf2_len; j++)
                printf("%02X", buf2[j]);
        }
        printf("\n");
    }
    else if (buf1_len > common)
    {
        // file1 is longer: the patch truncates it to the length of file2
        printf("SIZE %08llX\n", (unsigned long long)addr);
    }
    if (diffl != 0 || len_differs)
    {
        fprintf(stderr, "Difference in %llu of %llu common bytes\n",
            (unsigned long long)diffl, (unsigned long long)addr);
        if (len_differs)
        {
            fprintf(stderr, "Length difference!\nFile \"%s\": %llu\nFile \"%s\": %llu\n",
                args[argoff], (unsigned long long)l1, args[argoff+1], (unsigned long long)l2);
        }
    }
    else
    {
        fprintf(stderr, "Files are equal\n");
    }
    free(buf1);
    free(buf2);
    close(fd1);
    close(fd2);
    return 0;
}

View File

@ -55,10 +55,3 @@ json11::Json::object osd_messenger_t::merge_configs(const json11::Json::object &
{
return cli_config;
}
bool json_is_true(const json11::Json & val)
{
if (val.is_string())
return val == "true" || val == "yes" || val == "1";
return val.bool_value();
}

35
src/util/json_util.cpp Normal file
View File

@ -0,0 +1,35 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include "json_util.h"
// Flattens a JSON object into a string-to-string map.
// String values are copied as is, all other non-null values are stored
// in their serialized form (Json::dump()), null values are skipped.
std::map<std::string, std::string> json_to_string_map(const json11::Json::object & config)
{
    std::map<std::string, std::string> result;
    for (const auto & item: config)
    {
        const json11::Json & value = item.second;
        if (value.is_null())
        {
            continue;
        }
        result[item.first] = value.is_string() ? value.string_value() : value.dump();
    }
    return result;
}
// Interprets a JSON value as a boolean flag.
// A string is true only when it equals "true", "yes" or "1";
// for every other value type the result is val.bool_value().
bool json_is_true(const json11::Json & val)
{
    if (!val.is_string())
    {
        return val.bool_value();
    }
    const std::string & text = val.string_value();
    return text == "true" || text == "yes" || text == "1";
}
// Tells whether a JSON value explicitly means "false":
// boolean false, a number equal to 0, or one of the strings "false", "no", "0".
// Everything else (including null) is not considered false.
bool json_is_false(const json11::Json & val)
{
    if (val.is_bool())
    {
        return !val.bool_value();
    }
    if (val.is_number())
    {
        return val.number_value() == 0;
    }
    if (!val.is_string())
    {
        return false;
    }
    const std::string & text = val.string_value();
    return text == "false" || text == "no" || text == "0";
}

13
src/util/json_util.h Normal file
View File

@ -0,0 +1,13 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#pragma once
#include <map>
#include <string>
#include "json11/json11.hpp"
// Convert a JSON object into a string map: string values are copied as is,
// other non-null values are stored serialized (Json::dump()), null values are skipped
std::map<std::string, std::string> json_to_string_map(const json11::Json::object & config);
// For strings: true only for "true", "yes" and "1"; for other types: val.bool_value()
bool json_is_true(const json11::Json & val);
// true for the strings "false", "no" and "0", for a number equal to 0 and for boolean false;
// false for everything else
bool json_is_false(const json11::Json & val);

View File

@ -10,7 +10,7 @@
#include "rw_blocking.h"
int read_blocking(int fd, void *read_buf, size_t remaining)
size_t read_blocking(int fd, void *read_buf, size_t remaining)
{
size_t done = 0;
while (done < remaining)
@ -30,13 +30,13 @@ int read_blocking(int fd, void *read_buf, size_t remaining)
}
continue;
}
done += r;
done += (size_t)r;
read_buf = (uint8_t*)read_buf + r;
}
return done;
}
int write_blocking(int fd, void *write_buf, size_t remaining)
size_t write_blocking(int fd, void *write_buf, size_t remaining)
{
size_t done = 0;
while (done < remaining)
@ -51,7 +51,7 @@ int write_blocking(int fd, void *write_buf, size_t remaining)
}
continue;
}
done += r;
done += (size_t)r;
write_buf = (uint8_t*)write_buf + r;
}
return done;

View File

@ -6,8 +6,8 @@
#include <unistd.h>
#include <sys/uio.h>
int read_blocking(int fd, void *read_buf, size_t remaining);
int write_blocking(int fd, void *write_buf, size_t remaining);
size_t read_blocking(int fd, void *read_buf, size_t remaining);
size_t write_blocking(int fd, void *write_buf, size_t remaining);
int readv_blocking(int fd, iovec *iov, int iovcnt);
int writev_blocking(int fd, iovec *iov, int iovcnt);
int sendv_blocking(int fd, iovec *iov, int iovcnt, int flags);

View File

@ -62,7 +62,7 @@ int timerfd_manager_t::set_timer_us(uint64_t micros, bool repeat, std::function<
.callback = callback,
});
inc_timer(timers[timers.size()-1]);
set_nearest();
set_nearest(false);
return timer_id;
}
@ -82,13 +82,13 @@ void timerfd_manager_t::clear_timer(int timer_id)
{
nearest--;
}
set_nearest();
set_nearest(false);
break;
}
}
}
void timerfd_manager_t::set_nearest()
void timerfd_manager_t::set_nearest(bool trigger_inline)
{
if (onstack > 0)
{
@ -134,10 +134,13 @@ again:
}
if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0)
{
// It already happened
// FIXME: Postpone to setImmediate/BH to avoid reenterability problems
trigger_nearest();
goto again;
// It already happened - set minimal timeout
if (trigger_inline)
{
trigger_nearest();
goto again;
}
exp.it_value = { .tv_sec = 0, .tv_nsec = 1 };
}
if (timerfd_settime(timerfd, 0, &exp, NULL))
{
@ -157,7 +160,7 @@ void timerfd_manager_t::handle_readable()
trigger_nearest();
}
wait_state = 0;
set_nearest();
set_nearest(true);
}
void timerfd_manager_t::trigger_nearest()

View File

@ -26,7 +26,7 @@ class timerfd_manager_t
std::vector<timerfd_timer_t> timers;
void inc_timer(timerfd_timer_t & t);
void set_nearest();
void set_nearest(bool trigger_inline);
void trigger_nearest();
void handle_readable();
public:

View File

@ -83,16 +83,19 @@ fi
POOLCFG='"name":"testpool","failure_domain":"osd",'$POOLCFG
$ETCDCTL put /vitastor/config/pools '{"1":{'$POOLCFG',"pg_size":'$PG_SIZE',"pg_minsize":'$PG_MINSIZE',"pg_count":'$PG_COUNT'}}'
wait_up()
wait_pool_up()
{
local sec=$1
local pool=$2
local pgsize=$3
local pgcount=$4
local i=0
local configured=0
while [[ $i -lt $sec ]]; do
if $ETCDCTL get /vitastor/pg/config --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["1"][] |
select(((.osd_set | select(. != 0) | sort | unique) | length) == '$PG_SIZE') ] | length) == '$PG_COUNT; then
if $ETCDCTL get /vitastor/pg/config --print-value-only | jq -s -e '(. | length) != 0 and ([ .[0].items["'$pool'"][] |
select(((.osd_set | select(. != 0) | sort | unique) | length) == '$pgsize') ] | length) == '$pgcount; then
configured=1
if $ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$PG_COUNT; then
if $ETCDCTL get /vitastor/pg/state/$pool/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active"]) ] | length == '$pgcount; then
break
fi
fi
@ -107,6 +110,11 @@ wait_up()
done
}
wait_up()
{
wait_pool_up "$1" 1 $PG_SIZE $PG_COUNT
}
if [[ $OSD_COUNT -gt 0 ]]; then
wait_up 120
fi

View File

@ -68,6 +68,11 @@ TEST_NAME=csum_4k_dmj OSD_ARGS="--data_csum_type crc32c --inmemory_metadata fal
TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
./test_resize.sh
./test_resize_auto.sh
./test_snapshot_pool2.sh
./test_osd_tags.sh
./test_enospc.sh

View File

@ -60,6 +60,7 @@ qemu-img convert -S 4096 -p \
-O raw ./testdata/bin/read.bin
if ! diff -q ./testdata/bin/read.bin ./testdata/bin/mirror.bin; then
build/src/test/bindiff ./testdata/bin/read.bin ./testdata/bin/mirror.bin
format_error Data lost during self-heal
fi

View File

@ -3,6 +3,7 @@
PG_COUNT=${PG_COUNT:-32}
. `dirname $0`/run_3osds.sh
check_qemu
LD_PRELOAD="build/src/client/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bs=4M -direct=1 -iodepth=4 \
@ -26,22 +27,22 @@ for i in $(seq 1 $OSD_COUNT); do
offsets=$(build/src/disk_tool/vitastor-disk simple-offsets --format json ./testdata/bin/test_osd$i.bin)
meta_offset=$(echo $offsets | jq -r .meta_offset)
data_offset=$(echo $offsets | jq -r .data_offset)
build/src/disk_tool/vitastor-disk dump-journal --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json
build/src/disk_tool/vitastor-disk dump-meta ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json
build/src/disk_tool/vitastor-disk resize \
build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 0 $meta_offset >./testdata/journal_before_resize.json
build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 $meta_offset $((data_offset-meta_offset)) >./testdata/meta_before_resize.json
build/src/disk_tool/vitastor-disk raw-resize --io cached \
$(build/src/disk_tool/vitastor-disk simple-offsets --format options ./testdata/bin/test_osd$i.bin 2>/dev/null) \
--new_meta_offset 0 \
--new_meta_len $((1024*1024)) \
--new_journal_offset $((1024*1024)) \
--new_data_offset $((128*1024*1024))
build/src/disk_tool/vitastor-disk dump-journal --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json
build/src/disk_tool/vitastor-disk dump-meta ./testdata/bin/test_osd$i.bin 4096 0 $((1024*1024)) >./testdata/meta_after_resize.json
--new_data_offset $((128*1024*1024+32768))
build/src/disk_tool/vitastor-disk dump-journal --io cached --json ./testdata/bin/test_osd$i.bin 4096 $((1024*1024)) $((127*1024*1024)) >./testdata/journal_after_resize.json
build/src/disk_tool/vitastor-disk dump-meta --io cached ./testdata/bin/test_osd$i.bin 4096 0 $((1024*1024)) >./testdata/meta_after_resize.json
if ! (cat ./testdata/meta_before_resize.json ./testdata/meta_after_resize.json | \
jq -e -s 'map([ .entries[] | del(.block) ] | sort_by(.pool, .inode, .stripe)) | .[0] == .[1] and (.[0] | length) > 1000'); then
format_error "OSD $i metadata corrupted after resizing"
fi
if ! (cat ./testdata/journal_before_resize.json ./testdata/journal_after_resize.json | \
jq -e -s 'map([ .[].entries[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then
jq -e -s 'map([ .[] | del(.crc32, .crc32_prev, .valid, .loc, .start) ]) | .[0] == .[1] and (.[0] | length) > 1'); then
format_error "OSD $i journal corrupted after resizing"
fi
done
@ -53,7 +54,7 @@ for i in $(seq 1 $OSD_COUNT); do
--data_device ./testdata/bin/test_osd$i.bin \
--meta_offset 0 \
--journal_offset $((1024*1024)) \
--data_offset $((128*1024*1024)) >>./testdata/osd$i.log 2>&1 &
--data_offset $((128*1024*1024+32768)) >>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$!
done

94
tests/test_resize_auto.sh Executable file
View File

@ -0,0 +1,94 @@
#!/bin/bash -ex
# Test of "vitastor-disk resize" on loop devices.
# Prepares a hybrid OSD (data on one loop device, journal and metadata on another one),
# writes a fake journal and fake metadata, then runs several resize operations and
# compares the dumped journal/metadata with the expected JSON after each step.
# Requires sudo, losetup, node and jq.
ANTIETCD=1
. `dirname $0`/common.sh
# The resize/prepare commands below are run through the vitastor-disk-test symlink
# NOTE(review): presumably the "-test" name enables a test mode in the tool - confirm
[[ -e build/src/disk_tool/vitastor-disk-test ]] || ln -s vitastor-disk build/src/disk_tool/vitastor-disk-test
# Create a 100 GiB data image by writing a single byte at its last offset
dd if=/dev/zero of=./testdata/bin/test_osd1.bin bs=1 count=1 seek=$((100*1024*1024*1024-1))
LOOP1=$(sudo losetup --show -f ./testdata/bin/test_osd1.bin)
# $(jobs -p) and $LOOP1 are inside double quotes, so they are expanded now,
# when the trap is installed, and not when the script exits
trap "kill -9 $(jobs -p) || true; sudo losetup -d $LOOP1"' || true' EXIT
# Same for the 1 GiB image which holds the journal and the metadata
dd if=/dev/zero of=./testdata/bin/test_meta.bin bs=1 count=1 seek=$((1024*1024*1024-1))
LOOP2=$(sudo losetup --show -f ./testdata/bin/test_meta.bin)
# Replace the EXIT trap so that both loop devices are detached
trap "kill -9 $(jobs -p) || true; sudo losetup -d $LOOP1 $LOOP2"' || true' EXIT
# also test prepare --hybrid :)
# non-vitastor random type UUID to prevent udev activation
# Mount devtmpfs on /dev unless it's already mounted
# NOTE(review): presumably needed for loop partition nodes to appear in containers - confirm
mount | grep '/dev type devtmpfs' || sudo mount udev /dev/ -t devtmpfs
sudo build/src/disk_tool/vitastor-disk-test prepare --no_init 1 --meta_reserve 1x,1M \
--block_size 131072 --osd_num 987654 --part_type_uuid 0df42ae0-3695-4395-a957-7d5ff3645c56 \
--hybrid --fast-devices $LOOP2 $LOOP1
# write almost empty journal
node <<EOF > ./testdata/journal.json
console.log(JSON.stringify([
{"type":"start","start":"0x1000"},
{"type":"big_write_instant","inode":"0x1000000000001","stripe":"0xc60000","ver":"10","offset":0,"len":131072,"loc":"0x18ffdc0000","bitmap":"ffffffff"}
]));
EOF
sudo build/src/disk_tool/vitastor-disk write-journal ${LOOP1}p1 < ./testdata/journal.json
# The dump must match the written journal, except for the checksums (removed)
# and the "valid" flag (added to the expected data)
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
diff ./testdata/j1.json ./testdata/j2.json
# write fake metadata items in the end
DATA_DEV_SIZE=$(sudo blockdev --getsize64 ${LOOP1}p1)
BLOCK_COUNT=$(((DATA_DEV_SIZE-4096)/128/1024))
# The heredoc delimiter is unquoted, so $BLOCK_COUNT is substituted by the shell
# before the script is passed to node
node <<EOF > ./testdata/meta.json
console.log(JSON.stringify({
version: "0.9",
meta_block_size: 4096,
data_block_size: 131072,
bitmap_granularity: 4096,
data_csum_type: "none",
csum_block_size: 0,
entries: [ ...new Array(100).keys() ].map(i => ({
block: ($BLOCK_COUNT-100)+i,
pool: 1,
inode: "0x1",
stripe: "0x"+Number(i*0x20000).toString(16),
version: 10,
bitmap: "ffffffff",
ext_bitmap: "ffffffff",
})),
}));
EOF
# also test write & dump
sudo build/src/disk_tool/vitastor-disk write-meta ${LOOP1}p1 < ./testdata/meta.json
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 > ./testdata/compare.json
jq -S < ./testdata/meta.json > ./testdata/1.json
jq -S < ./testdata/compare.json > ./testdata/2.json
diff ./testdata/1.json ./testdata/2.json
# move journal & meta back, data will become smaller; end indexes should be shifted by -1251
sudo build/src/disk_tool/vitastor-disk-test resize --move-journal '' --move-meta '' ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
jq -S '. + {"entries": [ .entries[] | (. + { "block": (.block-1251) }) ]}' < ./testdata/meta.json > ./testdata/1.json
diff ./testdata/1.json ./testdata/2.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
jq -S '[ (.[] + {"valid":true}) | (if .type == "big_write_instant" then . + {"loc":"0x18f6160000"} else . end) ]' < ./testdata/journal.json > ./testdata/j1.json
diff ./testdata/j1.json ./testdata/j2.json
# move journal & meta out, data will become larger; end indexes should be shifted back by +1251
sudo build/src/disk_tool/vitastor-disk-test resize --move-journal ${LOOP2}p1 --move-meta ${LOOP2}p2 ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
jq -S < ./testdata/meta.json > ./testdata/1.json
diff ./testdata/1.json ./testdata/2.json
# NOTE(review): j1.json and j2.json are regenerated here but never compared with diff -
# confirm whether a journal comparison is intended at this step
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
# reduce data device size by exactly 128k * 99 (occupied blocks); exactly 1 should be left in place :)
sudo build/src/disk_tool/vitastor-disk-test resize --data-size $((DATA_DEV_SIZE-128*1024*99)) ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
jq -S '. + {"entries": ([ .entries[] | (. + { "block": (.block | if . > '$BLOCK_COUNT'-100 then .-('$BLOCK_COUNT'-100+1) else '$BLOCK_COUNT'-100 end) }) ] | .[1:] + [ .[0] ])}' < ./testdata/meta.json > ./testdata/1.json
diff ./testdata/1.json ./testdata/2.json
# NOTE(review): as above, j1.json and j2.json are produced but not compared - confirm
jq -S '[ .[] + {"valid":true} ]' < ./testdata/journal.json > ./testdata/j1.json
sudo build/src/disk_tool/vitastor-disk dump-journal --json --format data ${LOOP1}p1 | jq -S '[ .[] | del(.crc32, .crc32_prev) ]' > ./testdata/j2.json
# extend data device size to maximum
sudo build/src/disk_tool/vitastor-disk-test resize --data-size max ${LOOP1}p1
sudo build/src/disk_tool/vitastor-disk dump-meta ${LOOP1}p1 | jq -S > ./testdata/2.json
# The expected metadata (1.json) is the one generated for the previous step
diff ./testdata/1.json ./testdata/2.json
format_green OK

38
tests/test_snapshot_pool2.sh Executable file
View File

@ -0,0 +1,38 @@
#!/bin/bash -ex
# Test of an image whose snapshot is created in another pool:
# writes data through fio (mirroring it into a local file), creates a snapshot in the second pool,
# overwrites part of the image and checks that reads through vitastor-cli dd and qemu-img
# return exactly the mirrored data.
. `dirname $0`/run_3osds.sh
check_qemu
# snapshot in another pool
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL create-pool testpool2 -s 3 -n 4 --failure_domain osd
# Arguments: timeout 30, pool ID 2, PG size 3, PG count 4
# NOTE(review): assumes that the newly created testpool2 gets ID 2 - confirm
wait_pool_up 30 2 3 4
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL create -s 128M testchain -p testpool
# Sequential write with fsync after each write; fio also writes the same data into mirror.bin
LD_PRELOAD="build/src/client/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 -fsync=1 -rw=write \
-etcd=$ETCD_URL -image=testchain -mirror_file=./testdata/bin/mirror.bin -buffer_pattern=0xabcd
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL snap-create testchain@snap1 -p testpool2
# 32 random 4k writes on top of the snapshot, mirrored into the same file
LD_PRELOAD="build/src/client/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bs=4k -direct=1 -iodepth=4 -end_fsync=1 -rw=randwrite -number_ios=32 \
-etcd=$ETCD_URL -image=testchain -mirror_file=./testdata/bin/mirror.bin -buffer_pattern=0xabcd
# Read the image back in three different ways; each result must be identical to the mirror file
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL dd iimg=testchain of=./testdata/bin/res.bin bs=128k iodepth=4
cmp ./testdata/bin/res.bin ./testdata/bin/mirror.bin
build/src/cmd/vitastor-cli --etcd_address $ETCD_URL dd iimg=testchain of=./testdata/bin/res.bin bs=32k iodepth=4 conv=nosparse
cmp ./testdata/bin/res.bin ./testdata/bin/mirror.bin
qemu-img convert -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testchain" \
-O raw ./testdata/bin/res.bin
cmp ./testdata/bin/res.bin ./testdata/bin/mirror.bin
format_green OK