Compare commits

...

19 Commits

Author SHA1 Message Date
Vitaliy Filippov fc4d97da10 Print "Waiting to become master" just once
Test / test_rebalance_verify_ec (push) Successful in 1m38s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m40s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_write (push) Successful in 30s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write_xor (push) Successful in 36s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m26s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_32k (push) Successful in 2m22s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m20s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_resize (push) Successful in 14s Details
Test / test_osd_tags (push) Successful in 9s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_snapshot_pool2 (push) Successful in 16s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 15s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 15s Details
Test / test_nfs (push) Successful in 13s Details
Test / test_heal_csum_4k (push) Successful in 2m18s Details
2024-11-21 00:55:22 +03:00
Vitaliy Filippov c7a4ce7341 Take out_size from dd oimg if not specified
Test / test_rebalance_verify_ec (push) Successful in 1m41s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m43s Details
Test / test_write_no_same (push) Successful in 9s Details
Test / test_write (push) Successful in 32s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write_xor (push) Successful in 36s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m19s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m18s Details
Test / test_heal_csum_32k (push) Successful in 2m19s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m20s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_resize (push) Successful in 13s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 12s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 12s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m8s Details
2024-11-19 02:13:34 +03:00
Vitaliy Filippov ddea31d86d Auto-select first RDMA device only if RoCE is not found, add rocev2->rocev1->ib priority
Test / test_rebalance_verify_ec (push) Successful in 1m43s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m43s Details
Test / test_write_no_same (push) Successful in 10s Details
Test / test_write (push) Successful in 29s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m17s Details
Test / test_heal_ec (push) Successful in 2m15s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m19s Details
Test / test_heal_csum_32k (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_resize (push) Successful in 13s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_osd_tags (push) Successful in 9s Details
Test / test_snapshot_pool2 (push) Successful in 16s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub (push) Successful in 17s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 17s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m19s Details
2024-11-19 01:54:00 +03:00
Vitaliy Filippov 156d005412 Add serialize_overlap to test_heal
Test / test_rebalance_verify_ec (push) Successful in 1m44s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m44s Details
Test / test_write_no_same (push) Successful in 10s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m16s Details
Test / test_heal_ec (push) Successful in 2m18s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m18s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_resize (push) Successful in 13s Details
Test / test_osd_tags (push) Successful in 10s Details
Test / test_snapshot_pool2 (push) Successful in 16s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 15s Details
Test / test_scrub_zero_osd_2 (push) Successful in 15s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 18s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 14s Details
Test / test_heal_csum_4k (push) Successful in 2m22s Details
2024-11-17 01:26:35 +03:00
Vitaliy Filippov 7e076c7049 Do not report OSDs with empty statistics as full
Test / test_rebalance_verify_ec (push) Successful in 1m36s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m38s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 32s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m16s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m22s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m15s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m14s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_resize (push) Successful in 14s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_snapshot_pool2 (push) Successful in 16s Details
Test / test_enospc (push) Successful in 12s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 12s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub_xor (push) Successful in 16s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 16s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m19s Details
2024-11-16 23:36:16 +03:00
Vitaliy Filippov 7de38250ad Auto-select RDMA device based on osd_network
Test / test_rebalance_verify_ec (push) Successful in 1m49s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m50s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 31s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m12s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m19s Details
Test / test_heal_csum_32k (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_resize (push) Successful in 15s Details
Test / test_osd_tags (push) Successful in 9s Details
Test / test_snapshot_pool2 (push) Successful in 17s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m16s Details
2024-11-16 18:38:57 +03:00
Vitaliy Filippov 9c59d30e83 Report slow ops in OSD stats in etcd and show them in vitastor-cli status
Test / test_rebalance_verify_ec (push) Successful in 1m42s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m43s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_write (push) Successful in 30s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m21s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m18s Details
Test / test_heal_csum_32k (push) Successful in 2m19s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m20s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_resize (push) Successful in 14s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 12s Details
Test / test_scrub (push) Successful in 16s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m9s Details
2024-11-16 15:11:16 +03:00
Vitaliy Filippov 5db02cdf6e Add pve-qemu 9.0 patch 2024-11-16 11:20:47 +03:00
Vitaliy Filippov 8202ee9d74 Trigger double autosync when switching PG state to prevent leaving garbage in non-immediate_commit clusters
Test / test_dd (push) Successful in 12s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m39s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_write (push) Successful in 29s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write_xor (push) Successful in 33s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m22s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m20s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_resize (push) Successful in 12s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_imm_xor (push) Successful in 12s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 15s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m10s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s Details
2024-11-15 01:26:36 +03:00
Vitaliy Filippov 5864bd067c Add missing connection timeout for etcd websockets in OSD
Test / test_rebalance_verify_ec (push) Successful in 1m43s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m44s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m14s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m16s Details
Test / test_heal_csum_32k (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_resize (push) Successful in 14s Details
Test / test_osd_tags (push) Successful in 9s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_imm (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 14s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 13s Details
Test / test_heal_csum_4k (push) Successful in 2m11s Details
2024-11-12 02:28:07 +03:00
Vitaliy Filippov c312557ace Do not execute remaining operations if the client is stopped during read
Test / test_root_node (push) Successful in 10s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m46s Details
Test / test_write_no_same (push) Successful in 10s Details
Test / test_write (push) Successful in 31s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m17s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s Details
Test / test_heal_ec (push) Successful in 2m28s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m15s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_resize (push) Successful in 13s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m18s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub (push) Successful in 15s Details
Test / test_scrub_zero_osd_2 (push) Successful in 16s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 17s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m20s Details
2024-11-10 16:44:13 +03:00
Vitaliy Filippov 5ce20116d8 Postpone trigger_nearest to prevent timer callbacks called from setTimer/clearTimer
Test / test_root_node (push) Successful in 9s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m48s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_write (push) Successful in 31s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m19s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m16s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_resize (push) Successful in 12s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 14s Details
Test / test_enospc (push) Successful in 10s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub (push) Successful in 15s Details
Test / test_scrub_zero_osd_2 (push) Successful in 15s Details
Test / test_scrub_xor (push) Successful in 15s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 10s Details
Test / test_heal_csum_4k (push) Successful in 2m18s Details
2024-11-10 15:51:16 +03:00
Vitaliy Filippov be66791e59 Add another note about 1.8 upgrade 2024-11-09 00:57:58 +03:00
Vitaliy Filippov 141cec2383 Add missing refcounting for flush_batch errors 2024-11-09 00:46:38 +03:00
Vitaliy Filippov 1ce4b1b417 Fix stop condition in osd_flush
Test / test_rebalance_verify_ec (push) Successful in 2m1s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m59s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_write (push) Successful in 32s Details
Test / test_switch_primary (push) Successful in 34s Details
Test / test_write_xor (push) Successful in 34s Details
Test / test_heal_pg_size_2 (push) Successful in 2m17s Details
Test / test_heal_ec (push) Successful in 2m16s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m22s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m19s Details
Test / test_heal_csum_32k (push) Successful in 2m20s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_resize (push) Successful in 13s Details
Test / test_osd_tags (push) Successful in 7s Details
Test / test_enospc (push) Successful in 9s Details
Test / test_snapshot_pool2 (push) Successful in 13s Details
Test / test_enospc_xor (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 13s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 11s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 16s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s Details
Test / test_scrub_ec (push) Successful in 14s Details
Test / test_nfs (push) Successful in 12s Details
Test / test_heal_csum_4k (push) Successful in 2m9s Details
Could probably lead to PGs hung in peering states on OSD restart in EC pools,
fixable by primary OSD restart
2024-11-08 00:30:40 +03:00
Vitaliy Filippov ebf24bac9a Fix partition zeroing during prepare
Test / test_root_node (push) Successful in 10s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m46s Details
Test / test_write_no_same (push) Successful in 8s Details
Test / test_switch_primary (push) Successful in 33s Details
Test / test_write (push) Successful in 30s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m16s Details
Test / test_heal_ec (push) Successful in 2m15s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m18s Details
Test / test_heal_csum_32k (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s Details
Test / test_resize_auto (push) Successful in 8s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m17s Details
Test / test_resize (push) Successful in 13s Details
Test / test_osd_tags (push) Successful in 8s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_xor (push) Successful in 13s Details
Test / test_enospc_imm (push) Successful in 11s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 14s Details
Test / test_scrub_xor (push) Successful in 13s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s Details
Test / test_scrub_ec (push) Successful in 13s Details
Test / test_nfs (push) Successful in 10s Details
Test / test_heal_csum_4k (push) Successful in 2m12s Details
Previously it zeroed area beginning with 0 instead of actual metadata offset
which was leading to non-zeroed metadata when the disk is very small
2024-11-08 00:14:37 +03:00
Vitaliy Filippov edd9051f81 Fix arch.en toc 2024-11-08 00:14:18 +03:00
Vitaliy Filippov 662ca86dc0 Fix libvirt 8 patch 2024-11-07 12:21:32 +03:00
Vitaliy Filippov a1ca573168 Support QEMU 9.1
Test / test_rebalance_verify_ec (push) Successful in 1m42s Details
Test / test_rebalance_verify_ec_imm (push) Successful in 1m44s Details
Test / test_write_no_same (push) Successful in 7s Details
Test / test_switch_primary (push) Successful in 32s Details
Test / test_write (push) Successful in 33s Details
Test / test_write_xor (push) Successful in 35s Details
Test / test_heal_pg_size_2 (push) Successful in 2m15s Details
Test / test_heal_ec (push) Successful in 2m15s Details
Test / test_heal_antietcd (push) Successful in 2m16s Details
Test / test_heal_csum_32k_dmj (push) Successful in 2m13s Details
Test / test_heal_csum_32k_dj (push) Successful in 2m17s Details
Test / test_heal_csum_32k (push) Successful in 2m20s Details
Test / test_heal_csum_4k_dj (push) Successful in 2m16s Details
Test / test_heal_csum_4k_dmj (push) Successful in 2m21s Details
Test / test_resize (push) Successful in 12s Details
Test / test_resize_auto (push) Successful in 9s Details
Test / test_osd_tags (push) Successful in 9s Details
Test / test_snapshot_pool2 (push) Successful in 15s Details
Test / test_enospc (push) Successful in 11s Details
Test / test_enospc_imm (push) Successful in 9s Details
Test / test_enospc_xor (push) Successful in 12s Details
Test / test_enospc_imm_xor (push) Successful in 14s Details
Test / test_scrub_zero_osd_2 (push) Successful in 13s Details
Test / test_scrub (push) Successful in 16s Details
Test / test_scrub_xor (push) Successful in 14s Details
Test / test_scrub_pg_size_3 (push) Successful in 15s Details
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s Details
Test / test_scrub_ec (push) Successful in 15s Details
Test / test_nfs (push) Successful in 11s Details
Test / test_heal_csum_4k (push) Successful in 2m11s Details
2024-11-07 12:21:13 +03:00
35 changed files with 948 additions and 153 deletions

View File

@ -68,11 +68,15 @@ but they are not connected to the cluster.
- Type: string - Type: string
RDMA device name to use for Vitastor OSD communications (for example, RDMA device name to use for Vitastor OSD communications (for example,
"rocep5s0f0"). Now Vitastor supports all adapters, even ones without "rocep5s0f0"). If not specified, Vitastor will try to find an RoCE
ODP support, like Mellanox ConnectX-3 and non-Mellanox cards. device matching [osd_network](osd.en.md#osd_network), preferring RoCEv2,
or choose the first available RDMA device if no RoCE devices are
found or if `osd_network` is not specified.
Versions up to Vitastor 1.2.0 required ODP which is only present in Vitastor supports all adapters, even ones without ODP support, like
Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp). Mellanox ConnectX-3 and non-Mellanox cards. Versions up to Vitastor
1.2.0 required ODP which is only present in Mellanox ConnectX >= 4.
See also [rdma_odp](#rdma_odp).
Run `ibv_devinfo -v` as root to list available RDMA devices and their Run `ibv_devinfo -v` as root to list available RDMA devices and their
features. features.
@ -95,15 +99,16 @@ your device has.
## rdma_gid_index ## rdma_gid_index
- Type: integer - Type: integer
- Default: 0
Global address identifier index of the RDMA device to use. Different GID Global address identifier index of the RDMA device to use. Different GID
indexes may correspond to different protocols like RoCEv1, RoCEv2 and iWARP. indexes may correspond to different protocols like RoCEv1, RoCEv2 and iWARP.
Search for "GID" in `ibv_devinfo -v` output to determine which GID index Search for "GID" in `ibv_devinfo -v` output to determine which GID index
you need. you need.
**IMPORTANT:** If you want to use RoCEv2 (as recommended) then the correct If not specified, Vitastor will try to auto-select a RoCEv2 IPv4 GID, then
rdma_gid_index is usually 1 (IPv6) or 3 (IPv4). RoCEv2 IPv6 GID, then RoCEv1 IPv4 GID, then RoCEv1 IPv6 GID, then IB GID.
A correct rdma_gid_index for RoCEv2 is usually 1 (IPv6) or 3 (IPv4).
## rdma_mtu ## rdma_mtu

View File

@ -71,12 +71,16 @@ RDMA может быть нужно только если у клиентов е
- Тип: строка - Тип: строка
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0"). Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых Если не указано, Vitastor попробует найти RoCE-устройство, соответствующее
нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и [osd_network](osd.en.md#osd_network), предпочитая RoCEv2, или выбрать первое
картами производства не Mellanox. попавшееся RDMA-устройство, если RoCE-устройств нет или если сеть `osd_network`
не задана.
Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только Vitastor поддерживает все модели адаптеров, включая те, у которых
на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp). нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
картами производства не Mellanox. Версии Vitastor до 1.2.0 включительно
требовали ODP, который есть только на Mellanox ConnectX 4 и более новых.
См. также [rdma_odp](#rdma_odp).
Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
список доступных RDMA-устройств, их параметры и возможности. список доступных RDMA-устройств, их параметры и возможности.
@ -101,15 +105,17 @@ Control) и ECN (Explicit Congestion Notification).
## rdma_gid_index ## rdma_gid_index
- Тип: целое число - Тип: целое число
- Значение по умолчанию: 0
Номер глобального идентификатора адреса RDMA-устройства, который следует Номер глобального идентификатора адреса RDMA-устройства, который следует
использовать. Разным gid_index могут соответствовать разные протоколы связи: использовать. Разным gid_index могут соответствовать разные протоколы связи:
RoCEv1, RoCEv2, iWARP. Чтобы понять, какой нужен вам - смотрите строчки со RoCEv1, RoCEv2, iWARP. Чтобы понять, какой нужен вам - смотрите строчки со
словом "GID" в выводе команды `ibv_devinfo -v`. словом "GID" в выводе команды `ibv_devinfo -v`.
**ВАЖНО:** Если вы хотите использовать RoCEv2 (как мы и рекомендуем), то Если не указан, Vitastor попробует автоматически выбрать сначала GID,
правильный rdma_gid_index, как правило, 1 (IPv6) или 3 (IPv4). соответствующий RoCEv2 IPv4, потом RoCEv2 IPv6, потом RoCEv1 IPv4, потом
RoCEv1 IPv6, потом IB.
Правильный rdma_gid_index для RoCEv2, как правило, 1 (IPv6) или 3 (IPv4).
## rdma_mtu ## rdma_mtu

View File

@ -48,11 +48,15 @@
type: string type: string
info: | info: |
RDMA device name to use for Vitastor OSD communications (for example, RDMA device name to use for Vitastor OSD communications (for example,
"rocep5s0f0"). Now Vitastor supports all adapters, even ones without "rocep5s0f0"). If not specified, Vitastor will try to find an RoCE
ODP support, like Mellanox ConnectX-3 and non-Mellanox cards. device matching [osd_network](osd.en.md#osd_network), preferring RoCEv2,
or choose the first available RDMA device if no RoCE devices are
found or if `osd_network` is not specified.
Versions up to Vitastor 1.2.0 required ODP which is only present in Vitastor supports all adapters, even ones without ODP support, like
Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp). Mellanox ConnectX-3 and non-Mellanox cards. Versions up to Vitastor
1.2.0 required ODP which is only present in Mellanox ConnectX >= 4.
See also [rdma_odp](#rdma_odp).
Run `ibv_devinfo -v` as root to list available RDMA devices and their Run `ibv_devinfo -v` as root to list available RDMA devices and their
features. features.
@ -64,12 +68,16 @@
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification). PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
info_ru: | info_ru: |
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0"). Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых Если не указано, Vitastor попробует найти RoCE-устройство, соответствующее
нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и [osd_network](osd.en.md#osd_network), предпочитая RoCEv2, или выбрать первое
картами производства не Mellanox. попавшееся RDMA-устройство, если RoCE-устройств нет или если сеть `osd_network`
не задана.
Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только Vitastor поддерживает все модели адаптеров, включая те, у которых
на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp). нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
картами производства не Mellanox. Версии Vitastor до 1.2.0 включительно
требовали ODP, который есть только на Mellanox ConnectX 4 и более новых.
См. также [rdma_odp](#rdma_odp).
Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
список доступных RDMA-устройств, их параметры и возможности. список доступных RDMA-устройств, их параметры и возможности.
@ -94,23 +102,27 @@
`ibv_devinfo -v`. `ibv_devinfo -v`.
- name: rdma_gid_index - name: rdma_gid_index
type: int type: int
default: 0
info: | info: |
Global address identifier index of the RDMA device to use. Different GID Global address identifier index of the RDMA device to use. Different GID
indexes may correspond to different protocols like RoCEv1, RoCEv2 and iWARP. indexes may correspond to different protocols like RoCEv1, RoCEv2 and iWARP.
Search for "GID" in `ibv_devinfo -v` output to determine which GID index Search for "GID" in `ibv_devinfo -v` output to determine which GID index
you need. you need.
**IMPORTANT:** If you want to use RoCEv2 (as recommended) then the correct If not specified, Vitastor will try to auto-select a RoCEv2 IPv4 GID, then
rdma_gid_index is usually 1 (IPv6) or 3 (IPv4). RoCEv2 IPv6 GID, then RoCEv1 IPv4 GID, then RoCEv1 IPv6 GID, then IB GID.
A correct rdma_gid_index for RoCEv2 is usually 1 (IPv6) or 3 (IPv4).
info_ru: | info_ru: |
Номер глобального идентификатора адреса RDMA-устройства, который следует Номер глобального идентификатора адреса RDMA-устройства, который следует
использовать. Разным gid_index могут соответствовать разные протоколы связи: использовать. Разным gid_index могут соответствовать разные протоколы связи:
RoCEv1, RoCEv2, iWARP. Чтобы понять, какой нужен вам - смотрите строчки со RoCEv1, RoCEv2, iWARP. Чтобы понять, какой нужен вам - смотрите строчки со
словом "GID" в выводе команды `ibv_devinfo -v`. словом "GID" в выводе команды `ibv_devinfo -v`.
**ВАЖНО:** Если вы хотите использовать RoCEv2 (как мы и рекомендуем), то Если не указан, Vitastor попробует автоматически выбрать сначала GID,
правильный rdma_gid_index, как правило, 1 (IPv6) или 3 (IPv4). соответствующий RoCEv2 IPv4, потом RoCEv2 IPv6, потом RoCEv1 IPv4, потом
RoCEv1 IPv6, потом IB.
Правильный rdma_gid_index для RoCEv2, как правило, 1 (IPv6) или 3 (IPv4).
- name: rdma_mtu - name: rdma_mtu
type: int type: int
default: 4096 default: 4096

View File

@ -6,7 +6,12 @@
# Architecture # Architecture
- [Server-side components](#server-side-components)
- [Basic concepts](#basic-concepts) - [Basic concepts](#basic-concepts)
- [Client-side components](#client-side-components)
- [Additional utilities](#additional-utilities)
- [Overall read/write process](#overall-read-write-process)
- [Nuances of request handling](#nuances-of-request-handling)
- [Similarities to Ceph](#similarities-to-ceph) - [Similarities to Ceph](#similarities-to-ceph)
- [Differences from Ceph](#differences-from-ceph) - [Differences from Ceph](#differences-from-ceph)
- [Implementation Principles](#implementation-principles) - [Implementation Principles](#implementation-principles)
@ -39,10 +44,10 @@
## Client-side components ## Client-side components
- **Client library** incapsulates client I/O logic. Client library connects to etcd and to all OSDs, - **Client library** encapsulates client I/O logic. Client library connects to etcd and to all OSDs,
receives cluster state from etcd, sends read and write requests directly to all OSDs. Due receives cluster state from etcd, sends read and write requests directly to all OSDs. Due
to the symmetric distributed architecture, all data blocks (each 128 KB by default) are placed to the symmetric distributed architecture, all data blocks (each 128 KB by default) are placed
to different OSDs, but clients always knows where each data block is stored and connects directly to different OSDs, but clients always know where each data block is stored and connect directly
to the right OSD. to the right OSD.
All other client-side components are based on the client library: All other client-side components are based on the client library:
@ -77,8 +82,8 @@ All other client-side components are based on the client library:
## Additional utilities ## Additional utilities
- **vitastor-disk**утилита для разметки дисков под Vitastor OSD. С её помощью можно - **vitastor-disk**a Vitastor OSD disk management tool. You can create, remove,
создавать, удалять, менять размеры или перемещать разделы OSD. resize and move OSD partitions with it.
## Overall read/write process ## Overall read/write process

View File

@ -171,7 +171,14 @@ to make them use the new version of the client library.
### 1.7.x to 1.8.0 ### 1.7.x to 1.8.0
After upgrading version <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients It's recommended to upgrade from version <= 1.7.x to version >= 1.8.0 with full downtime,
i.e. you should first stop clients and then the cluster (OSDs and monitor), because 1.8.0
includes a fix for etcd event stream inconsistency which could lead to "incomplete" objects
appearing in EC pools, and in rare cases, probably, even to data corruption during mass OSD
restarts. It doesn't mean that you WILL hit this problem if you upgrade without full downtime,
but it's better to secure yourself against it.
Also, if you upgrade version from <= 1.7.x to version >= 1.8.0, BUT <= 1.9.0: restart all clients
(VMs and so on), otherwise they will hang when monitor clears old PG configuration key, (VMs and so on), otherwise they will hang when monitor clears old PG configuration key,
which happens 24 hours after upgrade. which happens 24 hours after upgrade.

View File

@ -168,7 +168,14 @@ done
### 1.7.x -> 1.8.0 ### 1.7.x -> 1.8.0
После обновления с версий <= 1.7.x до версий >= 1.8.0, НО <= 1.9.0: перезапустите всех Обновляться с версий <= 1.7.x до версий >= 1.8.0 рекомендуется с полной остановкой
сначала клиентов, а затем кластера, так как в 1.8.0 исправлена проблема (неконсистентность
потоков событий от etcd), способная приводить к появлению incomplete объектов в EC-пулах
и, хоть и редко, но даже к повреждению данных при массовых перезапусках OSD. Если вы
обновляетесь без полной остановки - это не значит, что вы обязательно столкнётесь с этой
проблемой, но лучше подстраховаться.
Также, если вы обновляетесь с версии <= 1.7.x до версии >= 1.8.0, НО <= 1.9.0: перезапустите всех
клиентов (процессы виртуальных машин можно перезапустить путём миграции на другой сервер), клиентов (процессы виртуальных машин можно перезапустить путём миграции на другой сервер),
иначе они зависнут, когда монитор удалит старый ключ конфигурации PG, что происходит через иначе они зависнут, когда монитор удалит старый ключ конфигурации PG, что происходит через
24 часа после обновления. 24 часа после обновления.

View File

@ -232,6 +232,7 @@ class EtcdAdapter
async become_master() async become_master()
{ {
const state = { ...this.mon.get_mon_state(), id: ''+this.mon.etcd_lease_id }; const state = { ...this.mon.get_mon_state(), id: ''+this.mon.etcd_lease_id };
console.log('Waiting to become master');
// eslint-disable-next-line no-constant-condition // eslint-disable-next-line no-constant-condition
while (1) while (1)
{ {
@ -243,7 +244,6 @@ class EtcdAdapter
{ {
break; break;
} }
console.log('Waiting to become master');
await new Promise(ok => setTimeout(ok, this.mon.config.etcd_start_timeout)); await new Promise(ok => setTimeout(ok, this.mon.config.etcd_start_timeout));
} }
console.log('Became master'); console.log('Became master');

View File

@ -306,12 +306,12 @@ index e5ff653a60..884ecc79ea 100644
+ etcd = virBufferContentAndReset(&buf); + etcd = virBufferContentAndReset(&buf);
+ } + }
+ +
+ if (virJSONValueObjectCreate(&ret, + if (virJSONValueObjectAdd(&ret,
+ "S:etcd-host", etcd, + "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query, + "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile, + "S:config-path", src->configFile,
+ "s:image", src->path, + "s:image", src->path,
+ NULL) < 0) + NULL) < 0)
+ return NULL; + return NULL;
+ +
+ return ret; + return ret;

View File

@ -0,0 +1,193 @@
Index: pve-qemu-kvm-9.0.0/block/meson.build
===================================================================
--- pve-qemu-kvm-9.0.0.orig/block/meson.build
+++ pve-qemu-kvm-9.0.0/block/meson.build
@@ -126,6 +126,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: pve-qemu-kvm-9.0.0/meson.build
===================================================================
--- pve-qemu-kvm-9.0.0.orig/meson.build
+++ pve-qemu-kvm-9.0.0/meson.build
@@ -1452,6 +1452,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2254,6 +2274,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4454,6 +4475,7 @@ summary_info += {'fdt support': fd
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
Index: pve-qemu-kvm-9.0.0/meson_options.txt
===================================================================
--- pve-qemu-kvm-9.0.0.orig/meson_options.txt
+++ pve-qemu-kvm-9.0.0/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value :
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
Index: pve-qemu-kvm-9.0.0/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-9.0.0.orig/qapi/block-core.json
+++ pve-qemu-kvm-9.0.0/qapi/block-core.json
@@ -3481,7 +3481,7 @@
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4591,6 +4591,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -5053,6 +5075,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5498,6 +5521,20 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5719,6 +5753,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: pve-qemu-kvm-9.0.0/scripts/ci/org.centos/stream/8/x86_64/configure
===================================================================
--- pve-qemu-kvm-9.0.0.orig/scripts/ci/org.centos/stream/8/x86_64/configure
+++ pve-qemu-kvm-9.0.0/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -30,7 +30,7 @@
--with-suffix="qemu-kvm" \
--firmwarepath=/usr/share/qemu-firmware \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -176,6 +176,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
Index: pve-qemu-kvm-9.0.0/scripts/meson-buildoptions.sh
===================================================================
--- pve-qemu-kvm-9.0.0.orig/scripts/meson-buildoptions.sh
+++ pve-qemu-kvm-9.0.0/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -445,6 +446,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@ -0,0 +1,172 @@
diff --git a/block/meson.build b/block/meson.build
index f1262ec2ba..3cf3e23f16 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -114,6 +114,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index fbda17c987..3edac22aff 100644
--- a/meson.build
+++ b/meson.build
@@ -1510,6 +1510,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2351,6 +2371,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4510,6 +4531,7 @@ summary_info += {'fdt support': fdt_opt == 'internal' ? 'internal' : fdt}
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 0269fa0f16..4740ffdc27 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index aa40d44f1d..bbee6a0e9c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3203,7 +3203,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4286,6 +4286,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4742,6 +4764,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5183,6 +5206,20 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5405,6 +5442,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index c97079a38c..4623f552ec 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' qpl Query Processing Library support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -444,6 +445,8 @@ _meson_option_parse() {
--disable-qpl) printf "%s" -Dqpl=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@ -176,7 +176,7 @@ void etcd_state_client_t::add_etcd_url(std::string addr)
exit(1); exit(1);
} }
if (!local_ips.size()) if (!local_ips.size())
local_ips = getifaddr_list(); local_ips = getifaddr_list(std::vector<std::string>(), true);
std::string check_addr; std::string check_addr;
int pos = addr.find('/'); int pos = addr.find('/');
int pos2 = addr.find(':'); int pos2 = addr.find(':');

View File

@ -62,6 +62,7 @@ struct http_co_t
inline void end() { ended = true; if (!onstack) { delete this; } } inline void end() { ended = true; if (!onstack) { delete this; } }
void run_cb_and_clear(); void run_cb_and_clear();
void start_connection(); void start_connection();
void start_ws_connection();
void close_connection(); void close_connection();
void next_request(); void next_request();
void handle_events(); void handle_events();
@ -112,7 +113,7 @@ http_co_t* open_websocket(timerfd_manager_t *tfd, const std::string & host, cons
handler->keepalive = false; handler->keepalive = false;
handler->request = request; handler->request = request;
handler->response_callback = response_callback; handler->response_callback = response_callback;
handler->start_connection(); handler->start_ws_connection();
return handler; return handler;
} }
@ -282,6 +283,27 @@ void http_co_t::close_connection()
epoll_events = 0; epoll_events = 0;
} }
void http_co_t::start_ws_connection()
{
stackin();
start_connection();
if (request_timeout > 0)
{
timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
{
stackin();
if (state != HTTP_CO_WEBSOCKET)
{
close_connection();
parsed = { .error = "Websocket connection timed out" };
run_cb_and_clear();
}
stackout();
});
}
stackout();
}
void http_co_t::start_connection() void http_co_t::start_connection()
{ {
stackin(); stackin();

View File

@ -121,7 +121,7 @@ void osd_messenger_t::init()
if (use_rdma) if (use_rdma)
{ {
rdma_context = msgr_rdma_context_t::create( rdma_context = msgr_rdma_context_t::create(
rdma_device != "" ? rdma_device.c_str() : NULL, osd_networks, rdma_device != "" ? rdma_device.c_str() : NULL,
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
); );
if (!rdma_context) if (!rdma_context)
@ -266,7 +266,8 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value(); this->rdma_port_num = (uint8_t)config["rdma_port_num"].uint64_value();
if (!this->rdma_port_num) if (!this->rdma_port_num)
this->rdma_port_num = 1; this->rdma_port_num = 1;
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value(); if (!config["rdma_gid_index"].is_null())
this->rdma_gid_index = (uint8_t)config["rdma_gid_index"].uint64_value();
this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value(); this->rdma_mtu = (uint32_t)config["rdma_mtu"].uint64_value();
this->rdma_max_sge = config["rdma_max_sge"].uint64_value(); this->rdma_max_sge = config["rdma_max_sge"].uint64_value();
if (!this->rdma_max_sge) if (!this->rdma_max_sge)
@ -281,6 +282,15 @@ void osd_messenger_t::parse_config(const json11::Json & config)
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024) if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024; this->rdma_max_msg = 129*1024;
this->rdma_odp = config["rdma_odp"].bool_value(); this->rdma_odp = config["rdma_odp"].bool_value();
std::vector<std::string> mask;
if (config["bind_address"].is_string())
mask.push_back(config["bind_address"].string_value());
else if (config["osd_network"].is_string())
mask.push_back(config["osd_network"].string_value());
else
for (auto v: config["osd_network"].array_items())
mask.push_back(v.string_value());
this->osd_networks = mask;
#endif #endif
if (!osd_num) if (!osd_num)
this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value(); this->iothread_count = (uint32_t)config["client_iothread_count"].uint64_value();

View File

@ -165,8 +165,9 @@ protected:
#ifdef WITH_RDMA #ifdef WITH_RDMA
bool use_rdma = true; bool use_rdma = true;
std::vector<std::string> osd_networks;
std::string rdma_device; std::string rdma_device;
uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0; uint64_t rdma_port_num = 1, rdma_gid_index = -1, rdma_mtu = 0;
msgr_rdma_context_t *rdma_context = NULL; msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0; uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0; uint64_t rdma_max_msg = 0;
@ -177,7 +178,7 @@ protected:
std::vector<int> read_ready_clients; std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients; std::vector<int> write_ready_clients;
// We don't use ringloop->set_immediate here because we may have no ringloop in client :) // We don't use ringloop->set_immediate here because we may have no ringloop in client :)
std::vector<std::function<void()>> set_immediate; std::vector<osd_op_t*> set_immediate_ops;
public: public:
timerfd_manager_t *tfd; timerfd_manager_t *tfd;
@ -237,6 +238,8 @@ protected:
void handle_op_hdr(osd_client_t *cl); void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl); bool handle_reply_hdr(osd_client_t *cl);
void handle_reply_ready(osd_op_t *op); void handle_reply_ready(osd_op_t *op);
void handle_immediate_ops();
void clear_immediate_ops(int peer_fd);
#ifdef WITH_RDMA #ifdef WITH_RDMA
void try_send_rdma(osd_client_t *cl); void try_send_rdma(osd_client_t *cl);

View File

@ -3,6 +3,7 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include "addr_util.h"
#include "msgr_rdma.h" #include "msgr_rdma.h"
#include "messenger.h" #include "messenger.h"
@ -69,7 +70,136 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
send_out_size = 0; send_out_size = 0;
} }
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level) static bool is_ipv4_gid(ibv_gid_entry *gidx)
{
return (((uint64_t*)gidx->gid.raw)[0] == 0 &&
((uint32_t*)gidx->gid.raw)[2] == 0xffff0000);
}
static bool match_gid(ibv_gid_entry *gidx, addr_mask_t *networks, int nnet)
{
if (gidx->gid_type != IBV_GID_TYPE_ROCE_V1 &&
gidx->gid_type != IBV_GID_TYPE_ROCE_V2 ||
((uint64_t*)gidx->gid.raw)[0] == 0 &&
((uint64_t*)gidx->gid.raw)[1] == 0)
{
return false;
}
if (is_ipv4_gid(gidx))
{
for (int i = 0; i < nnet; i++)
{
if (networks[i].family == AF_INET && cidr_match(*(in_addr*)(gidx->gid.raw+12), networks[i].ipv4, networks[i].bits))
return true;
}
}
else
{
for (int i = 0; i < nnet; i++)
{
if (networks[i].family == AF_INET6 && cidr6_match(*(in6_addr*)gidx->gid.raw, networks[i].ipv6, networks[i].bits))
return true;
}
}
return false;
}
struct matched_dev
{
int dev = -1;
int port = -1;
int gid = -1;
bool rocev2 = false;
};
static void log_rdma_dev_port_gid(ibv_device *dev, int ib_port, int gid_index, ibv_gid_entry & gidx)
{
bool is4 = ((uint64_t*)gidx.gid.raw)[0] == 0 && ((uint32_t*)gidx.gid.raw)[2] == 0xffff0000;
char buf[256];
inet_ntop(is4 ? AF_INET : AF_INET6, is4 ? gidx.gid.raw+12 : gidx.gid.raw, buf, sizeof(buf));
fprintf(
stderr, "Auto-selected RDMA device %s port %d GID %d - ROCEv%d IPv%d %s\n",
ibv_get_device_name(dev), ib_port, gid_index,
gidx.gid_type == IBV_GID_TYPE_ROCE_V2 ? 2 : 1, is4 ? 4 : 6, buf
);
}
static matched_dev match_device(ibv_device **dev_list, addr_mask_t *networks, int nnet, int log_level)
{
matched_dev best;
ibv_device_attr attr;
ibv_port_attr portinfo;
ibv_gid_entry best_gidx;
int res;
bool have_non_roce = false, have_roce = false;
for (int i = 0; dev_list[i]; ++i)
{
auto dev = dev_list[i];
ibv_context *context = ibv_open_device(dev_list[i]);
if ((res = ibv_query_device(context, &attr)) != 0)
{
fprintf(stderr, "Couldn't query RDMA device %s for its features: %s\n", ibv_get_device_name(dev_list[i]), strerror(res));
goto cleanup;
}
for (int j = 1; j <= attr.phys_port_cnt; j++)
{
// Try to find a port with matching address
if ((res = ibv_query_port(context, j, &portinfo)) != 0)
{
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(dev), j, strerror(res));
goto cleanup;
}
for (int k = 0; k < portinfo.gid_tbl_len; k++)
{
ibv_gid_entry gidx;
if ((res = ibv_query_gid_ex(context, j, k, &gidx, 0)) != 0)
{
if (res != ENODATA)
{
fprintf(stderr, "Couldn't read RDMA device %s GID index %d: %s\n", ibv_get_device_name(dev), k, strerror(res));
goto cleanup;
}
else
break;
}
if (gidx.gid_type != IBV_GID_TYPE_ROCE_V1 &&
gidx.gid_type != IBV_GID_TYPE_ROCE_V2)
have_non_roce = true;
else
have_roce = true;
if (match_gid(&gidx, networks, nnet))
{
// Prefer RoCEv2
if (!best.rocev2)
{
best.dev = i;
best.port = j;
best.gid = k;
best.rocev2 = (gidx.gid_type == IBV_GID_TYPE_ROCE_V2);
best_gidx = gidx;
}
}
}
}
cleanup:
ibv_close_device(context);
if (best.rocev2)
{
break;
}
}
if (best.dev >= 0 && log_level > 0)
{
log_rdma_dev_port_gid(dev_list[best.dev], best.port, best.gid, best_gidx);
}
if (best.dev < 0 && have_non_roce && !have_roce)
{
best.dev = -2;
}
return best;
}
msgr_rdma_context_t *msgr_rdma_context_t::create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
{ {
int res; int res;
ibv_device **dev_list = NULL; ibv_device **dev_list = NULL;
@ -80,28 +210,23 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
clock_gettime(CLOCK_REALTIME, &tv); clock_gettime(CLOCK_REALTIME, &tv);
srand48(tv.tv_sec*1000000000 + tv.tv_nsec); srand48(tv.tv_sec*1000000000 + tv.tv_nsec);
dev_list = ibv_get_device_list(NULL); dev_list = ibv_get_device_list(NULL);
if (!dev_list) if (!dev_list || !*dev_list)
{ {
if (errno == -ENOSYS || errno == ENOSYS) if (errno == -ENOSYS || errno == ENOSYS)
{ {
if (log_level > 0) if (log_level > 0)
fprintf(stderr, "No RDMA devices found (RDMA device list returned ENOSYS)\n"); fprintf(stderr, "No RDMA devices found (RDMA device list returned ENOSYS)\n");
} }
else if (!*dev_list)
{
if (log_level > 0)
fprintf(stderr, "No RDMA devices found\n");
}
else else
fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno)); fprintf(stderr, "Failed to get RDMA device list: %s\n", strerror(errno));
goto cleanup; goto cleanup;
} }
if (!ib_devname) if (ib_devname)
{
ctx->dev = *dev_list;
if (!ctx->dev)
{
if (log_level > 0)
fprintf(stderr, "No RDMA devices found\n");
goto cleanup;
}
}
else
{ {
int i; int i;
for (i = 0; dev_list[i]; ++i) for (i = 0; dev_list[i]; ++i)
@ -114,6 +239,37 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
goto cleanup; goto cleanup;
} }
} }
else if (osd_networks.size())
{
std::vector<addr_mask_t> nets;
for (auto & netstr: osd_networks)
{
nets.push_back(cidr_parse(netstr));
}
auto best = match_device(dev_list, nets.data(), nets.size(), log_level);
if (best.dev == -2)
{
best.dev = 0;
if (log_level > 0)
fprintf(stderr, "No RoCE devices found, using first available RDMA device %s\n", ibv_get_device_name(*dev_list));
}
else if (best.dev < 0)
{
if (log_level > 0)
fprintf(stderr, "RDMA device matching osd_network is not found, disabling RDMA\n");
goto cleanup;
}
else
{
ib_port = best.port;
gid_index = best.gid;
}
ctx->dev = dev_list[best.dev];
}
else
{
ctx->dev = *dev_list;
}
ctx->context = ibv_open_device(ctx->dev); ctx->context = ibv_open_device(ctx->dev);
if (!ctx->context) if (!ctx->context)
@ -123,7 +279,6 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
} }
ctx->ib_port = ib_port; ctx->ib_port = ib_port;
ctx->gid_index = gid_index;
if ((res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo)) != 0) if ((res = ibv_query_port(ctx->context, ib_port, &ctx->portinfo)) != 0)
{ {
fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res)); fprintf(stderr, "Couldn't get RDMA device %s port %d info: %s\n", ibv_get_device_name(ctx->dev), ib_port, strerror(res));
@ -135,10 +290,48 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev)); fprintf(stderr, "RDMA device %s must have local LID because it's not Ethernet, but LID is zero\n", ibv_get_device_name(ctx->dev));
goto cleanup; goto cleanup;
} }
if (ibv_query_gid(ctx->context, ib_port, gid_index, &ctx->my_gid))
if (gid_index != -1)
{ {
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index); ctx->gid_index = gid_index;
goto cleanup; if (ibv_query_gid_ex(ctx->context, ib_port, gid_index, &ctx->my_gid, 0))
{
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), gid_index);
goto cleanup;
}
}
else
{
// Auto-guess GID
for (int k = 0; k < ctx->portinfo.gid_tbl_len; k++)
{
ibv_gid_entry gidx;
if (ibv_query_gid_ex(ctx->context, ib_port, k, &gidx, 0) != 0)
{
fprintf(stderr, "Couldn't read RDMA device %s GID index %d\n", ibv_get_device_name(ctx->dev), k);
goto cleanup;
}
// Skip empty GID
if (((uint64_t*)gidx.gid.raw)[0] == 0 &&
((uint64_t*)gidx.gid.raw)[1] == 0)
{
continue;
}
// Prefer IPv4 RoCEv2 -> IPv6 RoCEv2 -> IPv4 RoCEv1 -> IPv6 RoCEv1 -> IB
if (gid_index == -1 ||
gidx.gid_type == IBV_GID_TYPE_ROCE_V2 && ctx->my_gid.gid_type != IBV_GID_TYPE_ROCE_V2 ||
gidx.gid_type == IBV_GID_TYPE_ROCE_V1 && ctx->my_gid.gid_type == IBV_GID_TYPE_IB ||
gidx.gid_type == ctx->my_gid.gid_type && is_ipv4_gid(&gidx))
{
gid_index = k;
ctx->my_gid = gidx;
}
}
ctx->gid_index = gid_index = (gid_index == -1 ? 0 : gid_index);
if (log_level > 0)
{
log_rdma_dev_port_gid(ctx->dev, ctx->ib_port, ctx->gid_index, ctx->my_gid);
}
} }
ctx->pd = ibv_alloc_pd(ctx->context); ctx->pd = ibv_alloc_pd(ctx->context);
@ -255,7 +448,7 @@ msgr_rdma_connection_t *msgr_rdma_connection_t::create(msgr_rdma_context_t *ctx,
} }
conn->addr.lid = ctx->my_lid; conn->addr.lid = ctx->my_lid;
conn->addr.gid = ctx->my_gid; conn->addr.gid = ctx->my_gid.gid;
conn->addr.qpn = conn->qp->qp_num; conn->addr.qpn = conn->qp->qp_num;
conn->addr.psn = lrand48() & 0xffffff; conn->addr.psn = lrand48() & 0xffffff;
@ -598,6 +791,7 @@ void osd_messenger_t::handle_rdma_events()
} }
fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status)); fprintf(stderr, " with status: %s, stopping client\n", ibv_wc_status_str(wc[i].status));
stop_client(client_id); stop_client(client_id);
clear_immediate_ops(client_id);
continue; continue;
} }
if (!is_send) if (!is_send)
@ -606,6 +800,7 @@ void osd_messenger_t::handle_rdma_events()
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len)) if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
{ {
// handle_read_buffer may stop the client // handle_read_buffer may stop the client
clear_immediate_ops(client_id);
continue; continue;
} }
try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]); try_recv_rdma_wr(cl, rc->recv_buffers[rc->next_recv_buf]);
@ -666,9 +861,5 @@ void osd_messenger_t::handle_rdma_events()
} }
} }
} while (event_count > 0); } while (event_count > 0);
for (auto cb: set_immediate) handle_immediate_ops();
{
cb();
}
set_immediate.clear();
} }

View File

@ -31,12 +31,12 @@ struct msgr_rdma_context_t
uint8_t ib_port; uint8_t ib_port;
uint8_t gid_index; uint8_t gid_index;
uint16_t my_lid; uint16_t my_lid;
ibv_gid my_gid; ibv_gid_entry my_gid;
uint32_t mtu; uint32_t mtu;
int max_cqe = 0; int max_cqe = 0;
int used_max_cqe = 0; int used_max_cqe = 0;
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level); static msgr_rdma_context_t *create(std::vector<std::string> osd_networks, const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
~msgr_rdma_context_t(); ~msgr_rdma_context_t();
}; };

View File

@ -65,6 +65,7 @@ void osd_messenger_t::read_requests()
bool osd_messenger_t::handle_read(int result, osd_client_t *cl) bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{ {
bool ret = false; bool ret = false;
int peer_fd = cl->peer_fd;
cl->read_msg.msg_iovlen = 0; cl->read_msg.msg_iovlen = 0;
cl->refs--; cl->refs--;
if (cl->peer_state == PEER_STOPPED) if (cl->peer_state == PEER_STOPPED)
@ -101,7 +102,8 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{ {
if (!handle_read_buffer(cl, cl->in_buf, result)) if (!handle_read_buffer(cl, cl->in_buf, result))
{ {
goto fin; clear_immediate_ops(peer_fd);
return false;
} }
} }
else else
@ -113,7 +115,8 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{ {
if (!handle_finished_read(cl)) if (!handle_finished_read(cl))
{ {
goto fin; clear_immediate_ops(peer_fd);
return false;
} }
} }
} }
@ -122,15 +125,47 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
ret = true; ret = true;
} }
} }
fin: handle_immediate_ops();
for (auto cb: set_immediate)
{
cb();
}
set_immediate.clear();
return ret; return ret;
} }
void osd_messenger_t::clear_immediate_ops(int peer_fd)
{
size_t i = 0, j = 0;
while (i < set_immediate_ops.size())
{
if (set_immediate_ops[i]->peer_fd == peer_fd)
{
delete set_immediate_ops[i];
}
else
{
if (i != j)
set_immediate_ops[j] = set_immediate_ops[i];
j++;
}
i++;
}
set_immediate_ops.resize(j);
}
void osd_messenger_t::handle_immediate_ops()
{
for (auto op: set_immediate_ops)
{
if (op->op_type == OSD_OP_IN)
{
exec_op(op);
}
else
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
}
}
set_immediate_ops.clear();
}
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain) bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
{ {
// Compose operation(s) from the buffer // Compose operation(s) from the buffer
@ -199,7 +234,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
{ {
// Operation is ready // Operation is ready
cl->received_ops.push_back(cl->read_op); cl->received_ops.push_back(cl->read_op);
set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); }); set_immediate_ops.push_back(cl->read_op);
cl->read_op = NULL; cl->read_op = NULL;
cl->read_state = 0; cl->read_state = 0;
} }
@ -295,7 +330,7 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{ {
// Operation is ready // Operation is ready
cl->received_ops.push_back(cur_op); cl->received_ops.push_back(cur_op);
set_immediate.push_back([this, cur_op]() { exec_op(cur_op); }); set_immediate_ops.push_back(cur_op);
cl->read_op = NULL; cl->read_op = NULL;
cl->read_state = 0; cl->read_state = 0;
} }
@ -416,9 +451,5 @@ void osd_messenger_t::handle_reply_ready(osd_op_t *op)
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 + (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000 (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
); );
set_immediate.push_back([op]() set_immediate_ops.push_back(op);
{
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
});
} }

View File

@ -16,7 +16,6 @@
#include "qapi/error.h" #include "qapi/error.h"
#include "qapi/qmp/qdict.h" #include "qapi/qmp/qdict.h"
#include "qapi/qmp/qerror.h" #include "qapi/qmp/qerror.h"
#include "qemu/uri.h"
#include "qemu/error-report.h" #include "qemu/error-report.h"
#include "qemu/module.h" #include "qemu/module.h"
#include "qemu/option.h" #include "qemu/option.h"
@ -1021,7 +1020,11 @@ static BlockDriver bdrv_vitastor = {
// FIXME: Implement it along with per-inode statistics // FIXME: Implement it along with per-inode statistics
//.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size, //.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size,
#if QEMU_VERSION_MAJOR > 9 || QEMU_VERSION_MAJOR == 9 && QEMU_VERSION_MINOR > 0
.bdrv_open = vitastor_file_open,
#else
.bdrv_file_open = vitastor_file_open, .bdrv_file_open = vitastor_file_open,
#endif
.bdrv_close = vitastor_close, .bdrv_close = vitastor_close,
// Option list for the create operation // Option list for the create operation

View File

@ -261,6 +261,7 @@ struct dd_out_info_t
else else
{ {
// ok // ok
out_size = owatch->cfg.size;
return true; return true;
} }
// Wait for sub-command // Wait for sub-command
@ -882,6 +883,8 @@ resume_2:
oinfo.end_fsync = oinfo.end_fsync && oinfo.out_seekable; oinfo.end_fsync = oinfo.end_fsync && oinfo.out_seekable;
read_offset = 0; read_offset = 0;
read_end = iinfo.in_seekable ? iinfo.in_size-iseek : 0; read_end = iinfo.in_seekable ? iinfo.in_size-iseek : 0;
if (oinfo.out_size && (!read_end || read_end > oinfo.out_size-oseek))
read_end = oinfo.out_size-oseek;
if (bytelimit && (!read_end || read_end > bytelimit)) if (bytelimit && (!read_end || read_end > bytelimit))
read_end = bytelimit; read_end = bytelimit;
clock_gettime(CLOCK_REALTIME, &tv_begin); clock_gettime(CLOCK_REALTIME, &tv_begin);

View File

@ -216,7 +216,7 @@ resume_1:
for (uint64_t osd_num: node.child_osds) for (uint64_t osd_num: node.child_osds)
{ {
auto & osd = placement_tree->osds.at(osd_num); auto & osd = placement_tree->osds.at(osd_num);
fmt_items.push_back(json11::Json::object{ auto json_osd = json11::Json::object{
{ "type", "osd" }, { "type", "osd" },
{ "name", osd.num }, { "name", osd.num },
{ "parent", node.name }, { "parent", node.name },
@ -230,7 +230,16 @@ resume_1:
{ "bitmap", (uint64_t)osd.bitmap_granularity }, { "bitmap", (uint64_t)osd.bitmap_granularity },
{ "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") }, { "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") },
{ "op_stats", osd_stats[osd_num]["op_stats"] }, { "op_stats", osd_stats[osd_num]["op_stats"] },
}); };
if (osd_stats[osd_num]["slow_ops_primary"].uint64_value() > 0)
{
json_osd["slow_ops_primary"] = osd_stats[osd_num]["slow_ops_primary"];
}
if (osd_stats[osd_num]["slow_ops_secondary"].uint64_value() > 0)
{
json_osd["slow_ops_secondary"] = osd_stats[osd_num]["slow_ops_secondary"];
}
fmt_items.push_back(json_osd);
} }
} }
result.data = fmt_items; result.data = fmt_items;

View File

@ -134,6 +134,7 @@ resume_2:
} }
int osd_count = 0, osd_up = 0; int osd_count = 0, osd_up = 0;
uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0; uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
std::vector<uint64_t> slow_op_primary_osds, slow_op_secondary_osds;
parent->iterate_kvs_1(osd_stats, "/osd/stats/", [&](uint64_t stat_osd_num, json11::Json value) parent->iterate_kvs_1(osd_stats, "/osd/stats/", [&](uint64_t stat_osd_num, json11::Json value)
{ {
osd_count++; osd_count++;
@ -141,18 +142,29 @@ resume_2:
auto osd_free = value["free"].uint64_value(); auto osd_free = value["free"].uint64_value();
total_raw += osd_size; total_raw += osd_size;
free_raw += osd_free; free_raw += osd_free;
if (!osd_free) if (osd_size)
{ {
osds_full++; if (!osd_free)
} {
else if (osd_free < (uint64_t)(osd_size*(1-osd_nearfull_ratio))) osds_full++;
{ }
osds_nearfull++; else if (osd_free < (uint64_t)(osd_size*(1-osd_nearfull_ratio)))
{
osds_nearfull++;
}
} }
auto peer_it = parent->cli->st_cli.peer_states.find(stat_osd_num); auto peer_it = parent->cli->st_cli.peer_states.find(stat_osd_num);
if (peer_it != parent->cli->st_cli.peer_states.end()) if (peer_it != parent->cli->st_cli.peer_states.end())
{ {
osd_up++; osd_up++;
if (value["slow_ops_primary"].uint64_value() > 0)
{
slow_op_primary_osds.push_back(stat_osd_num);
}
if (value["slow_ops_secondary"].uint64_value() > 0)
{
slow_op_secondary_osds.push_back(stat_osd_num);
}
} }
else else
{ {
@ -216,6 +228,10 @@ resume_2:
{ "mon_master", mon_master }, { "mon_master", mon_master },
{ "osd_up", osd_up }, { "osd_up", osd_up },
{ "osd_count", osd_count }, { "osd_count", osd_count },
{ "osds_full", osds_full },
{ "osds_nearfull", osds_nearfull },
{ "osds_primary_slow_ops", slow_op_primary_osds },
{ "osds_secondary_slow_ops", slow_op_secondary_osds },
{ "total_raw", total_raw }, { "total_raw", total_raw },
{ "free_raw", free_raw }, { "free_raw", free_raw },
{ "down_raw", down_raw }, { "down_raw", down_raw },
@ -300,6 +316,26 @@ resume_2:
warning_str += " "+std::to_string(osds_nearfull)+ warning_str += " "+std::to_string(osds_nearfull)+
(osds_nearfull > 1 ? " osds are almost full\n" : " osd is almost full\n"); (osds_nearfull > 1 ? " osds are almost full\n" : " osd is almost full\n");
} }
if (slow_op_primary_osds.size() > 0)
{
warning_str += " "+std::to_string(slow_op_primary_osds.size());
warning_str += (slow_op_primary_osds.size() > 1 ? " osds have" : " osd has");
warning_str += " slow client ops: ";
for (int i = 0; i < slow_op_primary_osds.size(); i++)
{
warning_str += (i > 0 ? ", " : "")+std::to_string(slow_op_primary_osds[i])+"\n";
}
}
if (slow_op_secondary_osds.size() > 0)
{
warning_str += " "+std::to_string(slow_op_secondary_osds.size());
warning_str += (slow_op_secondary_osds.size() > 1 ? " osds have" : " osd has");
warning_str += " slow replication ops: ";
for (int i = 0; i < slow_op_secondary_osds.size(); i++)
{
warning_str += (i > 0 ? ", " : "")+std::to_string(slow_op_secondary_osds[i])+"\n";
}
}
if (warning_str != "") if (warning_str != "")
{ {
warning_str = "\n warning:\n"+warning_str; warning_str = "\n warning:\n"+warning_str;

View File

@ -108,6 +108,10 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
try try
{ {
dsk.parse_config(options); dsk.parse_config(options);
// Set all offsets to 4096 to calculate metadata size with excess
dsk.journal_offset = 4096;
dsk.meta_offset = 4096;
dsk.data_offset = 4096;
dsk.data_io = dsk.meta_io = dsk.journal_io = (options["io"] == "cached" ? "cached" : "direct"); dsk.data_io = dsk.meta_io = dsk.journal_io = (options["io"] == "cached" ? "cached" : "direct");
dsk.open_data(); dsk.open_data();
dsk.open_meta(); dsk.open_meta();
@ -171,8 +175,8 @@ int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_
} }
sb["osd_num"] = osd_num; sb["osd_num"] = osd_num;
// Zero out metadata and journal // Zero out metadata and journal
if (write_zero(dsk.meta_fd, dsk.meta_offset, dsk.meta_len) != 0 || if (write_zero(dsk.meta_fd, sb["meta_offset"].uint64_value(), dsk.meta_len) != 0 ||
write_zero(dsk.journal_fd, dsk.journal_offset, dsk.journal_len) != 0) write_zero(dsk.journal_fd, sb["journal_offset"].uint64_value(), dsk.journal_len) != 0)
{ {
fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno)); fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno));
dsk.close_all(); dsk.close_all();
@ -498,6 +502,9 @@ int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std
{ {
blockstore_disk_t dsk; blockstore_disk_t dsk;
dsk.parse_config(options); dsk.parse_config(options);
dsk.journal_offset = 4096;
dsk.meta_offset = 4096;
dsk.data_offset = 4096;
dsk.data_io = dsk.meta_io = dsk.journal_io = "cached"; dsk.data_io = dsk.meta_io = dsk.journal_io = "cached";
dsk.open_data(); dsk.open_data();
dsk.open_meta(); dsk.open_meta();

View File

@ -535,10 +535,12 @@ void osd_t::print_stats()
void osd_t::print_slow() void osd_t::print_slow()
{ {
bool has_slow = false; cur_slow_op_primary = 0;
cur_slow_op_secondary = 0;
char alloc[1024]; char alloc[1024];
timespec now; timespec now;
clock_gettime(CLOCK_REALTIME, &now); clock_gettime(CLOCK_REALTIME, &now);
// FIXME: Also track slow local blockstore ops and recovery/flush/scrub ops
for (auto & kv: msgr.clients) for (auto & kv: msgr.clients)
{ {
for (auto op: kv.second->received_ops) for (auto op: kv.second->received_ops)
@ -608,6 +610,7 @@ void osd_t::print_slow()
op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK || op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ||
op->req.hdr.opcode == OSD_OP_SEC_READ_BMP) op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
{ {
cur_slow_op_secondary++;
bufprintf(" state=%d", op->bs_op ? PRIV(op->bs_op)->op_state : -1); bufprintf(" state=%d", op->bs_op ? PRIV(op->bs_op)->op_state : -1);
int wait_for = op->bs_op ? PRIV(op->bs_op)->wait_for : 0; int wait_for = op->bs_op ? PRIV(op->bs_op)->wait_for : 0;
if (wait_for) if (wait_for)
@ -618,15 +621,19 @@ void osd_t::print_slow()
else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE || else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
op->req.hdr.opcode == OSD_OP_SYNC || op->req.hdr.opcode == OSD_OP_DELETE) op->req.hdr.opcode == OSD_OP_SYNC || op->req.hdr.opcode == OSD_OP_DELETE)
{ {
cur_slow_op_primary++;
bufprintf(" state=%d", !op->op_data ? -1 : op->op_data->st); bufprintf(" state=%d", !op->op_data ? -1 : op->op_data->st);
} }
else
{
cur_slow_op_primary++;
}
#undef bufprintf #undef bufprintf
printf("%s\n", alloc); printf("%s\n", alloc);
has_slow = true;
} }
} }
} }
if (has_slow && bs) if ((cur_slow_op_primary+cur_slow_op_secondary) > 0 && bs)
{ {
bs->dump_diagnostics(); bs->dump_diagnostics();
} }

View File

@ -150,7 +150,9 @@ class osd_t
bool pg_config_applied = false; bool pg_config_applied = false;
bool etcd_reporting_pg_state = false; bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false; bool etcd_reporting_stats = false;
int autosync_timer_id = -1, print_stats_timer_id = -1, slow_log_timer_id = -1; int print_stats_timer_id = -1, slow_log_timer_id = -1;
uint64_t cur_slow_op_primary = 0;
uint64_t cur_slow_op_secondary = 0;
// peers and PGs // peers and PGs
@ -168,6 +170,8 @@ class osd_t
object_id recovery_last_oid; object_id recovery_last_oid;
int recovery_pg_done = 0, recovery_done = 0; int recovery_pg_done = 0, recovery_done = 0;
osd_op_t *autosync_op = NULL; osd_op_t *autosync_op = NULL;
int autosync_copies_to_delete = 0;
int autosync_timer_id = -1;
// Scrubbing // Scrubbing
uint64_t scrub_nearest_ts = 0; uint64_t scrub_nearest_ts = 0;

View File

@ -201,6 +201,14 @@ json11::Json osd_t::get_statistics()
st["immediate_commit"] = immediate_commit == IMMEDIATE_ALL ? "all" : (immediate_commit == IMMEDIATE_SMALL ? "small" : "none"); st["immediate_commit"] = immediate_commit == IMMEDIATE_ALL ? "all" : (immediate_commit == IMMEDIATE_SMALL ? "small" : "none");
st["host"] = self_state["host"]; st["host"] = self_state["host"];
st["version"] = VITASTOR_VERSION; st["version"] = VITASTOR_VERSION;
if (cur_slow_op_primary > 0)
{
st["slow_ops_primary"] = cur_slow_op_primary;
}
if (cur_slow_op_secondary > 0)
{
st["slow_ops_secondary"] = cur_slow_op_secondary;
}
json11::Json::object op_stats, subop_stats; json11::Json::object op_stats, subop_stats;
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++) for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
{ {

View File

@ -13,10 +13,11 @@ void osd_t::submit_pg_flush_ops(pg_t & pg)
bool first = true; bool first = true;
while (it != pg.flush_actions.end()) while (it != pg.flush_actions.end())
{ {
if (!first && (it->first.oid.inode != prev_it->first.oid.inode || if (!first &&
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) && (it->first.oid.inode != prev_it->first.oid.inode ||
fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH || (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) &&
fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH) (fb->rollback_lists[it->first.osd_num].size() >= FLUSH_BATCH ||
fb->stable_lists[it->first.osd_num].size() >= FLUSH_BATCH))
{ {
// Stop only at the object boundary // Stop only at the object boundary
break; break;
@ -75,6 +76,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
// Throw the result away // Throw the result away
return; return;
} }
fb->flush_done++;
if (retval != 0) if (retval != 0)
{ {
if (peer_osd == this->osd_num) if (peer_osd == this->osd_num)
@ -92,12 +94,11 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
auto fd_it = msgr.osd_peer_fds.find(peer_osd); auto fd_it = msgr.osd_peer_fds.find(peer_osd);
if (fd_it != msgr.osd_peer_fds.end()) if (fd_it != msgr.osd_peer_fds.end())
{ {
// Will repeer/stop this PG
msgr.stop_client(fd_it->second); msgr.stop_client(fd_it->second);
} }
return;
} }
} }
fb->flush_done++;
if (fb->flush_done == fb->flush_ops) if (fb->flush_done == fb->flush_ops)
{ {
// This flush batch is done // This flush batch is done

View File

@ -645,6 +645,18 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t **objec
{ {
throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state)); throw std::runtime_error("BUG: Invalid object state: "+std::to_string((*object_state)->state));
} }
if (changed && immediate_commit != IMMEDIATE_ALL)
{
// Trigger double automatic sync after changing PG state when we're running with fsyncs.
// First autosync commits all written objects and applies copies_to_delete_after_sync;
// Second autosync commits all deletions run by the first sync.
// Without it, rebalancing in a cluster without load may result in some small amount of
// garbage left on "extra" OSDs of the PG, because last deletions are not synced at all.
// FIXME: 1000% correct way is to switch PG state only after copies_to_delete_after_sync.
// But it's much more complicated.
unstable_write_count += autosync_writes;
autosync_copies_to_delete = 2;
}
if (changed && report) if (changed && report)
{ {
report_pg_state(pg); report_pg_state(pg);

View File

@ -9,6 +9,10 @@ void osd_t::autosync()
{ {
if (immediate_commit != IMMEDIATE_ALL && !autosync_op) if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
{ {
if (autosync_copies_to_delete > 0)
{
autosync_copies_to_delete--;
}
autosync_op = new osd_op_t(); autosync_op = new osd_op_t();
autosync_op->op_type = OSD_OP_IN; autosync_op->op_type = OSD_OP_IN;
autosync_op->peer_fd = SELF_FD; autosync_op->peer_fd = SELF_FD;
@ -29,6 +33,11 @@ void osd_t::autosync()
} }
delete autosync_op; delete autosync_op;
autosync_op = NULL; autosync_op = NULL;
if (autosync_copies_to_delete > 0)
{
// Trigger the second "copies_to_delete" autosync
autosync();
}
}; };
exec_op(autosync_op); exec_op(autosync_op);
} }

View File

@ -213,6 +213,15 @@ resume_8:
{ {
goto resume_6; goto resume_6;
} }
if (immediate_commit == IMMEDIATE_NONE)
{
// Mark OSDs as dirty because deletions have to be synced too!
for (int i = 0; i < op_data->copies_to_delete_count; i++)
{
auto & chunk = op_data->copies_to_delete[i];
this->dirty_osds.insert(chunk.osd_num);
}
}
} }
for (int i = 0; i < op_data->dirty_pg_count; i++) for (int i = 0; i < op_data->dirty_pg_count; i++)
{ {
@ -227,7 +236,7 @@ resume_8:
start_pg_peering(pg); start_pg_peering(pg);
} }
} }
// FIXME: Free those in the destructor? // FIXME: Free those in the destructor (not here)?
free(op_data->dirty_pgs); free(op_data->dirty_pgs);
op_data->dirty_pgs = NULL; op_data->dirty_pgs = NULL;
op_data->dirty_osds = NULL; op_data->dirty_osds = NULL;

View File

@ -7,6 +7,12 @@
bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg) bool osd_t::check_write_queue(osd_op_t *cur_op, pg_t & pg)
{ {
osd_primary_op_data_t *op_data = cur_op->op_data; osd_primary_op_data_t *op_data = cur_op->op_data;
// First check if PG is not active anymore
if (!(pg.state & PG_ACTIVE))
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
return false;
}
// Check if actions are pending for this object // Check if actions are pending for this object
auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){ auto act_it = pg.flush_actions.lower_bound((obj_piece_id_t){
.oid = op_data->oid, .oid = op_data->oid,

View File

@ -65,7 +65,7 @@ std::string addr_to_string(const sockaddr_storage &addr)
return std::string(peer_str)+":"+std::to_string(port); return std::string(peer_str)+":"+std::to_string(port);
} }
static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits) bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
{ {
if (bits == 0) if (bits == 0)
{ {
@ -75,7 +75,7 @@ static bool cidr_match(const in_addr &addr, const in_addr &net, uint8_t bits)
return !((addr.s_addr ^ net.s_addr) & htonl(0xFFFFFFFFu << (32 - bits))); return !((addr.s_addr ^ net.s_addr) & htonl(0xFFFFFFFFu << (32 - bits)));
} }
static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits) bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits)
{ {
const uint32_t *a = address.s6_addr32; const uint32_t *a = address.s6_addr32;
const uint32_t *n = network.s6_addr32; const uint32_t *n = network.s6_addr32;
@ -93,47 +93,49 @@ static bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_
return true; return true;
} }
struct addr_mask_t addr_mask_t cidr_parse(std::string mask)
{ {
sa_family_t family; unsigned bits = 255;
int p = mask.find('/');
if (p != std::string::npos)
{
char null_byte = 0;
if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128)
throw std::runtime_error("Invalid IP address mask: " + mask);
mask = mask.substr(0, p);
}
in_addr ipv4; in_addr ipv4;
in6_addr ipv6; in6_addr ipv6;
uint8_t bits; if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
}; {
if (bits == 255)
bits = 32;
if (bits > 32)
throw std::runtime_error("Invalid IP address mask: " + mask);
return (addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)(bits ? bits : 32) };
}
else if (inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
{
if (bits == 255)
bits = 128;
return (addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits };
}
else
{
throw std::runtime_error("Invalid IP address mask: " + mask);
}
}
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool include_v6) std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg, bool include_v6)
{ {
std::vector<addr_mask_t> masks; std::vector<addr_mask_t> masks;
for (auto mask: mask_cfg) for (auto mask: mask_cfg)
{ {
unsigned bits = 0; masks.push_back(cidr_parse(mask));
int p = mask.find('/'); if (masks[masks.size()-1].family == AF_INET6)
if (p != std::string::npos)
{ {
char null_byte = 0; // Auto-enable IPv6 addresses
if (sscanf(mask.c_str()+p+1, "%u%c", &bits, &null_byte) != 1 || bits > 128) include_v6 = true;
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
mask = mask.substr(0, p);
}
in_addr ipv4;
in6_addr ipv6;
if (inet_pton(AF_INET, mask.c_str(), &ipv4) == 1)
{
if (bits > 32)
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
}
masks.push_back((addr_mask_t){ .family = AF_INET, .ipv4 = ipv4, .bits = (uint8_t)bits });
}
else if (include_v6 && inet_pton(AF_INET6, mask.c_str(), &ipv6) == 1)
{
masks.push_back((addr_mask_t){ .family = AF_INET6, .ipv6 = ipv6, .bits = (uint8_t)bits });
}
else
{
throw std::runtime_error((include_v6 ? "Invalid IPv4 address mask: " : "Invalid IP address mask: ") + mask);
} }
} }
std::set<std::string> addresses; std::set<std::string> addresses;

View File

@ -1,10 +1,22 @@
#pragma once #pragma once
#include <netinet/in.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <string> #include <string>
#include <vector> #include <vector>
struct addr_mask_t
{
sa_family_t family;
in_addr ipv4;
in6_addr ipv6;
uint8_t bits;
};
bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr); bool string_to_addr(std::string str, bool parse_port, int default_port, struct sockaddr_storage *addr);
std::string addr_to_string(const sockaddr_storage &addr); std::string addr_to_string(const sockaddr_storage &addr);
addr_mask_t cidr_parse(std::string mask);
bool cidr_match(const in_addr &address, const in_addr &network, uint8_t bits);
bool cidr6_match(const in6_addr &address, const in6_addr &network, uint8_t bits);
std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false); std::vector<std::string> getifaddr_list(std::vector<std::string> mask_cfg = std::vector<std::string>(), bool include_v6 = false);
int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port); int create_and_bind_socket(std::string bind_address, int bind_port, int listen_backlog, int *listening_port);

View File

@ -62,7 +62,7 @@ int timerfd_manager_t::set_timer_us(uint64_t micros, bool repeat, std::function<
.callback = callback, .callback = callback,
}); });
inc_timer(timers[timers.size()-1]); inc_timer(timers[timers.size()-1]);
set_nearest(); set_nearest(false);
return timer_id; return timer_id;
} }
@ -82,13 +82,13 @@ void timerfd_manager_t::clear_timer(int timer_id)
{ {
nearest--; nearest--;
} }
set_nearest(); set_nearest(false);
break; break;
} }
} }
} }
void timerfd_manager_t::set_nearest() void timerfd_manager_t::set_nearest(bool trigger_inline)
{ {
if (onstack > 0) if (onstack > 0)
{ {
@ -134,10 +134,13 @@ again:
} }
if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0) if (exp.it_value.tv_sec < 0 || exp.it_value.tv_sec == 0 && exp.it_value.tv_nsec <= 0)
{ {
// It already happened // It already happened - set minimal timeout
// FIXME: Postpone to setImmediate/BH to avoid reenterability problems if (trigger_inline)
trigger_nearest(); {
goto again; trigger_nearest();
goto again;
}
exp.it_value = { .tv_sec = 0, .tv_nsec = 1 };
} }
if (timerfd_settime(timerfd, 0, &exp, NULL)) if (timerfd_settime(timerfd, 0, &exp, NULL))
{ {
@ -157,7 +160,7 @@ void timerfd_manager_t::handle_readable()
trigger_nearest(); trigger_nearest();
} }
wait_state = 0; wait_state = 0;
set_nearest(); set_nearest(true);
} }
void timerfd_manager_t::trigger_nearest() void timerfd_manager_t::trigger_nearest()

View File

@ -26,7 +26,7 @@ class timerfd_manager_t
std::vector<timerfd_timer_t> timers; std::vector<timerfd_timer_t> timers;
void inc_timer(timerfd_timer_t & t); void inc_timer(timerfd_timer_t & t);
void set_nearest(); void set_nearest(bool trigger_inline);
void trigger_nearest(); void trigger_nearest();
void handle_readable(); void handle_readable();
public: public:

View File

@ -53,7 +53,7 @@ kill_osds &
LD_PRELOAD="build/src/client/libfio_vitastor.so" \ LD_PRELOAD="build/src/client/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bsrange=4k-128k -blockalign=4k -direct=1 -iodepth=32 -fsync=256 -rw=randrw \ fio -thread -name=test -ioengine=build/src/client/libfio_vitastor.so -bsrange=4k-128k -blockalign=4k -direct=1 -iodepth=32 -fsync=256 -rw=randrw \
-randrepeat=0 -refill_buffers=1 -mirror_file=./testdata/bin/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120 -serialize_overlap=1 -randrepeat=0 -refill_buffers=1 -mirror_file=./testdata/bin/mirror.bin -etcd=$ETCD_URL -image=testimg -loops=10 -runtime=120
qemu-img convert -S 4096 -p \ qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \ -f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:image=testimg" \