Compare commits

..

31 Commits

Author SHA1 Message Date
249a233b37 WIP another experiment - "smart" iothreads
Some checks failed
Test / test_snapshot_chain_ec (push) Successful in 2m55s
Test / test_rebalance_verify_imm (push) Successful in 3m55s
Test / test_root_node (push) Successful in 11s
Test / test_rebalance_verify (push) Successful in 4m32s
Test / test_switch_primary (push) Successful in 37s
Test / test_rebalance_verify_ec_imm (push) Successful in 2m53s
Test / test_write (push) Successful in 46s
Test / test_write_no_same (push) Successful in 17s
Test / test_write_xor (push) Successful in 1m55s
Test / test_rebalance_verify_ec (push) Successful in 5m59s
Test / test_heal_pg_size_2 (push) Successful in 3m58s
Test / test_heal_ec (push) Successful in 3m51s
Test / test_heal_csum_32k_dmj (push) Successful in 5m47s
Test / test_heal_csum_32k_dj (push) Successful in 6m10s
Test / test_heal_csum_4k_dmj (push) Successful in 6m50s
Test / test_osd_tags (push) Successful in 19s
Test / test_enospc (push) Successful in 1m11s
Test / test_heal_csum_4k_dj (push) Successful in 6m1s
Test / test_enospc_xor (push) Successful in 1m11s
Test / test_heal_csum_32k (push) Failing after 10m14s
Test / test_enospc_imm (push) Successful in 46s
Test / test_enospc_imm_xor (push) Successful in 1m3s
Test / test_scrub (push) Successful in 28s
Test / test_scrub_zero_osd_2 (push) Successful in 28s
Test / test_scrub_xor (push) Successful in 34s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 39s
Test / test_scrub_pg_size_3 (push) Successful in 49s
Test / test_scrub_ec (push) Successful in 25s
Test / test_nfs (push) Successful in 16s
Test / test_heal_csum_4k (push) Successful in 8m55s
2024-07-03 11:05:45 +03:00
d07e072212 Change bool wr to event mask in epoll_manager 2024-07-01 00:30:59 +03:00
21d1171ba4 Fix parsing after "slightly decopypasting" :)
Some checks failed
Test / test_rebalance_verify (push) Failing after 1m56s
Test / test_snapshot_chain_ec (push) Failing after 6m25s
Test / test_root_node (push) Successful in 17s
Test / test_rebalance_verify_imm (push) Successful in 5m47s
Test / test_switch_primary (push) Successful in 39s
Test / test_write (push) Successful in 43s
Test / test_write_no_same (push) Successful in 18s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m16s
Test / test_write_xor (push) Successful in 1m19s
Test / test_rebalance_verify_ec (push) Successful in 9m23s
Test / test_heal_pg_size_2 (push) Successful in 3m51s
Test / test_heal_ec (push) Successful in 4m57s
Test / test_heal_csum_32k_dmj (push) Successful in 4m44s
Test / test_heal_csum_32k_dj (push) Successful in 5m59s
Test / test_heal_csum_4k (push) Successful in 3m32s
Test / test_osd_tags (push) Successful in 12s
Test / test_enospc (push) Successful in 37s
Test / test_heal_csum_32k (push) Failing after 10m18s
Test / test_heal_csum_4k_dmj (push) Successful in 9m38s
Test / test_enospc_imm (push) Successful in 58s
Test / test_enospc_xor (push) Successful in 1m10s
Test / test_enospc_imm_xor (push) Successful in 57s
Test / test_scrub (push) Successful in 27s
Test / test_heal_csum_4k_dj (push) Failing after 10m14s
Test / test_scrub_zero_osd_2 (push) Successful in 23s
Test / test_scrub_xor (push) Successful in 35s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 37s
Test / test_scrub_ec (push) Successful in 36s
Test / test_scrub_pg_size_3 (push) Successful in 44s
Test / test_nfs (push) Successful in 14s
2024-06-29 00:09:30 +03:00
ace
8f83086889 Nova and cinder driver patches for OpenStack 2023.2 2024-06-28 00:04:57 +03:00
ceb18f25db Add libvirt 10.0 patch (same as 9.10 and 10.4 actually) 2024-06-28 00:03:46 +03:00
ed51a89f70 Add QEMU 8.2 and 9.0 patches 2024-06-27 12:33:16 +03:00
f59456f22d Add libvirt 10.4 patch (same as 9.10 actually) 2024-06-27 01:35:29 +03:00
ca63cd507d Fix possible infinite loop in flusher (surprisingly reproduced in test_write.sh with iothreads)
All checks were successful
Test / test_snapshot_chain_ec (push) Successful in 3m5s
Test / test_rebalance_verify_imm (push) Successful in 3m41s
Test / test_root_node (push) Successful in 11s
Test / test_rebalance_verify (push) Successful in 4m22s
Test / test_switch_primary (push) Successful in 37s
Test / test_write (push) Successful in 40s
Test / test_write_no_same (push) Successful in 20s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m5s
Test / test_write_xor (push) Successful in 1m13s
Test / test_rebalance_verify_ec (push) Successful in 5m21s
Test / test_heal_pg_size_2 (push) Successful in 4m15s
Test / test_heal_ec (push) Successful in 4m58s
Test / test_heal_csum_32k_dmj (push) Successful in 5m30s
Test / test_heal_csum_32k_dj (push) Successful in 6m12s
Test / test_heal_csum_4k (push) Successful in 5m32s
Test / test_osd_tags (push) Successful in 12s
Test / test_enospc (push) Successful in 41s
Test / test_heal_csum_4k_dj (push) Successful in 8m33s
Test / test_enospc_xor (push) Successful in 53s
Test / test_enospc_imm (push) Successful in 43s
Test / test_scrub (push) Successful in 27s
Test / test_enospc_imm_xor (push) Successful in 52s
Test / test_scrub_zero_osd_2 (push) Successful in 29s
Test / test_scrub_xor (push) Successful in 36s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 41s
Test / test_scrub_ec (push) Successful in 25s
Test / test_scrub_pg_size_3 (push) Successful in 49s
Test / test_nfs (push) Successful in 16s
Test / test_heal_csum_32k (push) Successful in 5m9s
Test / test_heal_csum_4k_dmj (push) Successful in 5m7s
2024-06-27 00:38:01 +03:00
ea0d72289c Treat copied buffers as written only after completing the write in client
All checks were successful
Test / test_snapshot_chain_ec (push) Successful in 3m18s
Test / test_rebalance_verify_imm (push) Successful in 3m22s
Test / test_root_node (push) Successful in 57s
Test / test_rebalance_verify (push) Successful in 5m10s
Test / test_switch_primary (push) Successful in 37s
Test / test_write (push) Successful in 55s
Test / test_write_xor (push) Successful in 53s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m40s
Test / test_write_no_same (push) Successful in 15s
Test / test_rebalance_verify_ec (push) Successful in 6m34s
Test / test_heal_pg_size_2 (push) Successful in 4m32s
Test / test_heal_csum_32k_dmj (push) Successful in 4m24s
Test / test_heal_ec (push) Successful in 4m54s
Test / test_heal_csum_32k_dj (push) Successful in 5m58s
Test / test_heal_csum_32k (push) Successful in 9m43s
Test / test_heal_csum_4k_dmj (push) Successful in 9m41s
Test / test_osd_tags (push) Successful in 13s
Test / test_heal_csum_4k_dj (push) Successful in 10m0s
Test / test_enospc (push) Successful in 44s
Test / test_enospc_xor (push) Successful in 55s
Test / test_enospc_imm (push) Successful in 44s
Test / test_scrub (push) Successful in 31s
Test / test_enospc_imm_xor (push) Successful in 57s
Test / test_scrub_zero_osd_2 (push) Successful in 23s
Test / test_scrub_xor (push) Successful in 27s
Test / test_scrub_pg_size_3 (push) Successful in 46s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 40s
Test / test_scrub_ec (push) Successful in 25s
Test / test_nfs (push) Successful in 14s
Test / test_heal_csum_4k (push) Successful in 9m27s
SYNC operation fsyncs only completed operations, so treating writes as "eligible
for fsync" before actually completing them is incorrect

It affected SCHEME=ec test_heal.sh (with immediate_commit=none) test - it was
flapping with lost writes - some non-fsynced writes were legitimately lost by
the OSD, but weren't repeated by the client
2024-06-20 02:11:53 +03:00
e400a851f4 Repeat dirty buffer flushes on any PG primary change because the new primary may not know about unfinished operations of the old primary
Some checks reported warnings
Test / test_rebalance_verify_ec (push) Has been cancelled
Test / test_rebalance_verify_ec_imm (push) Has been cancelled
Test / test_root_node (push) Has been cancelled
Test / test_switch_primary (push) Has been cancelled
Test / test_write (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_minsize_1 (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_osd_tags (push) Has been cancelled
Test / test_enospc (push) Has been cancelled
Test / test_enospc_xor (push) Has been cancelled
Test / test_enospc_imm (push) Has been cancelled
Test / test_enospc_imm_xor (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled
Test / test_scrub_ec (push) Has been cancelled
Test / test_nfs (push) Has been cancelled
Test / test_interrupted_rebalance_ec_imm (push) Has been cancelled
Test / test_snapshot (push) Has been cancelled
2024-06-19 00:28:26 +03:00
0fec7a9fea Drop dirty peer connections also when stopping PG to guarantee that clients do not miss fsync 2024-06-19 00:28:26 +03:00
b9de2a92a9 Print OSD performance stats
Some checks failed
Test / test_snapshot_chain_ec (push) Successful in 3m9s
Test / test_rebalance_verify_imm (push) Successful in 3m47s
Test / test_root_node (push) Successful in 10s
Test / test_rebalance_verify (push) Successful in 4m35s
Test / test_switch_primary (push) Successful in 35s
Test / test_rebalance_verify_ec_imm (push) Successful in 2m55s
Test / test_write (push) Successful in 55s
Test / test_write_no_same (push) Successful in 21s
Test / test_write_xor (push) Successful in 1m19s
Test / test_rebalance_verify_ec (push) Successful in 6m10s
Test / test_heal_pg_size_2 (push) Successful in 3m51s
Test / test_heal_ec (push) Successful in 5m10s
Test / test_heal_csum_32k_dmj (push) Successful in 4m46s
Test / test_heal_csum_32k_dj (push) Successful in 6m25s
Test / test_heal_csum_32k (push) Successful in 6m13s
Test / test_osd_tags (push) Successful in 41s
Test / test_heal_csum_4k_dj (push) Successful in 6m26s
Test / test_heal_csum_4k_dmj (push) Successful in 6m28s
Test / test_enospc (push) Successful in 1m53s
Test / test_enospc_imm (push) Successful in 51s
Test / test_enospc_xor (push) Successful in 1m37s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 34s
Test / test_enospc_imm_xor (push) Successful in 1m29s
Test / test_heal_csum_4k (push) Successful in 5m46s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 43s
Test / test_scrub_pg_size_3 (push) Successful in 53s
Test / test_scrub_ec (push) Successful in 21s
Test / test_nfs (push) Successful in 15s
Test / test_scrub_xor (push) Failing after 3m7s
2024-06-17 13:02:58 +03:00
5360a70853 Make OSD also report derived stats 2024-06-17 13:02:52 +03:00
4c2328eb13 Implement ls-osd command
All checks were successful
Test / test_snapshot_chain_ec (push) Successful in 2m45s
Test / test_rebalance_verify_imm (push) Successful in 2m27s
Test / test_root_node (push) Successful in 11s
Test / test_rebalance_verify (push) Successful in 3m1s
Test / test_switch_primary (push) Successful in 35s
Test / test_write (push) Successful in 41s
Test / test_write_no_same (push) Successful in 19s
Test / test_write_xor (push) Successful in 1m2s
Test / test_rebalance_verify_ec (push) Successful in 3m53s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m40s
Test / test_heal_pg_size_2 (push) Successful in 3m46s
Test / test_heal_ec (push) Successful in 3m49s
Test / test_heal_csum_32k_dmj (push) Successful in 5m41s
Test / test_heal_csum_32k_dj (push) Successful in 6m12s
Test / test_heal_csum_32k (push) Successful in 6m57s
Test / test_heal_csum_4k_dmj (push) Successful in 6m42s
Test / test_osd_tags (push) Successful in 27s
Test / test_enospc (push) Successful in 1m59s
Test / test_heal_csum_4k_dj (push) Successful in 6m33s
Test / test_enospc_xor (push) Successful in 2m24s
Test / test_enospc_imm (push) Successful in 1m32s
Test / test_scrub (push) Successful in 1m1s
Test / test_heal_csum_4k (push) Successful in 6m38s
Test / test_enospc_imm_xor (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 33s
Test / test_scrub_xor (push) Successful in 32s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 39s
Test / test_nfs (push) Successful in 16s
Test / test_scrub_pg_size_3 (push) Successful in 48s
Test / test_scrub_ec (push) Successful in 21s
2024-06-17 02:22:14 +03:00
313daef12d Slightly decopypaste etcd key parsing 2024-06-17 01:38:42 +03:00
ad9c12e1b9 Fix Pseudo-FS initialization leading to ENOENTs some time after start
Some checks failed
Test / test_snapshot_chain_ec (push) Successful in 2m44s
Test / test_rebalance_verify_imm (push) Successful in 3m57s
Test / test_root_node (push) Successful in 12s
Test / test_rebalance_verify (push) Successful in 5m9s
Test / test_switch_primary (push) Successful in 40s
Test / test_write (push) Successful in 1m2s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 1m23s
Test / test_rebalance_verify_ec (push) Successful in 6m33s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m57s
Test / test_heal_pg_size_2 (push) Successful in 3m54s
Test / test_heal_csum_32k_dmj (push) Successful in 5m10s
Test / test_heal_csum_32k_dj (push) Successful in 5m9s
Test / test_heal_csum_32k (push) Successful in 5m43s
Test / test_osd_tags (push) Successful in 1m3s
Test / test_heal_csum_4k_dmj (push) Successful in 5m38s
Test / test_heal_csum_4k_dj (push) Successful in 5m38s
Test / test_enospc (push) Successful in 1m37s
Test / test_enospc_imm (push) Successful in 1m23s
Test / test_enospc_xor (push) Successful in 2m0s
Test / test_scrub (push) Successful in 45s
Test / test_enospc_imm_xor (push) Successful in 1m32s
Test / test_scrub_xor (push) Successful in 30s
Test / test_scrub_zero_osd_2 (push) Successful in 41s
Test / test_heal_csum_4k (push) Successful in 5m29s
Test / test_nfs (push) Successful in 21s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 33s
Test / test_scrub_ec (push) Successful in 31s
Test / test_scrub_pg_size_3 (push) Successful in 56s
Test / test_heal_ec (push) Failing after 2m54s
2024-06-16 23:43:09 +03:00
4473eb5512 Fix slow & failing CAS layer merge
All checks were successful
Test / test_snapshot_chain_ec (push) Successful in 2m56s
Test / test_rebalance_verify_imm (push) Successful in 2m44s
Test / test_root_node (push) Successful in 12s
Test / test_rebalance_verify (push) Successful in 3m24s
Test / test_switch_primary (push) Successful in 34s
Test / test_rebalance_verify_ec (push) Successful in 3m3s
Test / test_write_xor (push) Successful in 44s
Test / test_write_no_same (push) Successful in 15s
Test / test_rebalance_verify_ec_imm (push) Successful in 2m30s
Test / test_heal_pg_size_2 (push) Successful in 4m37s
Test / test_heal_csum_32k_dmj (push) Successful in 4m30s
Test / test_heal_ec (push) Successful in 4m45s
Test / test_heal_csum_32k_dj (push) Successful in 6m8s
Test / test_heal_csum_4k_dmj (push) Successful in 6m39s
Test / test_heal_csum_32k (push) Successful in 6m42s
Test / test_heal_csum_4k_dj (push) Successful in 6m30s
Test / test_osd_tags (push) Successful in 18s
Test / test_enospc (push) Successful in 1m21s
Test / test_enospc_imm (push) Successful in 1m13s
Test / test_enospc_xor (push) Successful in 2m2s
Test / test_scrub (push) Successful in 1m5s
Test / test_enospc_imm_xor (push) Successful in 1m42s
Test / test_scrub_zero_osd_2 (push) Successful in 1m1s
Test / test_heal_csum_4k (push) Successful in 6m18s
Test / test_scrub_xor (push) Successful in 31s
Test / test_nfs (push) Successful in 24s
Test / test_scrub_ec (push) Successful in 34s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 40s
Test / test_scrub_pg_size_3 (push) Successful in 43s
Test / test_write (push) Successful in 40s
2024-06-14 02:15:49 +03:00
6501abc060 Set default etcd_ws_keepalive_interval to 5
All checks were successful
Test / test_snapshot_chain_ec (push) Successful in 2m52s
Test / test_rebalance_verify_imm (push) Successful in 3m4s
Test / test_root_node (push) Successful in 13s
Test / test_rebalance_verify (push) Successful in 3m46s
Test / test_switch_primary (push) Successful in 36s
Test / test_write (push) Successful in 44s
Test / test_write_no_same (push) Successful in 20s
Test / test_write_xor (push) Successful in 1m7s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m8s
Test / test_rebalance_verify_ec (push) Successful in 5m25s
Test / test_heal_pg_size_2 (push) Successful in 3m27s
Test / test_heal_csum_32k_dmj (push) Successful in 5m42s
Test / test_heal_csum_32k_dj (push) Successful in 6m28s
Test / test_heal_csum_32k (push) Successful in 6m40s
Test / test_osd_tags (push) Successful in 25s
Test / test_heal_csum_4k_dmj (push) Successful in 7m16s
Test / test_enospc (push) Successful in 2m12s
Test / test_heal_csum_4k_dj (push) Successful in 6m26s
Test / test_enospc_imm (push) Successful in 1m41s
Test / test_enospc_xor (push) Successful in 2m21s
Test / test_heal_csum_4k (push) Successful in 6m19s
Test / test_enospc_imm_xor (push) Successful in 1m32s
Test / test_scrub (push) Successful in 49s
Test / test_scrub_zero_osd_2 (push) Successful in 34s
Test / test_scrub_xor (push) Successful in 31s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 40s
Test / test_nfs (push) Successful in 18s
Test / test_scrub_pg_size_3 (push) Successful in 46s
Test / test_scrub_ec (push) Successful in 27s
Test / test_heal_ec (push) Successful in 2m57s
2024-06-08 00:38:48 +03:00
1228403e74 Implement internal restart / run_forever in monitor
Some checks failed
Test / test_rebalance_verify_imm (push) Successful in 1m55s
Test / test_rebalance_verify (push) Successful in 2m42s
Test / test_root_node (push) Successful in 1m19s
Test / test_switch_primary (push) Successful in 33s
Test / test_rebalance_verify_ec (push) Successful in 3m21s
Test / test_etcd_fail (push) Failing after 10m8s
Test / test_write (push) Successful in 53s
Test / test_write_no_same (push) Successful in 16s
Test / test_write_xor (push) Successful in 1m0s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m59s
Test / test_heal_pg_size_2 (push) Successful in 4m37s
Test / test_heal_csum_32k_dmj (push) Failing after 4m45s
Test / test_heal_ec (push) Successful in 5m48s
Test / test_heal_csum_32k_dj (push) Successful in 6m12s
Test / test_heal_csum_32k (push) Successful in 6m30s
Test / test_heal_csum_4k_dmj (push) Successful in 6m16s
Test / test_osd_tags (push) Successful in 34s
Test / test_heal_csum_4k_dj (push) Successful in 6m50s
Test / test_enospc (push) Successful in 1m34s
Test / test_enospc_imm (push) Successful in 1m4s
Test / test_enospc_xor (push) Successful in 2m6s
Test / test_heal_csum_4k (push) Successful in 6m47s
Test / test_enospc_imm_xor (push) Successful in 1m26s
Test / test_scrub (push) Successful in 37s
Test / test_scrub_zero_osd_2 (push) Successful in 35s
Test / test_scrub_xor (push) Successful in 28s
Test / test_nfs (push) Successful in 19s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 31s
Test / test_scrub_pg_size_3 (push) Successful in 45s
Test / test_scrub_ec (push) Successful in 29s
2024-06-08 00:35:18 +03:00
4eabebd245 Put all configuration to Mon.config
All checks were successful
Test / test_snapshot_chain_ec (push) Successful in 2m50s
Test / test_rebalance_verify_imm (push) Successful in 2m48s
Test / test_rebalance_verify (push) Successful in 3m18s
Test / test_root_node (push) Successful in 11s
Test / test_switch_primary (push) Successful in 35s
Test / test_write (push) Successful in 39s
Test / test_write_no_same (push) Successful in 19s
Test / test_write_xor (push) Successful in 1m2s
Test / test_rebalance_verify_ec (push) Successful in 4m7s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m49s
Test / test_heal_pg_size_2 (push) Successful in 3m24s
Test / test_heal_ec (push) Successful in 5m29s
Test / test_heal_csum_32k_dmj (push) Successful in 6m5s
Test / test_heal_csum_32k_dj (push) Successful in 5m52s
Test / test_heal_csum_32k (push) Successful in 6m44s
Test / test_osd_tags (push) Successful in 21s
Test / test_enospc (push) Successful in 2m16s
Test / test_heal_csum_4k_dmj (push) Successful in 6m44s
Test / test_heal_csum_4k (push) Successful in 6m31s
Test / test_heal_csum_4k_dj (push) Successful in 6m45s
Test / test_enospc_imm (push) Successful in 1m21s
Test / test_enospc_xor (push) Successful in 1m28s
Test / test_scrub_zero_osd_2 (push) Successful in 35s
Test / test_scrub (push) Successful in 39s
Test / test_scrub_xor (push) Successful in 36s
Test / test_enospc_imm_xor (push) Successful in 49s
Test / test_nfs (push) Successful in 24s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 35s
Test / test_scrub_ec (push) Successful in 33s
Test / test_scrub_pg_size_3 (push) Successful in 41s
2024-06-07 00:20:38 +03:00
cf60b6818c Extract PG generation into pg_gen.js
All checks were successful
Test / test_snapshot_chain_ec (push) Successful in 2m56s
Test / test_rebalance_verify_imm (push) Successful in 3m2s
Test / test_root_node (push) Successful in 12s
Test / test_rebalance_verify (push) Successful in 3m35s
Test / test_switch_primary (push) Successful in 33s
Test / test_write (push) Successful in 40s
Test / test_write_no_same (push) Successful in 19s
Test / test_write_xor (push) Successful in 1m3s
Test / test_rebalance_verify_ec (push) Successful in 4m0s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m39s
Test / test_heal_pg_size_2 (push) Successful in 3m29s
Test / test_heal_csum_32k_dmj (push) Successful in 5m22s
Test / test_heal_ec (push) Successful in 6m27s
Test / test_heal_csum_32k_dj (push) Successful in 5m40s
Test / test_heal_csum_32k (push) Successful in 6m50s
Test / test_osd_tags (push) Successful in 25s
Test / test_heal_csum_4k_dmj (push) Successful in 6m28s
Test / test_enospc (push) Successful in 2m13s
Test / test_heal_csum_4k_dj (push) Successful in 6m10s
Test / test_heal_csum_4k (push) Successful in 6m12s
Test / test_scrub (push) Successful in 44s
Test / test_enospc_imm (push) Successful in 1m1s
Test / test_enospc_xor (push) Successful in 1m24s
Test / test_enospc_imm_xor (push) Successful in 1m5s
Test / test_scrub_zero_osd_2 (push) Successful in 27s
Test / test_scrub_xor (push) Successful in 28s
Test / test_nfs (push) Successful in 19s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 37s
Test / test_scrub_pg_size_3 (push) Successful in 48s
Test / test_scrub_ec (push) Successful in 29s
2024-06-05 11:22:06 +03:00
1a4a7cdc37 Extract OSD Tree generation functions to osd_tree.js 2024-06-05 11:19:35 +03:00
1b48085e21 Extract remote etcd interaction to etcd_adapter.js 2024-06-05 11:19:35 +03:00
a71847244e Rename PGUtil.js to pg_utils.js 2024-06-05 10:51:20 +03:00
848c2d2722 Move LPOptimizer, DSL and tests to lp_optimizer/ 2024-06-05 10:51:20 +03:00
86832dc43f Add eslint import/no-unresolved 2024-06-05 10:51:20 +03:00
1f6da79463 Extract stats calculation into a separate file 2024-06-05 10:51:20 +03:00
9bf57c3760 Mention generic Toshiba MG instead of specific MGxx, fix russian vitastorfs link 2024-06-05 02:08:09 +03:00
a0305b5b4a Extract pool configuration validation into a separate file 2024-06-05 02:08:08 +03:00
1546f8e447 Extract etcd data "schema" into a separate file 2024-06-05 02:07:53 +03:00
8ce962b312 Move scripts 2024-06-05 02:07:53 +03:00
88 changed files with 5367 additions and 2137 deletions

View File

@@ -1,2 +1,3 @@
mon usr/lib/vitastor
mon/vitastor-mon.service /lib/systemd/system
mon usr/lib/vitastor/mon
mon/scripts/make-etcd usr/lib/vitastor/mon
mon/scripts/vitastor-mon.service /lib/systemd/system

View File

@@ -1,6 +1,6 @@
usr/bin/vitastor-osd
usr/bin/vitastor-disk
usr/bin/vitastor-dump-journal
mon/vitastor-osd@.service /lib/systemd/system
mon/vitastor.target /lib/systemd/system
mon/90-vitastor.rules /lib/udev/rules.d
mon/scripts/vitastor-osd@.service /lib/systemd/system
mon/scripts/vitastor.target /lib/systemd/system
mon/scripts/90-vitastor.rules /lib/udev/rules.d

View File

@@ -248,7 +248,7 @@ etcd_report_interval to guarantee that keepalive actually works.
## etcd_ws_keepalive_interval
- Type: seconds
- Default: 30
- Default: 5
- Can be changed online: yes
etcd websocket ping interval required to keep the connection alive and

View File

@@ -259,7 +259,7 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
## etcd_ws_keepalive_interval
- Тип: секунды
- Значение по умолчанию: 30
- Значение по умолчанию: 5
- Можно менять на лету: да
Интервал проверки живости вебсокет-подключений к etcd.

View File

@@ -282,7 +282,7 @@
etcd_report_interval, чтобы keepalive гарантированно работал.
- name: etcd_ws_keepalive_interval
type: sec
default: 30
default: 5
online: true
info: |
etcd websocket ping interval required to keep the connection alive and

View File

@@ -22,7 +22,7 @@
with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
[here](../config/layout-cluster.en.md#immediate_commit).
- If you want to use HDDs, get modern HDDs with Media Cache or SSD Cache: HGST Ultrastar,
Toshiba MG08, Seagate EXOS or something similar. If your drives don't have such cache then
Toshiba MG, Seagate EXOS or something similar. If your drives don't have such cache then
you also need small SSDs for journal and metadata (even 2 GB per 1 TB of HDD space is enough).
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
@@ -33,7 +33,7 @@
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
- HDD: HGST Ultrastar, Toshiba MG, Seagate EXOS
## Configure monitors

View File

@@ -123,4 +123,4 @@ vitastor-cli create -s 10G testimg
Если вы хотите использовать не только блочные образы виртуальных машин или контейнеров,
а также кластерную файловую систему, то:
- [Следуйте инструкциям](../usage/nfs.en.md#vitastorfs)
- [Следуйте инструкциям](../usage/nfs.ru.md#vitastorfs)

View File

@@ -11,6 +11,7 @@ module.exports = {
"ecmaVersion": 2020
},
"plugins": [
"import"
],
"rules": {
"indent": [
@@ -44,6 +45,10 @@ module.exports = {
],
"node/shebang": [
"off"
],
"import/no-unresolved": [
2,
{ "commonjs": true }
]
}
};

356
mon/etcd_adapter.js Normal file
View File

@@ -0,0 +1,356 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const http = require('http');
const WebSocket = require('ws');
const MON_STOPPED = 'Monitor instance is stopped';
class EtcdAdapter
{
constructor(mon)
{
this.mon = mon;
this.ws = null;
this.ws_alive = false;
this.ws_keepalive_timer = null;
}
parse_config(config)
{
this.parse_etcd_addresses(config.etcd_address||config.etcd_url);
}
parse_etcd_addresses(addrs)
{
const is_local_ip = this.mon.local_ips(true).reduce((a, c) => { a[c] = true; return a; }, {});
this.etcd_local = [];
this.etcd_urls = [];
this.selected_etcd_url = null;
this.etcd_urls_to_try = [];
if (!(addrs instanceof Array))
addrs = addrs ? (''+(addrs||'')).split(/,/) : [];
if (!addrs.length)
{
console.error('Vitastor etcd address(es) not specified. Please set on the command line or in the config file');
process.exit(1);
}
for (let url of addrs)
{
let scheme = 'http';
url = url.trim().replace(/^(https?):\/\//, (m, m1) => { scheme = m1; return ''; });
const slash = url.indexOf('/');
const colon = url.indexOf(':');
const is_local = is_local_ip[colon >= 0 ? url.substr(0, colon) : (slash >= 0 ? url.substr(0, slash) : url)];
url = scheme+'://'+(slash >= 0 ? url : url+'/v3');
if (is_local)
this.etcd_local.push(url);
else
this.etcd_urls.push(url);
}
}
pick_next_etcd()
{
if (this.selected_etcd_url)
return this.selected_etcd_url;
if (!this.etcd_urls_to_try || !this.etcd_urls_to_try.length)
{
this.etcd_urls_to_try = [ ...this.etcd_local ];
const others = [ ...this.etcd_urls ];
while (others.length)
{
const url = others.splice(0|(others.length*Math.random()), 1);
this.etcd_urls_to_try.push(url[0]);
}
}
this.selected_etcd_url = this.etcd_urls_to_try.shift();
return this.selected_etcd_url;
}
stop_watcher(cur_addr)
{
cur_addr = cur_addr || this.selected_etcd_url;
if (this.ws)
{
console.log('Disconnected from etcd at '+this.ws_used_url);
this.ws.close();
this.ws = null;
}
if (this.ws_keepalive_timer)
{
clearInterval(this.ws_keepalive_timer);
this.ws_keepalive_timer = null;
}
if (this.selected_etcd_url == cur_addr)
{
this.selected_etcd_url = null;
}
}
restart_watcher(cur_addr)
{
this.stop_watcher(cur_addr);
this.start_watcher(this.mon.config.etcd_mon_retries).catch(this.mon.die);
}
async start_watcher(retries)
{
let retry = 0;
if (!retries || retries < 1)
{
retries = 1;
}
const tried = {};
while (retries < 0 || retry < retries)
{
const cur_addr = this.pick_next_etcd();
const base = 'ws'+cur_addr.substr(4);
let now = Date.now();
if (tried[base] && now-tried[base] < this.mon.config.etcd_start_timeout)
{
await new Promise(ok => setTimeout(ok, this.mon.config.etcd_start_timeout-(now-tried[base])));
now = Date.now();
}
tried[base] = now;
if (this.mon.stopped)
{
return;
}
const ok = await new Promise(ok =>
{
const timer_id = setTimeout(() =>
{
if (this.ws)
{
console.log('Disconnected from etcd at '+this.ws_used_url);
this.ws.close();
this.ws = null;
}
ok(false);
}, this.mon.config.etcd_mon_timeout);
this.ws = new WebSocket(base+'/watch');
this.ws_used_url = cur_addr;
const fail = () =>
{
ok(false);
};
this.ws.on('error', fail);
this.ws.on('open', () =>
{
this.ws.removeListener('error', fail);
if (timer_id)
clearTimeout(timer_id);
ok(true);
});
});
if (ok)
break;
if (this.selected_etcd_url == cur_addr)
this.selected_etcd_url = null;
this.ws = null;
retry++;
}
if (!this.ws)
{
this.mon.die('Failed to open etcd watch websocket');
return;
}
if (this.mon.stopped)
{
this.stop_watcher();
return;
}
const cur_addr = this.selected_etcd_url;
this.ws_alive = true;
this.ws_keepalive_timer = setInterval(() =>
{
if (this.ws_alive && this.ws)
{
this.ws_alive = false;
this.ws.send(JSON.stringify({ progress_request: {} }));
}
else
{
console.log('etcd websocket timed out, restarting it');
this.restart_watcher(cur_addr);
}
}, (Number(this.mon.config.etcd_ws_keepalive_interval) || 5)*1000);
this.ws.on('error', () => this.restart_watcher(cur_addr));
this.ws.send(JSON.stringify({
create_request: {
key: b64(this.mon.config.etcd_prefix+'/'),
range_end: b64(this.mon.config.etcd_prefix+'0'),
start_revision: ''+this.mon.etcd_watch_revision,
watch_id: 1,
progress_notify: true,
},
}));
this.ws.on('message', (msg) =>
{
if (this.mon.stopped)
{
this.stop_watcher();
return;
}
this.ws_alive = true;
let data;
try
{
data = JSON.parse(msg);
}
catch (e)
{
}
if (!data || !data.result)
{
console.error('Unknown message received from watch websocket: '+msg);
}
else if (data.result.canceled)
{
// etcd watch canceled
if (data.result.compact_revision)
{
// we may miss events if we proceed
this.mon.die('Revisions before '+data.result.compact_revision+' were compacted by etcd, exiting');
}
this.mon.die('Watch canceled by etcd, reason: '+data.result.cancel_reason+', exiting');
}
else if (data.result.created)
{
// etcd watch created
console.log('Successfully subscribed to etcd at '+this.selected_etcd_url+', revision '+data.result.header.revision);
}
else
{
this.mon.on_message(data.result);
}
});
}
async become_master()
{
const state = { ...this.mon.get_mon_state(), id: ''+this.mon.etcd_lease_id };
// eslint-disable-next-line no-constant-condition
while (1)
{
const res = await this.etcd_call('/kv/txn', {
compare: [ { target: 'CREATE', create_revision: 0, key: b64(this.mon.config.etcd_prefix+'/mon/master') } ],
success: [ { requestPut: { key: b64(this.mon.config.etcd_prefix+'/mon/master'), value: b64(JSON.stringify(state)), lease: ''+this.mon.etcd_lease_id } } ],
}, this.mon.config.etcd_start_timeout, 0);
if (res.succeeded)
{
break;
}
console.log('Waiting to become master');
await new Promise(ok => setTimeout(ok, this.mon.config.etcd_start_timeout));
}
console.log('Became master');
}
async etcd_call(path, body, timeout, retries)
{
let retry = 0;
if (retries >= 0 && retries < 1)
{
retries = 1;
}
const tried = {};
while (retries < 0 || retry < retries)
{
retry++;
const base = this.pick_next_etcd();
let now = Date.now();
if (tried[base] && now-tried[base] < timeout)
{
await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
now = Date.now();
}
tried[base] = now;
if (this.mon.stopped)
{
throw new Error(MON_STOPPED);
}
const res = await POST(base+path, body, timeout);
if (this.mon.stopped)
{
throw new Error(MON_STOPPED);
}
if (res.error)
{
if (this.selected_etcd_url == base)
this.selected_etcd_url = null;
console.error('Failed to query etcd '+path+' (retry '+retry+'/'+retries+'): '+res.error);
continue;
}
if (res.json)
{
if (res.json.error)
{
console.error(path+': etcd returned error: '+res.json.error);
break;
}
return res.json;
}
}
throw new Error('Failed to query etcd ('+retries+' retries)');
}
}
function POST(url, body, timeout)
{
return new Promise(ok =>
{
const body_text = Buffer.from(JSON.stringify(body));
let timer_id = timeout > 0 ? setTimeout(() =>
{
if (req)
req.abort();
req = null;
ok({ error: 'timeout' });
}, timeout) : null;
let req = http.request(url, { method: 'POST', headers: {
'Content-Type': 'application/json',
'Content-Length': body_text.length,
} }, (res) =>
{
if (!req)
{
return;
}
clearTimeout(timer_id);
let res_body = '';
res.setEncoding('utf8');
res.on('error', (error) => ok({ error }));
res.on('data', chunk => { res_body += chunk; });
res.on('end', () =>
{
if (res.statusCode != 200)
{
ok({ error: res_body, code: res.statusCode });
return;
}
try
{
res_body = JSON.parse(res_body);
ok({ response: res, json: res_body });
}
catch (e)
{
ok({ error: e, response: res, body: res_body });
}
});
});
req.on('error', (error) => ok({ error }));
req.on('close', () => ok({ error: new Error('Connection closed prematurely') }));
req.write(body_text);
req.end();
});
}
function b64(str)
{
return Buffer.from(str).toString('base64');
}
module.exports = EtcdAdapter;

391
mon/etcd_schema.js Normal file
View File

@@ -0,0 +1,391 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
// FIXME document all etcd keys and config variables in the form of JSON schema or similar
const etcd_nonempty_keys = {
'config/global': 1,
'config/node_placement': 1,
'config/pools': 1,
'config/pgs': 1,
'history/last_clean_pgs': 1,
'stats': 1,
};
const etcd_allow = new RegExp('^'+[
'config/global',
'config/node_placement',
'config/pools',
'config/osd/[1-9]\\d*',
'config/pgs',
'config/inode/[1-9]\\d*/[1-9]\\d*',
'osd/state/[1-9]\\d*',
'osd/stats/[1-9]\\d*',
'osd/inodestats/[1-9]\\d*',
'osd/space/[1-9]\\d*',
'mon/master',
'mon/member/[a-f0-9]+',
'pg/state/[1-9]\\d*/[1-9]\\d*',
'pg/stats/[1-9]\\d*/[1-9]\\d*',
'pg/history/[1-9]\\d*/[1-9]\\d*',
'history/last_clean_pgs',
'inode/stats/[1-9]\\d*/\\d+',
'pool/stats/[1-9]\\d*',
'stats',
'index/image/.*',
'index/maxid/[1-9]\\d*',
].join('$|^')+'$');
const etcd_tree = {
config: {
/* global: {
// WARNING: NOT ALL OF THESE ARE ACTUALLY CONFIGURABLE HERE
// THIS IS JUST A POOR MAN'S CONFIG DOCUMENTATION
// etcd connection
config_path: "/etc/vitastor/vitastor.conf",
etcd_prefix: "/vitastor",
// etcd connection - configurable online
etcd_address: "10.0.115.10:2379/v3",
// mon
etcd_mon_ttl: 5, // min: 1
etcd_mon_timeout: 1000, // ms. min: 0
etcd_mon_retries: 5, // min: 0
mon_change_timeout: 1000, // ms. min: 100
mon_retry_change_timeout: 50, // ms. min: 10
mon_stats_timeout: 1000, // ms. min: 100
osd_out_time: 600, // seconds. min: 0
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
use_old_pg_combinator: false,
// client and osd
tcp_header_buffer_size: 65536,
use_sync_send_recv: false,
use_rdma: true,
rdma_device: null, // for example, "rocep5s0f0"
rdma_port_num: 1,
rdma_gid_index: 0,
rdma_mtu: 4096,
rdma_max_sge: 128,
rdma_max_send: 8,
rdma_max_recv: 16,
rdma_max_msg: 132096,
block_size: 131072,
disk_alignment: 4096,
bitmap_granularity: 4096,
immediate_commit: false, // 'all' or 'small'
// client - configurable online
client_max_dirty_bytes: 33554432,
client_max_dirty_ops: 1024,
client_enable_writeback: false,
client_max_buffered_bytes: 33554432,
client_max_buffered_ops: 1024,
client_max_writeback_iodepth: 256,
client_retry_interval: 50, // ms. min: 10
client_eio_retry_interval: 1000, // ms
client_retry_enospc: true,
osd_nearfull_ratio: 0.95,
// client and osd - configurable online
log_level: 0,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
osd_idle_timeout: 5, // seconds. min: 1
osd_ping_timeout: 5, // seconds. min: 1
max_etcd_attempts: 5,
etcd_quick_timeout: 1000, // ms
etcd_slow_timeout: 5000, // ms
etcd_keepalive_timeout: 30, // seconds, default is max(30, etcd_report_interval*2)
etcd_ws_keepalive_interval: 5, // seconds
// osd
etcd_report_interval: 5, // seconds
etcd_stats_interval: 30, // seconds
run_primary: true,
osd_network: null, // "192.168.7.0/24" or an array of masks
bind_address: "0.0.0.0",
bind_port: 0,
readonly: false,
osd_memlock: false,
// osd - configurable online
autosync_interval: 5,
autosync_writes: 128,
client_queue_depth: 128, // unused
recovery_queue_depth: 1,
recovery_sleep_us: 0,
recovery_tune_util_low: 0.1,
recovery_tune_client_util_low: 0,
recovery_tune_util_high: 1.0,
recovery_tune_client_util_high: 0.5,
recovery_tune_interval: 1,
recovery_tune_agg_interval: 10, // 10 times recovery_tune_interval
recovery_tune_sleep_min_us: 10, // 10 microseconds
recovery_pg_switch: 128,
recovery_sync_batch: 16,
no_recovery: false,
no_rebalance: false,
print_stats_interval: 3,
slow_log_interval: 10,
inode_vanish_time: 60,
auto_scrub: false,
no_scrub: false,
scrub_interval: '30d', // 1s/1m/1h/1d
scrub_queue_depth: 1,
scrub_sleep: 0, // milliseconds
scrub_list_limit: 1000, // objects to list on one scrub iteration
scrub_find_best: true,
scrub_ec_max_bruteforce: 100, // maximum EC error locator brute-force iterators
// blockstore - fixed in superblock
block_size,
disk_alignment,
journal_block_size,
meta_block_size,
bitmap_granularity,
journal_device,
journal_offset,
journal_size,
disable_journal_fsync,
data_device,
data_offset,
data_size,
disable_data_fsync,
meta_device,
meta_offset,
disable_meta_fsync,
disable_device_lock,
// blockstore - configurable offline
inmemory_metadata,
inmemory_journal,
journal_sector_buffer_count,
journal_no_same_sector_overwrites,
// blockstore - configurable online
max_write_iodepth,
min_flusher_count: 1,
max_flusher_count: 256,
throttle_small_writes: false,
throttle_target_iops: 100,
throttle_target_mbs: 100,
throttle_target_parallelism: 1,
throttle_threshold_us: 50,
}, */
global: {},
/* node_placement: {
host1: { level: 'host', parent: 'rack1' },
...
}, */
node_placement: {},
/* pools: {
<id>: {
name: 'testpool',
// 'ec' uses Reed-Solomon-Vandermonde codes, 'jerasure' is an alias for 'ec'
scheme: 'replicated' | 'xor' | 'ec' | 'jerasure',
pg_size: 3,
pg_minsize: 2,
// number of parity chunks, required for EC
parity_chunks?: 1,
pg_count: 100,
// default is failure_domain=host
failure_domain?: 'host',
// additional failure domain rules; failure_domain=x is equivalent to x=123..N
level_placement?: 'dc=112233 host=123456',
raw_placement?: 'any, dc=1 host!=1, dc=1 host!=(1,2)',
old_combinator: false,
max_osd_combinations: 10000,
// block_size, bitmap_granularity, immediate_commit must match all OSDs used in that pool
block_size: 131072,
bitmap_granularity: 4096,
// 'all'/'small'/'none', same as in OSD options
immediate_commit: 'none',
pg_stripe_size: 0,
root_node?: 'rack1',
// restrict pool to OSDs having all of these tags
osd_tags?: 'nvme' | [ 'nvme', ... ],
// prefer to put primary on OSD with these tags
primary_affinity_tags?: 'nvme' | [ 'nvme', ... ],
// scrub interval
scrub_interval?: '30d',
},
...
}, */
pools: {},
osd: {
/* <id>: { reweight?: 1, tags?: [ 'nvme', ... ], noout?: true }, ... */
},
/* pgs: {
hash: string,
items: {
<pool_id>: {
<pg_id>: {
osd_set: [ 1, 2, 3 ],
primary: 1,
pause: false,
}
}
}
}, */
pgs: {},
/* inode: {
<pool_id>: {
<inode_t>: {
name: string,
size?: uint64_t, // bytes
parent_pool?: <pool_id>,
parent_id?: <inode_t>,
readonly?: boolean,
}
}
}, */
inode: {},
},
osd: {
state: {
/* <osd_num_t>: {
state: "up",
addresses: string[],
host: string,
port: uint16_t,
primary_enabled: boolean,
blockstore_enabled: boolean,
}, */
},
stats: {
/* <osd_num_t>: {
time: number, // unix time
blockstore_ready: boolean,
size: uint64_t, // bytes
free: uint64_t, // bytes
host: string,
op_stats: {
<string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
},
subop_stats: {
<string>: { count: uint64_t, usec: uint64_t },
},
recovery_stats: {
degraded: { count: uint64_t, bytes: uint64_t },
misplaced: { count: uint64_t, bytes: uint64_t },
},
}, */
},
inodestats: {
/* <pool_id>: {
<inode_t>: {
read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
},
}, */
},
space: {
/* <osd_num_t>: {
<pool_id>: {
<inode_t>: uint64_t, // bytes
},
}, */
},
},
mon: {
master: {
/* ip: [ string ], id: uint64_t */
},
standby: {
/* <uint64_t>: { ip: [ string ] }, */
},
},
pg: {
state: {
/* <pool_id>: {
<pg_id>: {
primary: osd_num_t,
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"has_inconsistent"|"has_corrupted"|"left_on_dead"|"scrubbing")[],
}
}, */
},
stats: {
/* <pool_id>: {
<pg_id>: {
object_count: uint64_t,
clean_count: uint64_t,
misplaced_count: uint64_t,
degraded_count: uint64_t,
incomplete_count: uint64_t,
write_osd_set: osd_num_t[],
},
}, */
},
history: {
/* <pool_id>: {
<pg_id>: {
osd_sets: osd_num_t[][],
all_peers: osd_num_t[],
epoch: uint64_t,
next_scrub: uint64_t,
},
}, */
},
},
inode: {
stats: {
/* <pool_id>: {
<inode_t>: {
raw_used: uint64_t, // raw used bytes on OSDs
read: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
write: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
},
}, */
},
},
pool: {
stats: {
/* <pool_id>: {
used_raw_tb: float, // used raw space in the pool
total_raw_tb: float, // maximum amount of space in the pool
raw_to_usable: float, // raw to usable ratio
space_efficiency: float, // 0..1
} */
},
},
stats: {
/* op_stats: {
<string>: { count: uint64_t, usec: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t, lat: uint64_t },
},
subop_stats: {
<string>: { count: uint64_t, usec: uint64_t, iops: uint64_t, lat: uint64_t },
},
recovery_stats: {
degraded: { count: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t },
misplaced: { count: uint64_t, bytes: uint64_t, bps: uint64_t, iops: uint64_t },
},
object_counts: {
object: uint64_t,
clean: uint64_t,
misplaced: uint64_t,
degraded: uint64_t,
incomplete: uint64_t,
},
object_bytes: {
total: uint64_t,
clean: uint64_t,
misplaced: uint64_t,
degraded: uint64_t,
incomplete: uint64_t,
}, */
},
history: {
last_clean_pgs: {},
},
index: {
image: {
/* <name>: {
id: uint64_t,
pool_id: uint64_t,
}, */
},
maxid: {
/* <pool_id>: uint64_t, */
},
},
};
module.exports = {
etcd_nonempty_keys,
etcd_allow,
etcd_tree,
};

View File

@@ -8,7 +8,7 @@
// But we support this case with the "parity_space" parameter in optimize_initial()/optimize_change().
const { SimpleCombinator } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
const LPOptimizer = require('./lp_optimizer.js');
const osd_tree = {
ripper5: {

View File

@@ -2,7 +2,7 @@
// License: VNPL-1.1 (see README.md for details)
const { compat } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
const LPOptimizer = require('./lp_optimizer.js');
async function run()
{

View File

@@ -2,7 +2,7 @@
// License: VNPL-1.1 (see README.md for details)
const { compat, flatten_tree } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
const LPOptimizer = require('./lp_optimizer.js');
const crush_tree = [
{ level: 1, children: [

View File

@@ -2,7 +2,7 @@
// License: VNPL-1.1 (see README.md for details)
const { compat } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
const LPOptimizer = require('./lp_optimizer.js');
const osd_tree = {
100: {

View File

@@ -2,7 +2,7 @@
// License: VNPL-1.1 (see README.md for details)
const { compat, flatten_tree } = require('./simple_pgs.js');
const LPOptimizer = require('./lp-optimizer.js');
const LPOptimizer = require('./lp_optimizer.js');
const osd_tree = {
100: {

View File

@@ -23,4 +23,4 @@ for (let i = 2; i < process.argv.length; i++)
}
}
new Mon(options).start().catch(e => { console.error(e); process.exit(1); });
Mon.run_forever(options);

1888
mon/mon.js

File diff suppressed because it is too large Load Diff

215
mon/osd_tree.js Normal file
View File

@@ -0,0 +1,215 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
function get_osd_tree(global_config, state)
{
const levels = global_config.placement_levels||{};
levels.host = levels.host || 100;
levels.osd = levels.osd || 101;
const tree = {};
let up_osds = {};
// This requires monitor system time to be in sync with OSD system times (at least to some extent)
const down_time = Date.now()/1000 - global_config.osd_out_time;
for (const osd_num of Object.keys(state.osd.stats).sort((a, b) => a - b))
{
const stat = state.osd.stats[osd_num];
const osd_cfg = state.config.osd[osd_num];
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
if (reweight < 0 || isNaN(reweight))
reweight = 1;
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
osd_cfg && osd_cfg.noout))
{
// Numeric IDs are reserved for OSDs
if (state.osd.state[osd_num] && reweight > 0)
{
// React to down OSDs immediately
up_osds[osd_num] = true;
}
tree[osd_num] = tree[osd_num] || {};
tree[osd_num].id = osd_num;
tree[osd_num].parent = tree[osd_num].parent || stat.host;
tree[osd_num].level = 'osd';
tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
if (osd_cfg && osd_cfg.tags)
{
tree[osd_num].tags = (osd_cfg.tags instanceof Array ? [ ...osd_cfg.tags ] : [ osd_cfg.tags ])
.reduce((a, c) => { a[c] = true; return a; }, {});
}
delete tree[osd_num].children;
if (!tree[stat.host])
{
tree[stat.host] = {
id: stat.host,
level: 'host',
parent: null,
children: [],
};
}
}
}
for (const node_id in state.config.node_placement||{})
{
const node_cfg = state.config.node_placement[node_id];
if (/^\d+$/.exec(node_id))
{
node_cfg.level = 'osd';
}
if (!node_id || !node_cfg.level || !levels[node_cfg.level] ||
node_cfg.level === 'osd' && !tree[node_id])
{
// All nodes must have non-empty IDs and valid levels
// OSDs have to actually exist
continue;
}
tree[node_id] = tree[node_id] || {};
tree[node_id].id = node_id;
tree[node_id].level = node_cfg.level;
tree[node_id].parent = node_cfg.parent;
if (node_cfg.level !== 'osd')
{
tree[node_id].children = [];
}
}
return { up_osds, levels, osd_tree: tree };
}
function make_hier_tree(global_config, tree)
{
const levels = global_config.placement_levels||{};
levels.host = levels.host || 100;
levels.osd = levels.osd || 101;
tree = { ...tree };
for (const node_id in tree)
{
tree[node_id] = { ...tree[node_id], children: [] };
}
tree[''] = { children: [] };
for (const node_id in tree)
{
if (node_id === '' || tree[node_id].level === 'osd' && (!tree[node_id].size || tree[node_id].size <= 0))
{
continue;
}
const node_cfg = tree[node_id];
const node_level = levels[node_cfg.level] || node_cfg.level;
let parent_level = node_cfg.parent && tree[node_cfg.parent] && tree[node_cfg.parent].children
&& tree[node_cfg.parent].level;
parent_level = parent_level ? (levels[parent_level] || parent_level) : null;
// Parent's level must be less than child's; OSDs must be leaves
const parent = parent_level && parent_level < node_level ? node_cfg.parent : '';
tree[parent].children.push(tree[node_id]);
}
// Delete empty nodes
let deleted = 0;
do
{
deleted = 0;
for (const node_id in tree)
{
if (tree[node_id].level !== 'osd' && (!tree[node_id].children || !tree[node_id].children.length))
{
const parent = tree[node_id].parent;
if (parent)
{
tree[parent].children = tree[parent].children.filter(c => c != tree[node_id]);
}
deleted++;
delete tree[node_id];
}
}
} while (deleted > 0);
return tree;
}
function filter_osds_by_root_node(global_config, pool_tree, root_node)
{
if (!root_node)
{
return;
}
let hier_tree = make_hier_tree(global_config, pool_tree);
let included = [ ...(hier_tree[root_node] || {}).children||[] ];
for (let i = 0; i < included.length; i++)
{
if (included[i].children)
{
included.splice(i+1, 0, ...included[i].children);
}
}
let cur = pool_tree[root_node] || {};
while (cur && cur.id)
{
included.unshift(cur);
cur = pool_tree[cur.parent||''];
}
included = included.reduce((a, c) => { a[c.id||''] = true; return a; }, {});
for (const item in pool_tree)
{
if (!included[item])
{
delete pool_tree[item];
}
}
}
function filter_osds_by_tags(orig_tree, tags)
{
if (!tags)
{
return;
}
for (const tag of (tags instanceof Array ? tags : [ tags ]))
{
for (const osd in orig_tree)
{
if (orig_tree[osd].level === 'osd' &&
(!orig_tree[osd].tags || !orig_tree[osd].tags[tag]))
{
delete orig_tree[osd];
}
}
}
}
function filter_osds_by_block_layout(orig_tree, osd_stats, block_size, bitmap_granularity, immediate_commit)
{
for (const osd in orig_tree)
{
if (orig_tree[osd].level === 'osd')
{
const osd_stat = osd_stats[osd];
if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size ||
osd_stat.bitmap_granularity && osd_stat.bitmap_granularity != bitmap_granularity ||
osd_stat.immediate_commit == 'small' && immediate_commit == 'all' ||
osd_stat.immediate_commit == 'none' && immediate_commit != 'none'))
{
delete orig_tree[osd];
}
}
}
}
function get_affinity_osds(pool_cfg, up_osds, osd_tree)
{
let aff_osds = up_osds;
if (pool_cfg.primary_affinity_tags)
{
aff_osds = Object.keys(up_osds).reduce((a, c) => { a[c] = osd_tree[c]; return a; }, {});
filter_osds_by_tags(aff_osds, pool_cfg.primary_affinity_tags);
for (const osd in aff_osds)
{
aff_osds[osd] = true;
}
}
return aff_osds;
}
module.exports = {
get_osd_tree,
make_hier_tree,
filter_osds_by_root_node,
filter_osds_by_tags,
filter_osds_by_block_layout,
get_affinity_osds,
};

View File

@@ -4,7 +4,7 @@
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
"lint": "eslint *.js lp_optimizer/*.js scripts/*.js"
},
"author": "Vitaliy Filippov",
"license": "UNLICENSED",
@@ -14,12 +14,10 @@
},
"devDependencies": {
"eslint": "^8.0.0",
"eslint-plugin-import": "^2.29.1",
"eslint-plugin-node": "^11.1.0"
},
"engines": {
"node": ">=12.0.0"
},
"scripts": {
"lint": "eslint *.js"
}
}

267
mon/pg_gen.js Normal file
View File

@@ -0,0 +1,267 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const { RuleCombinator } = require('./lp_optimizer/dsl_pgs.js');
const { SimpleCombinator, flatten_tree } = require('./lp_optimizer/simple_pgs.js');
const { validate_pool_cfg, get_pg_rules } = require('./pool_config.js');
const LPOptimizer = require('./lp_optimizer/lp_optimizer.js');
const { scale_pg_count } = require('./pg_utils.js');
const { make_hier_tree, filter_osds_by_root_node,
filter_osds_by_tags, filter_osds_by_block_layout, get_affinity_osds } = require('./osd_tree.js');
let seed;
function reset_rng()
{
seed = 0x5f020e43;
}
function rng()
{
seed ^= seed << 13;
seed ^= seed >> 17;
seed ^= seed << 5;
return seed + 2147483648;
}
function pick_primary(pool_config, osd_set, up_osds, aff_osds)
{
let alive_set;
if (pool_config.scheme === 'replicated')
{
// Prefer "affinity" OSDs
alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]);
if (!alive_set.length)
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
}
else
{
// Prefer data OSDs for EC because they can actually read something without an additional network hop
const pg_data_size = (pool_config.pg_size||0) - (pool_config.parity_chunks||0);
alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && aff_osds[osd_num]);
if (!alive_set.length)
alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]);
if (!alive_set.length)
{
alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]);
if (!alive_set.length)
alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]);
}
}
if (!alive_set.length)
{
return 0;
}
return alive_set[rng() % alive_set.length];
}
function recheck_primary(state, global_config, up_osds, osd_tree)
{
let new_config_pgs;
for (const pool_id in state.config.pools)
{
const pool_cfg = state.config.pools[pool_id];
if (!validate_pool_cfg(pool_id, pool_cfg, global_config.placement_levels, false))
{
continue;
}
const aff_osds = get_affinity_osds(pool_cfg, up_osds, osd_tree);
reset_rng();
for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++)
{
if (!state.config.pgs.items[pool_id])
{
continue;
}
const pg_cfg = state.config.pgs.items[pool_id][pg_num];
if (pg_cfg)
{
const new_primary = pick_primary(state.config.pools[pool_id], pg_cfg.osd_set, up_osds, aff_osds);
if (pg_cfg.primary != new_primary)
{
if (!new_config_pgs)
{
new_config_pgs = JSON.parse(JSON.stringify(state.config.pgs));
}
console.log(
`Moving pool ${pool_id} (${pool_cfg.name || 'unnamed'}) PG ${pg_num}`+
` primary OSD from ${pg_cfg.primary} to ${new_primary}`
);
new_config_pgs.items[pool_id][pg_num].primary = new_primary;
}
}
}
}
return new_config_pgs;
}
function save_new_pgs_txn(save_to, request, state, etcd_prefix, etcd_watch_revision, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history)
{
const aff_osds = get_affinity_osds(state.config.pools[pool_id] || {}, up_osds, osd_tree);
const pg_items = {};
reset_rng();
new_pgs.map((osd_set, i) =>
{
osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num);
pg_items[i+1] = {
osd_set,
primary: pick_primary(state.config.pools[pool_id], osd_set, up_osds, aff_osds),
};
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
prev_pgs[i].filter(osd_num => osd_num).length > 0)
{
pg_history[i] = pg_history[i] || {};
pg_history[i].osd_sets = pg_history[i].osd_sets || [];
pg_history[i].osd_sets.push(prev_pgs[i]);
}
if (pg_history[i] && pg_history[i].osd_sets)
{
pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
.reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
}
});
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
{
// FIXME: etcd has max_txn_ops limit, and it's 128 by default
// Sooo we probably want to change our storage scheme for PG histories...
request.compare.push({
key: b64(etcd_prefix+'/pg/history/'+pool_id+'/'+(i+1)),
target: 'MOD',
mod_revision: ''+etcd_watch_revision,
result: 'LESS',
});
if (pg_history[i])
{
request.success.push({
requestPut: {
key: b64(etcd_prefix+'/pg/history/'+pool_id+'/'+(i+1)),
value: b64(JSON.stringify(pg_history[i])),
},
});
}
else
{
request.success.push({
requestDeleteRange: {
key: b64(etcd_prefix+'/pg/history/'+pool_id+'/'+(i+1)),
},
});
}
}
save_to.items = save_to.items || {};
if (!new_pgs.length)
{
delete save_to.items[pool_id];
}
else
{
save_to.items[pool_id] = pg_items;
}
}
async function generate_pool_pgs(state, global_config, pool_id, osd_tree, levels)
{
const pool_cfg = state.config.pools[pool_id];
if (!validate_pool_cfg(pool_id, pool_cfg, global_config.placement_levels, false))
{
return null;
}
let pool_tree = { ...osd_tree };
filter_osds_by_root_node(global_config, pool_tree, pool_cfg.root_node);
filter_osds_by_tags(pool_tree, pool_cfg.osd_tags);
filter_osds_by_block_layout(
pool_tree,
state.osd.stats,
pool_cfg.block_size || global_config.block_size || 131072,
pool_cfg.bitmap_granularity || global_config.bitmap_granularity || 4096,
pool_cfg.immediate_commit || global_config.immediate_commit || 'none'
);
pool_tree = make_hier_tree(global_config, pool_tree);
// First try last_clean_pgs to minimize data movement
let prev_pgs = [];
for (const pg in ((state.history.last_clean_pgs.items||{})[pool_id]||{}))
{
prev_pgs[pg-1] = [ ...state.history.last_clean_pgs.items[pool_id][pg].osd_set ];
}
if (!prev_pgs.length)
{
// Fall back to config/pgs if it's empty
for (const pg in ((state.config.pgs.items||{})[pool_id]||{}))
{
prev_pgs[pg-1] = [ ...state.config.pgs.items[pool_id][pg].osd_set ];
}
}
const old_pg_count = prev_pgs.length;
const optimize_cfg = {
osd_weights: Object.values(pool_tree).filter(item => item.level === 'osd').reduce((a, c) => { a[c.id] = c.size; return a; }, {}),
combinator: !global_config.use_old_pg_combinator || pool_cfg.level_placement || pool_cfg.raw_placement
// new algorithm:
? new RuleCombinator(pool_tree, get_pg_rules(pool_id, pool_cfg, global_config.placement_levels), pool_cfg.max_osd_combinations)
// old algorithm:
: new SimpleCombinator(flatten_tree(pool_tree[''].children, levels, pool_cfg.failure_domain, 'osd'), pool_cfg.pg_size, pool_cfg.max_osd_combinations),
pg_count: pool_cfg.pg_count,
pg_size: pool_cfg.pg_size,
pg_minsize: pool_cfg.pg_minsize,
ordered: pool_cfg.scheme != 'replicated',
};
let optimize_result;
// Re-shuffle PGs if config/pgs.hash is empty
if (old_pg_count > 0 && state.config.pgs.hash)
{
if (prev_pgs.length != pool_cfg.pg_count)
{
// Scale PG count
// Do it even if old_pg_count is already equal to pool_cfg.pg_count,
// because last_clean_pgs may still contain the old number of PGs
scale_pg_count(prev_pgs, pool_cfg.pg_count);
}
for (const pg of prev_pgs)
{
while (pg.length < pool_cfg.pg_size)
{
pg.push(0);
}
}
optimize_result = await LPOptimizer.optimize_change({
prev_pgs,
...optimize_cfg,
});
}
else
{
optimize_result = await LPOptimizer.optimize_initial(optimize_cfg);
}
console.log(`Pool ${pool_id} (${pool_cfg.name || 'unnamed'}):`);
LPOptimizer.print_change_stats(optimize_result);
let pg_effsize = pool_cfg.pg_size;
for (const pg of optimize_result.int_pgs)
{
const this_pg_size = pg.filter(osd => osd != LPOptimizer.NO_OSD).length;
if (this_pg_size && this_pg_size < pg_effsize)
{
pg_effsize = this_pg_size;
}
}
return {
pool_id,
pgs: optimize_result.int_pgs,
stats: {
total_raw_tb: optimize_result.space,
pg_real_size: pg_effsize || pool_cfg.pg_size,
raw_to_usable: (pg_effsize || pool_cfg.pg_size) / (pool_cfg.scheme === 'replicated'
? 1 : (pool_cfg.pg_size - (pool_cfg.parity_chunks||0))),
space_efficiency: optimize_result.space/(optimize_result.total_space||1),
},
};
}
function b64(str)
{
return Buffer.from(str).toString('base64');
}
module.exports = {
recheck_primary,
save_new_pgs_txn,
generate_pool_pgs,
};

169
mon/pool_config.js Normal file
View File

@@ -0,0 +1,169 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
const { parse_level_indexes, parse_pg_dsl } = require('./lp_optimizer/dsl_pgs.js');
function validate_pool_cfg(pool_id, pool_cfg, placement_levels, warn)
{
pool_cfg.pg_size = Math.floor(pool_cfg.pg_size);
pool_cfg.pg_minsize = Math.floor(pool_cfg.pg_minsize);
pool_cfg.parity_chunks = Math.floor(pool_cfg.parity_chunks) || undefined;
pool_cfg.pg_count = Math.floor(pool_cfg.pg_count);
pool_cfg.max_osd_combinations = Math.floor(pool_cfg.max_osd_combinations) || 10000;
if (!/^[1-9]\d*$/.exec(''+pool_id))
{
if (warn)
console.log('Pool ID '+pool_id+' is invalid');
return false;
}
if (pool_cfg.scheme !== 'xor' && pool_cfg.scheme !== 'replicated' &&
pool_cfg.scheme !== 'ec' && pool_cfg.scheme !== 'jerasure')
{
if (warn)
console.log('Pool '+pool_id+' has invalid coding scheme (one of "xor", "replicated", "ec" and "jerasure" required)');
return false;
}
if (!pool_cfg.pg_size || pool_cfg.pg_size < 1 || pool_cfg.pg_size > 256 ||
pool_cfg.scheme !== 'replicated' && pool_cfg.pg_size < 3)
{
if (warn)
console.log('Pool '+pool_id+' has invalid pg_size');
return false;
}
if (!pool_cfg.pg_minsize || pool_cfg.pg_minsize < 1 || pool_cfg.pg_minsize > pool_cfg.pg_size ||
pool_cfg.scheme === 'xor' && pool_cfg.pg_minsize < (pool_cfg.pg_size - 1))
{
if (warn)
console.log('Pool '+pool_id+' has invalid pg_minsize');
return false;
}
if (pool_cfg.scheme === 'xor' && pool_cfg.parity_chunks != 0 && pool_cfg.parity_chunks != 1)
{
if (warn)
console.log('Pool '+pool_id+' has invalid parity_chunks (must be 1)');
return false;
}
if ((pool_cfg.scheme === 'ec' || pool_cfg.scheme === 'jerasure') &&
(pool_cfg.parity_chunks < 1 || pool_cfg.parity_chunks > pool_cfg.pg_size-2))
{
if (warn)
console.log('Pool '+pool_id+' has invalid parity_chunks (must be between 1 and pg_size-2)');
return false;
}
if (!pool_cfg.pg_count || pool_cfg.pg_count < 1)
{
if (warn)
console.log('Pool '+pool_id+' has invalid pg_count');
return false;
}
if (!pool_cfg.name)
{
if (warn)
console.log('Pool '+pool_id+' has empty name');
return false;
}
if (pool_cfg.max_osd_combinations < 100)
{
if (warn)
console.log('Pool '+pool_id+' has invalid max_osd_combinations (must be at least 100)');
return false;
}
if (pool_cfg.root_node && typeof(pool_cfg.root_node) != 'string')
{
if (warn)
console.log('Pool '+pool_id+' has invalid root_node (must be a string)');
return false;
}
if (pool_cfg.osd_tags && typeof(pool_cfg.osd_tags) != 'string' &&
(!(pool_cfg.osd_tags instanceof Array) || pool_cfg.osd_tags.filter(t => typeof t != 'string').length > 0))
{
if (warn)
console.log('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)');
return false;
}
if (pool_cfg.primary_affinity_tags && typeof(pool_cfg.primary_affinity_tags) != 'string' &&
(!(pool_cfg.primary_affinity_tags instanceof Array) || pool_cfg.primary_affinity_tags.filter(t => typeof t != 'string').length > 0))
{
if (warn)
console.log('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)');
return false;
}
if (!get_pg_rules(pool_id, pool_cfg, placement_levels, true))
{
return false;
}
return true;
}
function get_pg_rules(pool_id, pool_cfg, placement_levels, warn)
{
if (pool_cfg.level_placement)
{
const pg_size = (0|pool_cfg.pg_size);
let rules = pool_cfg.level_placement;
if (typeof rules === 'string')
{
rules = rules.split(/\s+/).map(s => s.split(/=/, 2)).reduce((a, c) => { a[c[0]] = c[1]; return a; }, {});
}
else
{
rules = { ...rules };
}
// Always add failure_domain to prevent rules from being totally incorrect
const all_diff = [];
for (let i = 1; i <= pg_size; i++)
{
all_diff.push(i);
}
rules[pool_cfg.failure_domain || 'host'] = all_diff;
placement_levels = placement_levels||{};
placement_levels.host = placement_levels.host || 100;
placement_levels.osd = placement_levels.osd || 101;
for (const k in rules)
{
if (!placement_levels[k] || typeof rules[k] !== 'string' &&
(!(rules[k] instanceof Array) ||
rules[k].filter(s => typeof s !== 'string' && typeof s !== 'number').length > 0))
{
if (warn)
console.log('Pool '+pool_id+' configuration is invalid: level_placement should be { [level]: string | (string|number)[] }');
return null;
}
else if (rules[k].length != pg_size)
{
if (warn)
console.log('Pool '+pool_id+' configuration is invalid: values in level_placement should contain exactly pg_size ('+pg_size+') items');
return null;
}
}
return parse_level_indexes(rules);
}
else if (typeof pool_cfg.raw_placement === 'string')
{
try
{
return parse_pg_dsl(pool_cfg.raw_placement);
}
catch (e)
{
if (warn)
console.log('Pool '+pool_id+' configuration is invalid: invalid raw_placement: '+e.message);
}
}
else
{
let rules = [ [] ];
let prev = [ 1 ];
for (let i = 1; i < pool_cfg.pg_size; i++)
{
rules.push([ [ pool_cfg.failure_domain||'host', '!=', prev ] ]);
prev = [ ...prev, i+1 ];
}
return rules;
}
}
module.exports = {
validate_pool_cfg,
get_pg_rules,
};

286
mon/stats.js Normal file
View File

@@ -0,0 +1,286 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
function derive_osd_stats(st, prev, prev_diff)
{
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
{
return prev_diff || diff;
}
const timediff = BigInt(st.time*1000 - prev.time*1000);
for (const op in st.op_stats||{})
{
const pr = prev && prev.op_stats && prev.op_stats[op];
let c = st.op_stats[op];
c = { bytes: BigInt(c.bytes||0), usec: BigInt(c.usec||0), count: BigInt(c.count||0) };
const b = c.bytes - BigInt(pr && pr.bytes||0);
const us = c.usec - BigInt(pr && pr.usec||0);
const n = c.count - BigInt(pr && pr.count||0);
if (n > 0)
diff.op_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff, lat: us/n };
}
for (const op in st.subop_stats||{})
{
const pr = prev && prev.subop_stats && prev.subop_stats[op];
let c = st.subop_stats[op];
c = { usec: BigInt(c.usec||0), count: BigInt(c.count||0) };
const us = c.usec - BigInt(pr && pr.usec||0);
const n = c.count - BigInt(pr && pr.count||0);
if (n > 0)
diff.subop_stats[op] = { ...c, iops: n*1000n/timediff, lat: us/n };
}
for (const op in st.recovery_stats||{})
{
const pr = prev && prev.recovery_stats && prev.recovery_stats[op];
let c = st.recovery_stats[op];
c = { bytes: BigInt(c.bytes||0), count: BigInt(c.count||0) };
const b = c.bytes - BigInt(pr && pr.bytes||0);
const n = c.count - BigInt(pr && pr.count||0);
if (n > 0)
diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
}
for (const pool_id in st.inode_stats||{})
{
diff.inode_stats[pool_id] = {};
for (const inode_num in st.inode_stats[pool_id])
{
const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
for (const op of [ 'read', 'write', 'delete' ])
{
const c = st.inode_stats[pool_id][inode_num][op];
const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
inode_diff[op] = {
bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
iops: n*1000n/timediff,
lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
};
}
}
}
return diff;
}
// sum_op_stats(this.state.osd, this.prev_stats)
function sum_op_stats(all_osd, prev_stats)
{
for (const osd in all_osd.stats)
{
const cur = { ...all_osd.stats[osd], inode_stats: all_osd.inodestats[osd]||{} };
prev_stats.osd_diff[osd] = derive_osd_stats(
cur, prev_stats.osd_stats[osd], prev_stats.osd_diff[osd]
);
prev_stats.osd_stats[osd] = cur;
}
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
// Sum derived values instead of deriving summed
for (const osd in all_osd.state)
{
const derived = prev_stats.osd_diff[osd];
if (!all_osd.state[osd] || !derived)
{
continue;
}
for (const type in sum_diff)
{
for (const op in derived[type]||{})
{
for (const k in derived[type][op])
{
sum_diff[type][op] = sum_diff[type][op] || {};
sum_diff[type][op][k] = (sum_diff[type][op][k] || 0n) + derived[type][op][k];
}
}
}
}
return sum_diff;
}
// sum_object_counts(this.state, this.config)
function sum_object_counts(state, global_config)
{
const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
const object_bytes = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
for (const pool_id in state.pg.stats)
{
let object_size = 0;
for (const osd_num of state.pg.stats[pool_id].write_osd_set||[])
{
if (osd_num && state.osd.stats[osd_num] && state.osd.stats[osd_num].block_size)
{
object_size = state.osd.stats[osd_num].block_size;
break;
}
}
const pool_cfg = (state.config.pools[pool_id]||{});
if (!object_size)
{
object_size = pool_cfg.block_size || global_config.block_size || 131072;
}
if (pool_cfg.scheme !== 'replicated')
{
object_size *= ((pool_cfg.pg_size||0) - (pool_cfg.parity_chunks||0));
}
object_size = BigInt(object_size);
for (const pg_num in state.pg.stats[pool_id])
{
const st = state.pg.stats[pool_id][pg_num];
if (st)
{
for (const k in object_counts)
{
if (st[k+'_count'])
{
object_counts[k] += BigInt(st[k+'_count']);
object_bytes[k] += BigInt(st[k+'_count']) * object_size;
}
}
}
}
}
return { object_counts, object_bytes };
}
// sum_inode_stats(this.state, this.prev_stats)
function sum_inode_stats(state, prev_stats)
{
const inode_stats = {};
const inode_stub = () => ({
raw_used: 0n,
read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
});
const seen_pools = {};
for (const pool_id in state.config.pools)
{
seen_pools[pool_id] = true;
state.pool.stats[pool_id] = state.pool.stats[pool_id] || {};
state.pool.stats[pool_id].used_raw_tb = 0n;
}
for (const osd_num in state.osd.space)
{
for (const pool_id in state.osd.space[osd_num])
{
state.pool.stats[pool_id] = state.pool.stats[pool_id] || {};
if (!seen_pools[pool_id])
{
state.pool.stats[pool_id].used_raw_tb = 0n;
seen_pools[pool_id] = true;
}
inode_stats[pool_id] = inode_stats[pool_id] || {};
for (const inode_num in state.osd.space[osd_num][pool_id])
{
const u = BigInt(state.osd.space[osd_num][pool_id][inode_num]||0);
if (inode_num)
{
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
inode_stats[pool_id][inode_num].raw_used += u;
}
state.pool.stats[pool_id].used_raw_tb += u;
}
}
}
for (const pool_id in seen_pools)
{
const used = state.pool.stats[pool_id].used_raw_tb;
state.pool.stats[pool_id].used_raw_tb = Number(used)/1024/1024/1024/1024;
}
for (const osd_num in state.osd.state)
{
const ist = state.osd.inodestats[osd_num];
if (!ist || !state.osd.state[osd_num])
{
continue;
}
for (const pool_id in ist)
{
inode_stats[pool_id] = inode_stats[pool_id] || {};
for (const inode_num in ist[pool_id])
{
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
for (const op of [ 'read', 'write', 'delete' ])
{
inode_stats[pool_id][inode_num][op].count += BigInt(ist[pool_id][inode_num][op].count||0);
inode_stats[pool_id][inode_num][op].usec += BigInt(ist[pool_id][inode_num][op].usec||0);
inode_stats[pool_id][inode_num][op].bytes += BigInt(ist[pool_id][inode_num][op].bytes||0);
}
}
}
}
for (const osd in state.osd.state)
{
const osd_diff = prev_stats.osd_diff[osd];
if (!osd_diff || !state.osd.state[osd])
{
continue;
}
for (const pool_id in osd_diff.inode_stats)
{
for (const inode_num in prev_stats.osd_diff[osd].inode_stats[pool_id])
{
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
for (const op of [ 'read', 'write', 'delete' ])
{
const op_diff = prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
const op_st = inode_stats[pool_id][inode_num][op];
op_st.bps += op_diff.bps;
op_st.iops += op_diff.iops;
op_st.lat += op_diff.lat;
op_st.n_osd = (op_st.n_osd || 0) + 1;
}
}
}
}
for (const pool_id in inode_stats)
{
for (const inode_num in inode_stats[pool_id])
{
let nonzero = inode_stats[pool_id][inode_num].raw_used > 0;
for (const op of [ 'read', 'write', 'delete' ])
{
const op_st = inode_stats[pool_id][inode_num][op];
if (op_st.n_osd)
{
op_st.lat /= BigInt(op_st.n_osd);
delete op_st.n_osd;
}
if (op_st.bps > 0 || op_st.iops > 0)
nonzero = true;
}
if (!nonzero && (!state.config.inode[pool_id] || !state.config.inode[pool_id][inode_num]))
{
// Deleted inode (no data, no I/O, no config)
delete inode_stats[pool_id][inode_num];
}
}
}
return { inode_stats, seen_pools };
}
function serialize_bigints(obj)
{
obj = { ...obj };
for (const k in obj)
{
if (typeof obj[k] == 'bigint')
{
obj[k] = ''+obj[k];
}
else if (typeof obj[k] == 'object')
{
obj[k] = serialize_bigints(obj[k]);
}
}
return obj;
}
module.exports = {
derive_osd_stats,
sum_op_stats,
sum_object_counts,
sum_inode_stats,
serialize_bigints,
};

View File

@@ -707,10 +707,10 @@ class VitastorDriver(driver.CloneableImageVD,
return ({}, True)
return ({}, False)
def copy_image_to_encrypted_volume(self, context, volume, image_service, image_id):
self.copy_image_to_volume(context, volume, image_service, image_id, encrypted = True)
def copy_image_to_encrypted_volume(self, context, volume, image_service, image_id, disable_sparse=False):
self.copy_image_to_volume(context, volume, image_service, image_id, encrypted = True, disable_sparse=False)
def copy_image_to_volume(self, context, volume, image_service, image_id, encrypted = False):
def copy_image_to_volume(self, context, volume, image_service, image_id, encrypted = False, disable_sparse=False):
tmp_dir = volume_utils.image_conversion_dir()
with tempfile.NamedTemporaryFile(dir = tmp_dir) as tmp:
image_utils.fetch_to_raw(

View File

@@ -0,0 +1,670 @@
From 571bde71268dcca6446454bb1e895e21bcc7b2a0 Mon Sep 17 00:00:00 2001
From: ace <ace@0xace.cc>
Date: Sat, 18 May 2024 19:45:49 +0300
Subject: [PATCH] Add Vitastor support
---
include/libvirt/libvirt-storage.h | 1 +
src/conf/domain_conf.c | 4 +-
src/conf/domain_validate.c | 10 +-
src/conf/schemas/domaincommon.rng | 30 +++++
src/conf/storage_conf.c | 20 ++-
src/conf/storage_conf.h | 2 +
src/conf/storage_source_conf.c | 2 +
src/conf/storage_source_conf.h | 1 +
src/conf/virstorageobj.c | 3 +
src/libvirt-storage.c | 1 +
src/libxl/libxl_conf.c | 1 +
src/libxl/xen_xl.c | 1 +
src/qemu/qemu_block.c | 45 +++++++
src/qemu/qemu_domain.c | 4 +-
src/qemu/qemu_snapshot.c | 2 +
src/storage/storage_driver.c | 1 +
.../storage_source_backingstore.c | 123 ++++++++++++++++++
src/test/test_driver.c | 1 +
.../storagepoolcapsschemadata/poolcaps-fs.xml | 7 +
.../poolcaps-full.xml | 7 +
tests/storagepoolxml2argvtest.c | 1 +
tools/virsh-pool.c | 3 +
22 files changed, 265 insertions(+), 5 deletions(-)
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
index aaad4a3da1..5f5daa8341 100644
--- a/include/libvirt/libvirt-storage.h
+++ b/include/libvirt/libvirt-storage.h
@@ -326,6 +326,7 @@ typedef enum {
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */
} virConnectListAllStoragePoolsFlags;
int virConnectListAllStoragePools(virConnectPtr conn,
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 52a5796ad2..089697b2a3 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -7191,7 +7191,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
src->configFile = virXPathString("string(./config/@file)", ctxt);
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
src->query = virXMLPropString(node, "query");
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
@@ -30657,6 +30658,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
case VIR_STORAGE_POOL_MPATH:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
index faa7659f07..01b907d60d 100644
--- a/src/conf/domain_validate.c
+++ b/src/conf/domain_validate.c
@@ -495,6 +495,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
case VIR_STORAGE_NET_PROTOCOL_RBD:
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
@@ -541,7 +542,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
}
}
- /* internal snapshots and config files are currently supported only with rbd: */
+ /* internal snapshots are currently supported only with rbd: */
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
if (src->snapshot) {
@@ -549,10 +550,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
_("<snapshot> element is currently supported only with 'rbd' disks"));
return -1;
}
+ }
+ /* config files are currently supported only with rbd and vitastor: */
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
if (src->configFile) {
virReportError(VIR_ERR_XML_ERROR, "%s",
- _("<config> element is currently supported only with 'rbd' disks"));
+ _("<config> element is currently supported only with 'rbd' and 'vitastor' disks"));
return -1;
}
}
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
index df44cd9857..4bb72fc697 100644
--- a/src/conf/schemas/domaincommon.rng
+++ b/src/conf/schemas/domaincommon.rng
@@ -1997,6 +1997,35 @@
</element>
</define>
+ <define name="diskSourceNetworkProtocolVitastor">
+ <element name="source">
+ <interleave>
+ <attribute name="protocol">
+ <value>vitastor</value>
+ </attribute>
+ <ref name="diskSourceCommon"/>
+ <optional>
+ <attribute name="name"/>
+ </optional>
+ <optional>
+ <attribute name="query"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="diskSourceNetworkHost"/>
+ </zeroOrMore>
+ <optional>
+ <element name="config">
+ <attribute name="file">
+ <ref name="absFilePath"/>
+ </attribute>
+ <empty/>
+ </element>
+ </optional>
+ <empty/>
+ </interleave>
+ </element>
+ </define>
+
<define name="diskSourceNetworkProtocolISCSI">
<element name="source">
<attribute name="protocol">
@@ -2347,6 +2376,7 @@
<ref name="diskSourceNetworkProtocolSimple"/>
<ref name="diskSourceNetworkProtocolVxHS"/>
<ref name="diskSourceNetworkProtocolNFS"/>
+ <ref name="diskSourceNetworkProtocolVitastor"/>
</choice>
</define>
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
index 68842004b7..1d69a788b6 100644
--- a/src/conf/storage_conf.c
+++ b/src/conf/storage_conf.c
@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool,
"logical", "disk", "iscsi",
"iscsi-direct", "scsi", "mpath",
"rbd", "sheepdog", "gluster",
- "zfs", "vstorage",
+ "zfs", "vstorage", "vitastor",
);
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
.formatToString = virStorageFileFormatTypeToString,
}
},
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
+ .poolOptions = {
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
+ VIR_STORAGE_POOL_SOURCE_NAME),
+ },
+ .volOptions = {
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
+ .formatFromString = virStorageVolumeFormatFromString,
+ .formatToString = virStorageFileFormatTypeToString,
+ }
+ },
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
.poolOptions = {
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
_("element 'name' is mandatory for RBD pool"));
return -1;
}
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
+ virReportError(VIR_ERR_XML_ERROR, "%s",
+ _("element 'name' is mandatory for Vitastor pool"));
+ return -1;
+ }
if (options->formatFromString) {
g_autofree char *format = NULL;
@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
* files, so they don't have a target */
if (def->type != VIR_STORAGE_POOL_RBD &&
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
def->type != VIR_STORAGE_POOL_GLUSTER &&
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
index fc67957cfe..720c07ef74 100644
--- a/src/conf/storage_conf.h
+++ b/src/conf/storage_conf.h
@@ -103,6 +103,7 @@ typedef enum {
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
VIR_STORAGE_POOL_ZFS, /* ZFS */
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
VIR_STORAGE_POOL_LAST,
} virStoragePoolType;
@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
index 959ec5ed40..e751dd4d6a 100644
--- a/src/conf/storage_source_conf.c
+++ b/src/conf/storage_source_conf.c
@@ -88,6 +88,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
"ssh",
"vxhs",
"nfs",
+ "vitastor",
);
@@ -1301,6 +1302,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
return 24007;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_RBD:
/* we don't provide a default for RBD */
return 0;
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
index 05b4bda16c..b5ed143c39 100644
--- a/src/conf/storage_source_conf.h
+++ b/src/conf/storage_source_conf.h
@@ -129,6 +129,7 @@ typedef enum {
VIR_STORAGE_NET_PROTOCOL_SSH,
VIR_STORAGE_NET_PROTOCOL_VXHS,
VIR_STORAGE_NET_PROTOCOL_NFS,
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
VIR_STORAGE_NET_PROTOCOL_LAST
} virStorageNetProtocol;
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
index 59fa5da372..4739167f5f 100644
--- a/src/conf/virstorageobj.c
+++ b/src/conf/virstorageobj.c
@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
return 1;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_RBD:
case VIR_STORAGE_POOL_LAST:
@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
index db7660aac4..561df34709 100644
--- a/src/libvirt-storage.c
+++ b/src/libvirt-storage.c
@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index 62e1be6672..71a1d42896 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -979,6 +979,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
index f175359307..8efcf4c329 100644
--- a/src/libxl/xen_xl.c
+++ b/src/libxl/xen_xl.c
@@ -1456,6 +1456,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
index c9f5cbbf29..dbbac36836 100644
--- a/src/qemu/qemu_block.c
+++ b/src/qemu/qemu_block.c
@@ -758,6 +758,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
}
+static virJSONValue *
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
+{
+ virJSONValue *ret = NULL;
+ virStorageNetHostDef *host;
+ size_t i;
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
+ g_autofree char *etcd = NULL;
+
+ for (i = 0; i < src->nhosts; i++) {
+ host = src->hosts + i;
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
+ return NULL;
+ }
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
+ }
+ if (src->nhosts > 0) {
+ etcd = virBufferContentAndReset(&buf);
+ }
+
+ if (virJSONValueObjectAdd(&ret,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ return NULL;
+
+ return ret;
+}
+
+
static virJSONValue *
qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src)
{
@@ -1140,6 +1172,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
return NULL;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
+ return NULL;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
driver = "sheepdog";
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
@@ -2020,6 +2058,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
case VIR_STORAGE_NET_PROTOCOL_SSH:
@@ -2400,6 +2439,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
driver = "sheepdog";
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 341c543280..61b248fa2c 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -5207,7 +5207,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
if (src->query &&
(actualType != VIR_STORAGE_TYPE_NETWORK ||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
_("query is supported only with HTTP(S) protocols"));
return -1;
@@ -10387,6 +10388,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
break;
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
index 0cac0c4146..4955ebd8d4 100644
--- a/src/qemu/qemu_snapshot.c
+++ b/src/qemu/qemu_snapshot.c
@@ -423,6 +423,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
@@ -648,6 +649,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
index 314fe930e0..fb615a8b4e 100644
--- a/src/storage/storage_driver.c
+++ b/src/storage/storage_driver.c
@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
index 80681924ea..8a3ade9ec0 100644
--- a/src/storage_file/storage_source_backingstore.c
+++ b/src/storage_file/storage_source_backingstore.c
@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
}
+static int
+virStorageSourceParseVitastorColonString(const char *colonstr,
+ virStorageSource *src)
+{
+ char *p, *e, *next;
+ g_autofree char *options = NULL;
+
+ /* optionally skip the "vitastor:" prefix if provided */
+ if (STRPREFIX(colonstr, "vitastor:"))
+ colonstr += strlen("vitastor:");
+
+ options = g_strdup(colonstr);
+
+ p = options;
+ while (*p) {
+ /* find : delimiter or end of string */
+ for (e = p; *e && *e != ':'; ++e) {
+ if (*e == '\\') {
+ e++;
+ if (*e == '\0')
+ break;
+ }
+ }
+ if (*e == '\0') {
+ next = e; /* last kv pair */
+ } else {
+ next = e + 1;
+ *e = '\0';
+ }
+
+ if (STRPREFIX(p, "image=")) {
+ src->path = g_strdup(p + strlen("image="));
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
+ src->query = g_strdup(p + strlen("etcd-prefix="));
+ } else if (STRPREFIX(p, "config-path=")) {
+ src->configFile = g_strdup(p + strlen("config-path="));
+ } else if (STRPREFIX(p, "etcd-host=")) {
+ char *h, *sep;
+
+ h = p + strlen("etcd-host=");
+ while (h < e) {
+ for (sep = h; sep < e; ++sep) {
+ if (*sep == '\\' && (sep[1] == ',' ||
+ sep[1] == ';' ||
+ sep[1] == ' ')) {
+ *sep = '\0';
+ sep += 2;
+ break;
+ }
+ }
+
+ if (virStorageSourceRBDAddHost(src, h) < 0)
+ return -1;
+
+ h = sep;
+ }
+ }
+
+ p = next;
+ }
+
+ if (!src->path) {
+ return -1;
+ }
+
+ return 0;
+}
+
+
static int
virStorageSourceParseNBDColonString(const char *nbdstr,
virStorageSource *src)
@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
return 0;
}
+static int
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
+ virJSONValue *json,
+ const char *jsonstr G_GNUC_UNUSED,
+ int opaque G_GNUC_UNUSED)
+{
+ const char *filename;
+ const char *image = virJSONValueObjectGetString(json, "image");
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
+ size_t nservers;
+ size_t i;
+
+ src->type = VIR_STORAGE_TYPE_NETWORK;
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
+
+ /* legacy syntax passed via 'filename' option */
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
+ return virStorageSourceParseVitastorColonString(filename, src);
+
+ if (!image) {
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
+ _("missing image name in Vitastor backing volume "
+ "JSON specification"));
+ return -1;
+ }
+
+ src->path = g_strdup(image);
+ src->configFile = g_strdup(conf);
+ src->query = g_strdup(etcd_prefix);
+
+ if (servers) {
+ nservers = virJSONValueArraySize(servers);
+
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
+ src->nhosts = nservers;
+
+ for (i = 0; i < nservers; i++) {
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
+ virJSONValueArrayGet(servers, i)) < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
static int
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
virJSONValue *json,
@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
index ed545848af..dbfdbe8476 100644
--- a/src/test/test_driver.c
+++ b/src/test/test_driver.c
@@ -7336,6 +7336,7 @@ testStorageVolumeTypeForPool(int pooltype)
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
return VIR_STORAGE_VOL_NETWORK;
case VIR_STORAGE_POOL_LOGICAL:
case VIR_STORAGE_POOL_DISK:
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
index eee75af746..8bd0a57bdd 100644
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='no'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
index 805950a937..852df0de16 100644
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='yes'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
index e8e40d695e..db55fe5f3a 100644
--- a/tests/storagepoolxml2argvtest.c
+++ b/tests/storagepoolxml2argvtest.c
@@ -65,6 +65,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_VSTORAGE:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_LAST:
default:
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
index 36f00cf643..5f5bd3464e 100644
--- a/tools/virsh-pool.c
+++ b/tools/virsh-pool.c
@@ -1223,6 +1223,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
case VIR_STORAGE_POOL_VSTORAGE:
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
+ break;
case VIR_STORAGE_POOL_LAST:
break;
}
--
2.43.0

View File

@@ -0,0 +1,643 @@
commit 1f7e90e36b2afca0312392979b96d31951a8d66b
Author: Vitaliy Filippov <vitalif@yourcmc.ru>
Date: Thu Jun 27 01:34:54 2024 +0300
Add Vitastor support
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
index aaad4a3da1..5f5daa8341 100644
--- a/include/libvirt/libvirt-storage.h
+++ b/include/libvirt/libvirt-storage.h
@@ -326,6 +326,7 @@ typedef enum {
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */
} virConnectListAllStoragePoolsFlags;
int virConnectListAllStoragePools(virConnectPtr conn,
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index fde594f811..66537db3e3 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -7220,7 +7220,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
src->configFile = virXPathString("string(./config/@file)", ctxt);
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
src->query = virXMLPropString(node, "query");
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
@@ -30734,6 +30735,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
case VIR_STORAGE_POOL_MPATH:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
index 395e036e8f..8a0190f85b 100644
--- a/src/conf/domain_validate.c
+++ b/src/conf/domain_validate.c
@@ -495,6 +495,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
case VIR_STORAGE_NET_PROTOCOL_RBD:
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
@@ -541,7 +542,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
}
}
- /* internal snapshots and config files are currently supported only with rbd: */
+ /* internal snapshots are currently supported only with rbd: */
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
if (src->snapshot) {
@@ -549,10 +550,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
_("<snapshot> element is currently supported only with 'rbd' disks"));
return -1;
}
+ }
+ /* config files are currently supported only with rbd and vitastor: */
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
if (src->configFile) {
virReportError(VIR_ERR_XML_ERROR, "%s",
- _("<config> element is currently supported only with 'rbd' disks"));
+ _("<config> element is currently supported only with 'rbd' and 'vitastor' disks"));
return -1;
}
}
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
index a46a824f88..4c5b720643 100644
--- a/src/conf/schemas/domaincommon.rng
+++ b/src/conf/schemas/domaincommon.rng
@@ -1997,6 +1997,35 @@
</element>
</define>
+ <define name="diskSourceNetworkProtocolVitastor">
+ <element name="source">
+ <interleave>
+ <attribute name="protocol">
+ <value>vitastor</value>
+ </attribute>
+ <ref name="diskSourceCommon"/>
+ <optional>
+ <attribute name="name"/>
+ </optional>
+ <optional>
+ <attribute name="query"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="diskSourceNetworkHost"/>
+ </zeroOrMore>
+ <optional>
+ <element name="config">
+ <attribute name="file">
+ <ref name="absFilePath"/>
+ </attribute>
+ <empty/>
+ </element>
+ </optional>
+ <empty/>
+ </interleave>
+ </element>
+ </define>
+
<define name="diskSourceNetworkProtocolISCSI">
<element name="source">
<attribute name="protocol">
@@ -2347,6 +2376,7 @@
<ref name="diskSourceNetworkProtocolSimple"/>
<ref name="diskSourceNetworkProtocolVxHS"/>
<ref name="diskSourceNetworkProtocolNFS"/>
+ <ref name="diskSourceNetworkProtocolVitastor"/>
</choice>
</define>
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
index 68842004b7..1d69a788b6 100644
--- a/src/conf/storage_conf.c
+++ b/src/conf/storage_conf.c
@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool,
"logical", "disk", "iscsi",
"iscsi-direct", "scsi", "mpath",
"rbd", "sheepdog", "gluster",
- "zfs", "vstorage",
+ "zfs", "vstorage", "vitastor",
);
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
.formatToString = virStorageFileFormatTypeToString,
}
},
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
+ .poolOptions = {
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
+ VIR_STORAGE_POOL_SOURCE_NAME),
+ },
+ .volOptions = {
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
+ .formatFromString = virStorageVolumeFormatFromString,
+ .formatToString = virStorageFileFormatTypeToString,
+ }
+ },
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
.poolOptions = {
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
_("element 'name' is mandatory for RBD pool"));
return -1;
}
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
+ virReportError(VIR_ERR_XML_ERROR, "%s",
+ _("element 'name' is mandatory for Vitastor pool"));
+ return -1;
+ }
if (options->formatFromString) {
g_autofree char *format = NULL;
@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
* files, so they don't have a target */
if (def->type != VIR_STORAGE_POOL_RBD &&
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
def->type != VIR_STORAGE_POOL_GLUSTER &&
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
index fc67957cfe..720c07ef74 100644
--- a/src/conf/storage_conf.h
+++ b/src/conf/storage_conf.h
@@ -103,6 +103,7 @@ typedef enum {
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
VIR_STORAGE_POOL_ZFS, /* ZFS */
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
VIR_STORAGE_POOL_LAST,
} virStoragePoolType;
@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
index 959ec5ed40..e751dd4d6a 100644
--- a/src/conf/storage_source_conf.c
+++ b/src/conf/storage_source_conf.c
@@ -88,6 +88,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
"ssh",
"vxhs",
"nfs",
+ "vitastor",
);
@@ -1301,6 +1302,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
return 24007;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_RBD:
/* we don't provide a default for RBD */
return 0;
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
index 05b4bda16c..b5ed143c39 100644
--- a/src/conf/storage_source_conf.h
+++ b/src/conf/storage_source_conf.h
@@ -129,6 +129,7 @@ typedef enum {
VIR_STORAGE_NET_PROTOCOL_SSH,
VIR_STORAGE_NET_PROTOCOL_VXHS,
VIR_STORAGE_NET_PROTOCOL_NFS,
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
VIR_STORAGE_NET_PROTOCOL_LAST
} virStorageNetProtocol;
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
index 59fa5da372..4739167f5f 100644
--- a/src/conf/virstorageobj.c
+++ b/src/conf/virstorageobj.c
@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
return 1;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_RBD:
case VIR_STORAGE_POOL_LAST:
@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
index db7660aac4..561df34709 100644
--- a/src/libvirt-storage.c
+++ b/src/libvirt-storage.c
@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index 62e1be6672..71a1d42896 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -979,6 +979,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
index 53f6871efc..c34b8cee1a 100644
--- a/src/libxl/xen_xl.c
+++ b/src/libxl/xen_xl.c
@@ -1456,6 +1456,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
index 738b72d7ea..5dd082fc89 100644
--- a/src/qemu/qemu_block.c
+++ b/src/qemu/qemu_block.c
@@ -758,6 +758,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
}
+static virJSONValue *
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
+{
+ virJSONValue *ret = NULL;
+ virStorageNetHostDef *host;
+ size_t i;
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
+ g_autofree char *etcd = NULL;
+
+ for (i = 0; i < src->nhosts; i++) {
+ host = src->hosts + i;
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
+ return NULL;
+ }
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
+ }
+ if (src->nhosts > 0) {
+ etcd = virBufferContentAndReset(&buf);
+ }
+
+ if (virJSONValueObjectAdd(&ret,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ return NULL;
+
+ return ret;
+}
+
+
static virJSONValue *
qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src)
{
@@ -1140,6 +1172,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
return NULL;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
+ return NULL;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
driver = "sheepdog";
if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src)))
@@ -2020,6 +2058,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
case VIR_STORAGE_NET_PROTOCOL_SSH:
@@ -2400,6 +2439,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
driver = "sheepdog";
if (!(location = qemuBlockStorageSourceGetSheepdogProps(src)))
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index bda62f2e5c..84b4e5f2b8 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -5260,7 +5260,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
if (src->query &&
(actualType != VIR_STORAGE_TYPE_NETWORK ||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
_("query is supported only with HTTP(S) protocols"));
return -1;
@@ -10514,6 +10515,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
break;
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
index f5260c4a22..2f9d8406fe 100644
--- a/src/qemu/qemu_snapshot.c
+++ b/src/qemu/qemu_snapshot.c
@@ -423,6 +423,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
@@ -648,6 +649,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
index 86c03762d2..630c6eff1a 100644
--- a/src/storage/storage_driver.c
+++ b/src/storage/storage_driver.c
@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
index 80681924ea..8a3ade9ec0 100644
--- a/src/storage_file/storage_source_backingstore.c
+++ b/src/storage_file/storage_source_backingstore.c
@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
}
+static int
+virStorageSourceParseVitastorColonString(const char *colonstr,
+ virStorageSource *src)
+{
+ char *p, *e, *next;
+ g_autofree char *options = NULL;
+
+ /* optionally skip the "vitastor:" prefix if provided */
+ if (STRPREFIX(colonstr, "vitastor:"))
+ colonstr += strlen("vitastor:");
+
+ options = g_strdup(colonstr);
+
+ p = options;
+ while (*p) {
+ /* find : delimiter or end of string */
+ for (e = p; *e && *e != ':'; ++e) {
+ if (*e == '\\') {
+ e++;
+ if (*e == '\0')
+ break;
+ }
+ }
+ if (*e == '\0') {
+ next = e; /* last kv pair */
+ } else {
+ next = e + 1;
+ *e = '\0';
+ }
+
+ if (STRPREFIX(p, "image=")) {
+ src->path = g_strdup(p + strlen("image="));
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
+ src->query = g_strdup(p + strlen("etcd-prefix="));
+ } else if (STRPREFIX(p, "config-path=")) {
+ src->configFile = g_strdup(p + strlen("config-path="));
+ } else if (STRPREFIX(p, "etcd-host=")) {
+ char *h, *sep;
+
+ h = p + strlen("etcd-host=");
+ while (h < e) {
+ for (sep = h; sep < e; ++sep) {
+ if (*sep == '\\' && (sep[1] == ',' ||
+ sep[1] == ';' ||
+ sep[1] == ' ')) {
+ *sep = '\0';
+ sep += 2;
+ break;
+ }
+ }
+
+ if (virStorageSourceRBDAddHost(src, h) < 0)
+ return -1;
+
+ h = sep;
+ }
+ }
+
+ p = next;
+ }
+
+ if (!src->path) {
+ return -1;
+ }
+
+ return 0;
+}
+
+
static int
virStorageSourceParseNBDColonString(const char *nbdstr,
virStorageSource *src)
@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
return 0;
}
+static int
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
+ virJSONValue *json,
+ const char *jsonstr G_GNUC_UNUSED,
+ int opaque G_GNUC_UNUSED)
+{
+ const char *filename;
+ const char *image = virJSONValueObjectGetString(json, "image");
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
+ size_t nservers;
+ size_t i;
+
+ src->type = VIR_STORAGE_TYPE_NETWORK;
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
+
+ /* legacy syntax passed via 'filename' option */
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
+ return virStorageSourceParseVitastorColonString(filename, src);
+
+ if (!image) {
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
+ _("missing image name in Vitastor backing volume "
+ "JSON specification"));
+ return -1;
+ }
+
+ src->path = g_strdup(image);
+ src->configFile = g_strdup(conf);
+ src->query = g_strdup(etcd_prefix);
+
+ if (servers) {
+ nservers = virJSONValueArraySize(servers);
+
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
+ src->nhosts = nservers;
+
+ for (i = 0; i < nservers; i++) {
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
+ virJSONValueArrayGet(servers, i)) < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
static int
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
virJSONValue *json,
@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
index d2d1bc43e3..31a92e4a01 100644
--- a/src/test/test_driver.c
+++ b/src/test/test_driver.c
@@ -7339,6 +7339,7 @@ testStorageVolumeTypeForPool(int pooltype)
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
return VIR_STORAGE_VOL_NETWORK;
case VIR_STORAGE_POOL_LOGICAL:
case VIR_STORAGE_POOL_DISK:
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
index eee75af746..8bd0a57bdd 100644
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='no'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
index 805950a937..852df0de16 100644
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='yes'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
index e8e40d695e..db55fe5f3a 100644
--- a/tests/storagepoolxml2argvtest.c
+++ b/tests/storagepoolxml2argvtest.c
@@ -65,6 +65,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_VSTORAGE:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_LAST:
default:
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
index f9aad8ded0..64704b4288 100644
--- a/tools/virsh-pool.c
+++ b/tools/virsh-pool.c
@@ -1187,6 +1187,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
case VIR_STORAGE_POOL_VSTORAGE:
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
+ break;
case VIR_STORAGE_POOL_LAST:
break;
}

288
patches/nova-28.diff Normal file
View File

@@ -0,0 +1,288 @@
diff --git a/nova/virt/image/model.py b/nova/virt/image/model.py
index 971f7e9c07..ec3fca72cb 100644
--- a/nova/virt/image/model.py
+++ b/nova/virt/image/model.py
@@ -129,3 +129,22 @@ class RBDImage(Image):
self.user = user
self.password = password
self.servers = servers
+
+
+class VitastorImage(Image):
+ """Class for images in a remote Vitastor cluster"""
+
+ def __init__(self, name, etcd_address = None, etcd_prefix = None, config_path = None):
+ """Create a new Vitastor image object
+
+ :param name: name of the image
+ :param etcd_address: etcd URL(s) (optional)
+ :param etcd_prefix: etcd prefix (optional)
+ :param config_path: path to the configuration (optional)
+ """
+ super(VitastorImage, self).__init__(FORMAT_RAW)
+
+ self.name = name
+ self.etcd_address = etcd_address
+ self.etcd_prefix = etcd_prefix
+ self.config_path = config_path
diff --git a/nova/virt/images.py b/nova/virt/images.py
index 5358f3766a..ebe3d6effb 100644
--- a/nova/virt/images.py
+++ b/nova/virt/images.py
@@ -41,7 +41,7 @@ IMAGE_API = glance.API()
def qemu_img_info(path, format=None):
"""Return an object containing the parsed output from qemu-img info."""
- if not os.path.exists(path) and not path.startswith('rbd:'):
+ if not os.path.exists(path) and not path.startswith('rbd:') and not path.startswith('vitastor:'):
raise exception.DiskNotFound(location=path)
info = nova.privsep.qemu.unprivileged_qemu_img_info(path, format=format)
@@ -50,7 +50,7 @@ def qemu_img_info(path, format=None):
def privileged_qemu_img_info(path, format=None, output_format='json'):
"""Return an object containing the parsed output from qemu-img info."""
- if not os.path.exists(path) and not path.startswith('rbd:'):
+ if not os.path.exists(path) and not path.startswith('rbd:') and not path.startswith('vitastor:'):
raise exception.DiskNotFound(location=path)
info = nova.privsep.qemu.privileged_qemu_img_info(path, format=format)
diff --git a/nova/virt/libvirt/config.py b/nova/virt/libvirt/config.py
index f9475776b3..a2e18aab67 100644
--- a/nova/virt/libvirt/config.py
+++ b/nova/virt/libvirt/config.py
@@ -1060,6 +1060,8 @@ class LibvirtConfigGuestDisk(LibvirtConfigGuestDevice):
self.driver_iommu = False
self.source_path = None
self.source_protocol = None
+ self.source_query = None
+ self.source_config = None
self.source_name = None
self.source_hosts = []
self.source_ports = []
@@ -1189,6 +1191,10 @@ class LibvirtConfigGuestDisk(LibvirtConfigGuestDevice):
source = etree.Element("source", protocol=self.source_protocol)
if self.source_name is not None:
source.set('name', self.source_name)
+ if self.source_query is not None:
+ source.set('query', self.source_query)
+ if self.source_config is not None:
+ source.append(etree.Element('config', file=self.source_config))
hosts_info = zip(self.source_hosts, self.source_ports)
for name, port in hosts_info:
host = etree.Element('host', name=name)
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 391231c527..f38faa1608 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -179,6 +179,7 @@ VOLUME_DRIVERS = {
'local': 'nova.virt.libvirt.volume.volume.LibvirtVolumeDriver',
'fake': 'nova.virt.libvirt.volume.volume.LibvirtFakeVolumeDriver',
'rbd': 'nova.virt.libvirt.volume.net.LibvirtNetVolumeDriver',
+ 'vitastor': 'nova.virt.libvirt.volume.vitastor.LibvirtVitastorVolumeDriver',
'nfs': 'nova.virt.libvirt.volume.nfs.LibvirtNFSVolumeDriver',
'smbfs': 'nova.virt.libvirt.volume.smbfs.LibvirtSMBFSVolumeDriver',
'fibre_channel': 'nova.virt.libvirt.volume.fibrechannel.LibvirtFibreChannelVolumeDriver', # noqa:E501
@@ -385,10 +386,10 @@ class LibvirtDriver(driver.ComputeDriver):
# This prevents the risk of one test setting a capability
# which bleeds over into other tests.
- # LVM and RBD require raw images. If we are not configured to
+ # LVM, RBD, Vitastor require raw images. If we are not configured to
# force convert images into raw format, then we _require_ raw
# images only.
- raw_only = ('rbd', 'lvm')
+ raw_only = ('rbd', 'lvm', 'vitastor')
requires_raw_image = (CONF.libvirt.images_type in raw_only and
not CONF.force_raw_images)
requires_ploop_image = CONF.libvirt.virt_type == 'parallels'
@@ -775,12 +776,12 @@ class LibvirtDriver(driver.ComputeDriver):
# Some imagebackends are only able to import raw disk images,
# and will fail if given any other format. See the bug
# https://bugs.launchpad.net/nova/+bug/1816686 for more details.
- if CONF.libvirt.images_type in ('rbd',):
+ if CONF.libvirt.images_type in ('rbd', 'vitastor'):
if not CONF.force_raw_images:
msg = _("'[DEFAULT]/force_raw_images = False' is not "
- "allowed with '[libvirt]/images_type = rbd'. "
+ "allowed with '[libvirt]/images_type = rbd' or 'vitastor'. "
"Please check the two configs and if you really "
- "do want to use rbd as images_type, set "
+ "do want to use rbd or vitastor as images_type, set "
"force_raw_images to True.")
raise exception.InvalidConfiguration(msg)
@@ -2603,6 +2604,16 @@ class LibvirtDriver(driver.ComputeDriver):
if connection_info['data'].get('auth_enabled'):
username = connection_info['data']['auth_username']
path = f"rbd:{volume_name}:id={username}"
+ elif connection_info['driver_volume_type'] == 'vitastor':
+ volume_name = connection_info['data']['name']
+ path = 'vitastor:image='+volume_name.replace(':', '\\:')
+ for k in [ 'config_path', 'etcd_address', 'etcd_prefix' ]:
+ if k in connection_info['data']:
+ kk = k
+ if kk == 'etcd_address':
+ # FIXME use etcd_address in qemu driver
+ kk = 'etcd_host'
+ path += ":"+kk.replace('_', '-')+"="+connection_info['data'][k].replace(':', '\\:')
else:
path = 'unknown'
raise exception.DiskNotFound(location='unknown')
@@ -2827,8 +2838,8 @@ class LibvirtDriver(driver.ComputeDriver):
image_format = CONF.libvirt.snapshot_image_format or source_type
- # NOTE(bfilippov): save lvm and rbd as raw
- if image_format == 'lvm' or image_format == 'rbd':
+ # NOTE(bfilippov): save lvm and rbd and vitastor as raw
+ if image_format == 'lvm' or image_format == 'rbd' or image_format == 'vitastor':
image_format = 'raw'
metadata = self._create_snapshot_metadata(instance.image_meta,
@@ -2899,7 +2910,7 @@ class LibvirtDriver(driver.ComputeDriver):
expected_state=task_states.IMAGE_UPLOADING)
# TODO(nic): possibly abstract this out to the root_disk
- if source_type == 'rbd' and live_snapshot:
+ if (source_type == 'rbd' or source_type == 'vitastor') and live_snapshot:
# Standard snapshot uses qemu-img convert from RBD which is
# not safe to run with live_snapshot.
live_snapshot = False
@@ -4099,7 +4110,7 @@ class LibvirtDriver(driver.ComputeDriver):
# cleanup rescue volume
lvm.remove_volumes([lvmdisk for lvmdisk in self._lvm_disks(instance)
if lvmdisk.endswith('.rescue')])
- if CONF.libvirt.images_type == 'rbd':
+ if CONF.libvirt.images_type == 'rbd' or CONF.libvirt.images_type == 'vitastor':
filter_fn = lambda disk: (disk.startswith(instance.uuid) and
disk.endswith('.rescue'))
rbd_utils.RBDDriver().cleanup_volumes(filter_fn)
@@ -4356,6 +4367,8 @@ class LibvirtDriver(driver.ComputeDriver):
# TODO(mikal): there is a bug here if images_type has
# changed since creation of the instance, but I am pretty
# sure that this bug already exists.
+ if CONF.libvirt.images_type == 'vitastor':
+ return 'vitastor'
return 'rbd' if CONF.libvirt.images_type == 'rbd' else 'raw'
@staticmethod
@@ -4764,10 +4777,10 @@ class LibvirtDriver(driver.ComputeDriver):
finally:
# NOTE(mikal): if the config drive was imported into RBD,
# then we no longer need the local copy
- if CONF.libvirt.images_type == 'rbd':
+ if CONF.libvirt.images_type == 'rbd' or CONF.libvirt.images_type == 'vitastor':
LOG.info('Deleting local config drive %(path)s '
- 'because it was imported into RBD.',
- {'path': config_disk_local_path},
+ 'because it was imported into %(type).',
+ {'path': config_disk_local_path, 'type': CONF.libvirt.images_type},
instance=instance)
os.unlink(config_disk_local_path)
diff --git a/nova/virt/libvirt/utils.py b/nova/virt/libvirt/utils.py
index da2a6e8b8a..52c02e72f1 100644
--- a/nova/virt/libvirt/utils.py
+++ b/nova/virt/libvirt/utils.py
@@ -340,6 +340,10 @@ def find_disk(guest: libvirt_guest.Guest) -> ty.Tuple[str, ty.Optional[str]]:
disk_path = disk.source_name
if disk_path:
disk_path = 'rbd:' + disk_path
+ elif not disk_path and disk.source_protocol == 'vitastor':
+ disk_path = disk.source_name
+ if disk_path:
+ disk_path = 'vitastor:' + disk_path
if not disk_path:
raise RuntimeError(_("Can't retrieve root device path "
@@ -354,6 +358,8 @@ def get_disk_type_from_path(path: str) -> ty.Optional[str]:
return 'lvm'
elif path.startswith('rbd:'):
return 'rbd'
+ elif path.startswith('vitastor:'):
+ return 'vitastor'
elif (os.path.isdir(path) and
os.path.exists(os.path.join(path, "DiskDescriptor.xml"))):
return 'ploop'
diff --git a/nova/virt/libvirt/volume/vitastor.py b/nova/virt/libvirt/volume/vitastor.py
new file mode 100644
index 0000000000..0256df62c1
--- /dev/null
+++ b/nova/virt/libvirt/volume/vitastor.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2021+, Vitaliy Filippov <vitalif@yourcmc.ru>
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from os_brick import exception as os_brick_exception
+from os_brick import initiator
+from os_brick.initiator import connector
+from oslo_log import log as logging
+
+import nova.conf
+from nova import utils
+from nova.virt.libvirt.volume import volume as libvirt_volume
+
+
+CONF = nova.conf.CONF
+LOG = logging.getLogger(__name__)
+
+
+class LibvirtVitastorVolumeDriver(libvirt_volume.LibvirtBaseVolumeDriver):
+ """Driver to attach Vitastor volumes to libvirt."""
+ def __init__(self, host):
+ super(LibvirtVitastorVolumeDriver, self).__init__(host, is_block_dev=False)
+
+ def connect_volume(self, connection_info, instance):
+ pass
+
+ def disconnect_volume(self, connection_info, instance, force=False):
+ pass
+
+ def get_config(self, connection_info, disk_info):
+ """Returns xml for libvirt."""
+ conf = super(LibvirtVitastorVolumeDriver, self).get_config(connection_info, disk_info)
+ conf.source_type = 'network'
+ conf.source_protocol = 'vitastor'
+ conf.source_name = connection_info['data'].get('name')
+ conf.source_query = connection_info['data'].get('etcd_prefix') or None
+ conf.source_config = connection_info['data'].get('config_path') or None
+ conf.source_hosts = []
+ conf.source_ports = []
+ addresses = connection_info['data'].get('etcd_address', '')
+ if addresses:
+ if not isinstance(addresses, list):
+ addresses = addresses.split(',')
+ for addr in addresses:
+ if addr.startswith('https://'):
+ raise NotImplementedError('Vitastor block driver does not support SSL for etcd communication yet')
+ if addr.startswith('http://'):
+ addr = addr[7:]
+ addr = addr.rstrip('/')
+ if addr.endswith('/v3'):
+ addr = addr[0:-3]
+ p = addr.find('/')
+ if p > 0:
+ raise NotImplementedError('libvirt does not support custom URL paths for Vitastor etcd yet. Use /etc/vitastor/vitastor.conf')
+ p = addr.find(':')
+ port = '2379'
+ if p > 0:
+ port = addr[p+1:]
+ addr = addr[0:p]
+ conf.source_hosts.append(addr)
+ conf.source_ports.append(port)
+ return conf
+
+ def extend_volume(self, connection_info, instance, requested_size):
+ return requested_size

View File

@@ -0,0 +1,190 @@
diff --git a/block/meson.build b/block/meson.build
index 59ff6d380c..abde3715c2 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -109,6 +109,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index 6c77d9687d..390683ee71 100644
--- a/meson.build
+++ b/meson.build
@@ -1295,6 +1295,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2157,6 +2177,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4356,6 +4377,7 @@ summary_info += {'fdt support': fdt_opt == 'disabled' ? false : fdt_opt}
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index c9baeda639..85e1df5a56 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index ca390c5700..8f11ae9fa5 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3201,7 +3201,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4255,6 +4255,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4713,6 +4735,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5148,6 +5171,17 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5370,6 +5404,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
index 76781f17f4..ac5fe3aa08 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -30,7 +30,7 @@
--with-suffix="qemu-kvm" \
--firmwarepath=/usr/share/qemu-firmware \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -176,6 +176,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 680fa3f581..dab422bf04 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -445,6 +446,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@@ -0,0 +1,190 @@
diff --git a/block/meson.build b/block/meson.build
index e1f03fd773..db0cfb2321 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -114,6 +114,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index 91a0aa64c6..e8bc710578 100644
--- a/meson.build
+++ b/meson.build
@@ -1452,6 +1452,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2250,6 +2270,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4443,6 +4464,7 @@ summary_info += {'fdt support': fdt_opt == 'disabled' ? false : fdt_opt}
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 0a99a059ec..16dc440118 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -194,6 +194,8 @@ option('lzo', type : 'feature', value : 'auto',
description: 'lzo compression support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 746d1694c2..fb7aa4423b 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3203,7 +3203,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4285,6 +4285,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4741,6 +4763,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5180,6 +5203,17 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5402,6 +5436,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure b/scripts/ci/org.centos/stream/8/x86_64/configure
index 76781f17f4..ac5fe3aa08 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -30,7 +30,7 @@
--with-suffix="qemu-kvm" \
--firmwarepath=/usr/share/qemu-firmware \
--target-list="x86_64-softmmu" \
---block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
+--block-drv-rw-whitelist="qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle,gluster" \
--audio-drv-list="" \
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
@@ -176,6 +176,7 @@
--enable-opengl \
--enable-pie \
--enable-rbd \
+--enable-vitastor \
--enable-rdma \
--enable-seccomp \
--enable-snappy \
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 680fa3f581..dab422bf04 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -168,6 +168,7 @@ meson_options_help() {
printf "%s\n" ' qed qed image format support'
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rutabaga-gfx rutabaga_gfx support'
@@ -445,6 +446,8 @@ _meson_option_parse() {
--disable-qom-cast-debug) printf "%s" -Dqom_cast_debug=false ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@@ -108,10 +108,11 @@ npm install --production
cd ..
mkdir -p %buildroot/usr/lib/vitastor
cp -r mon %buildroot/usr/lib/vitastor
mv %buildroot/usr/lib/vitastor/mon/scripts/make-etcd %buildroot/usr/lib/vitastor/mon/
mkdir -p %buildroot/lib/systemd/system
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
cp mon/scripts/vitastor.target mon/scripts/vitastor-mon.service mon/scripts/vitastor-osd@.service %buildroot/lib/systemd/system
mkdir -p %buildroot/lib/udev/rules.d
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
cp mon/scripts/90-vitastor.rules %buildroot/lib/udev/rules.d
%files

View File

@@ -105,10 +105,11 @@ npm install --production
cd ..
mkdir -p %buildroot/usr/lib/vitastor
cp -r mon %buildroot/usr/lib/vitastor
mv %buildroot/usr/lib/vitastor/mon/scripts/make-etcd %buildroot/usr/lib/vitastor/mon/
mkdir -p %buildroot/lib/systemd/system
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
cp mon/scripts/vitastor.target mon/scripts/vitastor-mon.service mon/scripts/vitastor-osd@.service %buildroot/lib/systemd/system
mkdir -p %buildroot/lib/udev/rules.d
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
cp mon/scripts/90-vitastor.rules %buildroot/lib/udev/rules.d
%files

View File

@@ -98,10 +98,11 @@ npm install --production
cd ..
mkdir -p %buildroot/usr/lib/vitastor
cp -r mon %buildroot/usr/lib/vitastor
mv %buildroot/usr/lib/vitastor/mon/scripts/make-etcd %buildroot/usr/lib/vitastor/mon/
mkdir -p %buildroot/lib/systemd/system
cp mon/vitastor.target mon/vitastor-mon.service mon/vitastor-osd@.service %buildroot/lib/systemd/system
cp mon/scripts/vitastor.target mon/scripts/vitastor-mon.service mon/scripts/vitastor-osd@.service %buildroot/lib/systemd/system
mkdir -p %buildroot/lib/udev/rules.d
cp mon/90-vitastor.rules %buildroot/lib/udev/rules.d
cp mon/scripts/90-vitastor.rules %buildroot/lib/udev/rules.d
%files

View File

@@ -366,6 +366,7 @@ resume_0:
!flusher->flush_queue.size() || !flusher->dequeuing)
{
stop_flusher:
flusher->dequeuing = false;
if (flusher->trim_wanted > 0 && try_trim)
{
// Attempt forced trim
@@ -373,7 +374,6 @@ stop_flusher:
flusher->active_flushers++;
goto trim_journal;
}
flusher->dequeuing = false;
wait_state = 0;
return true;
}

View File

@@ -34,7 +34,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
{
// peer_osd just dropped connection
// determine WHICH dirty_buffers are now obsolete and repeat them
if (wb->repeat_ops_for(this, peer_osd) > 0)
if (wb->repeat_ops_for(this, peer_osd, 0, 0) > 0)
{
continue_ops();
}
@@ -52,7 +52,8 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
st_cli.tfd = tfd;
st_cli.on_load_config_hook = [this](json11::Json::object & cfg) { on_load_config_hook(cfg); };
st_cli.on_change_osd_state_hook = [this](uint64_t peer_osd) { on_change_osd_state_hook(peer_osd); };
st_cli.on_change_hook = [this](std::map<std::string, etcd_kv_t> & changes) { on_change_hook(changes); };
st_cli.on_change_pool_config_hook = [this]() { on_change_pool_config_hook(); };
st_cli.on_change_pg_state_hook = [this](pool_id_t pool_id, pg_num_t pg_num, osd_num_t prev_primary) { on_change_pg_state_hook(pool_id, pg_num, prev_primary); };
st_cli.on_load_pgs_hook = [this](bool success) { on_load_pgs_hook(success); };
st_cli.on_reload_hook = [this]() { st_cli.load_global_config(); };
@@ -77,11 +78,6 @@ cluster_client_t::~cluster_client_t()
cluster_op_t::~cluster_op_t()
{
if (buf)
{
free(buf);
buf = NULL;
}
if (bitmap_buf)
{
free(bitmap_buf);
@@ -427,7 +423,7 @@ void cluster_client_t::on_load_pgs_hook(bool success)
continue_ops();
}
void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes)
void cluster_client_t::on_change_pool_config_hook()
{
for (auto pool_item: st_cli.pool_config)
{
@@ -450,6 +446,19 @@ void cluster_client_t::on_change_hook(std::map<std::string, etcd_kv_t> & changes
continue_ops();
}
void cluster_client_t::on_change_pg_state_hook(pool_id_t pool_id, pg_num_t pg_num, osd_num_t prev_primary)
{
auto & pg_cfg = st_cli.pool_config[pool_id].pg_config[pg_num];
if (pg_cfg.cur_primary != prev_primary)
{
// Repeat this PG operations because an OSD which stopped being primary may not fsync operations
if (wb->repeat_ops_for(this, 0, pool_id, pg_num) > 0)
{
continue_ops();
}
}
}
bool cluster_client_t::get_immediate_commit(uint64_t inode)
{
if (enable_writeback)
@@ -570,6 +579,14 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
{
op->cur_inode = op->inode;
op->retval = 0;
op->state = 0;
op->retry_after = 0;
op->inflight_count = 0;
op->done_count = 0;
op->part_bitmaps = NULL;
op->bitmap_buf_size = 0;
op->prev_wait = 0;
assert(!op->prev && !op->next);
// check alignment, readonly flag and so on
if (!check_rw(op))
{
@@ -600,7 +617,9 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
{
if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
{
wb->copy_write(op, CACHE_WRITTEN);
uint64_t flush_id = ++wb->last_flush_id;
wb->copy_write(op, CACHE_REPEATING, flush_id);
op->flush_id = flush_id;
}
if (dirty_bytes >= client_max_dirty_bytes || dirty_ops >= client_max_dirty_ops)
{
@@ -816,6 +835,10 @@ resume_2:
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
op->retval = op->len / pool_cfg.bitmap_granularity;
}
if (op->flush_id)
{
wb->mark_flush_written(op->inode, op->offset, op->len, op->flush_id);
}
erase_op(op);
return 1;
}
@@ -988,6 +1011,29 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
}
}
bool cluster_client_t::affects_pg(uint64_t inode, uint64_t offset, uint64_t len, pool_id_t pool_id, pg_num_t pg_num)
{
if (INODE_POOL(inode) != pool_id)
{
return false;
}
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(inode));
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
uint64_t pg_block_size = pool_cfg.data_block_size * pg_data_size;
uint64_t first_stripe = (offset / pg_block_size) * pg_block_size;
uint64_t last_stripe = len > 0 ? ((offset + len - 1) / pg_block_size) * pg_block_size : first_stripe;
if ((last_stripe/pool_cfg.pg_stripe_size) - (first_stripe/pool_cfg.pg_stripe_size) + 1 >= pool_cfg.real_pg_count)
{
// All PGs are affected
return true;
}
pg_num_t first_pg_num = (first_stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
pg_num_t last_pg_num = (last_stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
return (first_pg_num <= last_pg_num
? (pg_num >= first_pg_num && pg_num <= last_pg_num)
: (pg_num >= first_pg_num || pg_num <= last_pg_num));
}
bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd)
{
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(inode));
@@ -1210,7 +1256,9 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
// So do all these things after modifying operation state, otherwise we may hit reenterability bugs
// FIXME postpone such things to set_immediate here to avoid bugs
// Set op->retry_after to retry operation after a short pause (not immediately)
if (!op->retry_after)
if (!op->retry_after && (op->retval == -EPIPE ||
op->retval == -EIO && client_eio_retry_interval ||
op->retval == -ENOSPC && client_retry_enospc))
{
op->retry_after = op->retval != -EPIPE ? client_eio_retry_interval : client_retry_interval;
}

View File

@@ -56,8 +56,6 @@ struct cluster_op_t
protected:
int state = 0;
uint64_t cur_inode; // for snapshot reads
void *buf = NULL;
cluster_op_t *orig_op = NULL;
bool needs_reslice = false;
int retry_after = 0;
int inflight_count = 0, done_count = 0;
@@ -66,6 +64,7 @@ protected:
unsigned bitmap_buf_size = 0;
cluster_op_t *prev = NULL, *next = NULL;
int prev_wait = 0;
uint64_t flush_id = 0;
friend class cluster_client_t;
friend class writeback_cache_t;
};
@@ -81,6 +80,7 @@ class cluster_client_t
ring_loop_t *ringloop;
std::map<pool_id_t, uint64_t> pg_counts;
std::map<pool_pg_num_t, osd_num_t> pg_primary;
// client_max_dirty_* is actually "max unsynced", for the case when immediate_commit is off
uint64_t client_max_dirty_bytes = 0;
uint64_t client_max_dirty_ops = 0;
@@ -146,9 +146,11 @@ public:
protected:
bool affects_osd(uint64_t inode, uint64_t offset, uint64_t len, osd_num_t osd);
bool affects_pg(uint64_t inode, uint64_t offset, uint64_t len, pool_id_t pool_id, pg_num_t pg_num);
void on_load_config_hook(json11::Json::object & config);
void on_load_pgs_hook(bool success);
void on_change_hook(std::map<std::string, etcd_kv_t> & changes);
void on_change_pool_config_hook();
void on_change_pg_state_hook(pool_id_t pool_id, pg_num_t pg_num, osd_num_t prev_primary);
void on_change_osd_state_hook(uint64_t peer_osd);
void execute_internal(cluster_op_t *op);
void unshift_op(cluster_op_t *op);

View File

@@ -46,11 +46,12 @@ public:
bool is_left_merged(dirty_buf_it_t dirty_it);
bool is_right_merged(dirty_buf_it_t dirty_it);
bool is_merged(const dirty_buf_it_t & dirty_it);
void copy_write(cluster_op_t *op, int state);
int repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd);
void copy_write(cluster_op_t *op, int state, uint64_t new_flush_id = 0);
int repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd, pool_id_t pool_id, pg_num_t pg_num);
void start_writebacks(cluster_client_t *cli, int count);
bool read_from_cache(cluster_op_t *op, uint32_t bitmap_granularity);
void flush_buffers(cluster_client_t *cli, dirty_buf_it_t from_it, dirty_buf_it_t to_it);
void mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id);
void fsync_start();
void fsync_error();
void fsync_ok();

View File

@@ -71,7 +71,7 @@ bool writeback_cache_t::is_merged(const dirty_buf_it_t & dirty_it)
return is_left_merged(dirty_it) || is_right_merged(dirty_it);
}
void writeback_cache_t::copy_write(cluster_op_t *op, int state)
void writeback_cache_t::copy_write(cluster_op_t *op, int state, uint64_t new_flush_id)
{
// Save operation for replay when one of PGs goes out of sync
// (primary OSD drops our connection in this case)
@@ -180,6 +180,7 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state)
.buf = buf,
.len = op->len,
.state = state,
.flush_id = new_flush_id,
.refcnt = refcnt,
});
if (state == CACHE_DIRTY)
@@ -208,7 +209,7 @@ void writeback_cache_t::copy_write(cluster_op_t *op, int state)
}
}
int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd)
int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd, pool_id_t pool_id, pg_num_t pg_num)
{
int repeated = 0;
if (dirty_buffers.size())
@@ -218,8 +219,11 @@ int writeback_cache_t::repeat_ops_for(cluster_client_t *cli, osd_num_t peer_osd)
for (auto wr_it = dirty_buffers.begin(), flush_it = wr_it, last_it = wr_it; ; )
{
bool end = wr_it == dirty_buffers.end();
bool flush_this = !end && wr_it->second.state != CACHE_REPEATING &&
cli->affects_osd(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, peer_osd);
bool flush_this = !end && wr_it->second.state != CACHE_REPEATING;
if (peer_osd)
flush_this = flush_this && cli->affects_osd(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, peer_osd);
if (pool_id && pg_num)
flush_this = flush_this && cli->affects_pg(wr_it->first.inode, wr_it->first.stripe, wr_it->second.len, pool_id, pg_num);
if (flush_it != wr_it && (end || !flush_this ||
wr_it->first.inode != flush_it->first.inode ||
wr_it->first.stripe != last_it->first.stripe+last_it->second.len))
@@ -265,7 +269,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
writebacks_active++;
op->callback = [this, flush_id](cluster_op_t* op)
{
// Buffer flushes should be always retried, regardless of the error,
// Buffer flushes are always retried, regardless of the error,
// so they should never result in an error here
assert(op->retval == op->len);
for (auto fl_it = flushed_buffers.find(flush_id);
@@ -277,16 +281,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
}
flushed_buffers.erase(fl_it++);
}
for (auto dirty_it = find_dirty(op->inode, op->offset);
dirty_it != dirty_buffers.end() && dirty_it->first.inode == op->inode &&
dirty_it->first.stripe < op->offset+op->len; dirty_it++)
{
if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
{
dirty_it->second.flush_id = 0;
dirty_it->second.state = CACHE_WRITTEN;
}
}
mark_flush_written(op->inode, op->offset, op->len, flush_id);
delete op;
writebacks_active--;
// We can't call execute_internal because it affects an invalid copy of the list here
@@ -304,6 +299,20 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
}
}
void writeback_cache_t::mark_flush_written(uint64_t inode, uint64_t offset, uint64_t len, uint64_t flush_id)
{
for (auto dirty_it = find_dirty(inode, offset);
dirty_it != dirty_buffers.end() && dirty_it->first.inode == inode &&
dirty_it->first.stripe < offset+len; dirty_it++)
{
if (dirty_it->second.flush_id == flush_id && dirty_it->second.state == CACHE_REPEATING)
{
dirty_it->second.flush_id = 0;
dirty_it->second.state = CACHE_WRITTEN;
}
}
}
void writeback_cache_t::start_writebacks(cluster_client_t *cli, int count)
{
if (!writeback_queue.size())

View File

@@ -253,7 +253,7 @@ void etcd_state_client_t::parse_config(const json11::Json & config)
this->etcd_ws_keepalive_interval = config["etcd_ws_keepalive_interval"].uint64_value();
if (this->etcd_ws_keepalive_interval <= 0)
{
this->etcd_ws_keepalive_interval = 30;
this->etcd_ws_keepalive_interval = 5;
}
this->max_etcd_attempts = config["max_etcd_attempts"].uint64_value();
if (this->max_etcd_attempts <= 0)
@@ -890,6 +890,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
}
}
}
if (on_change_pool_config_hook)
{
on_change_pool_config_hook();
}
}
else if (key == etcd_prefix+"/config/pgs")
{
@@ -1028,13 +1032,19 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
else if (value.is_null())
{
auto & pg_cfg = this->pool_config[pool_id].pg_config[pg_num];
auto prev_primary = pg_cfg.cur_primary;
pg_cfg.state_exists = false;
pg_cfg.cur_primary = 0;
pg_cfg.cur_state = 0;
if (on_change_pg_state_hook)
{
on_change_pg_state_hook(pool_id, pg_num, prev_primary);
}
}
else
{
auto & pg_cfg = this->pool_config[pool_id].pg_config[pg_num];
auto prev_primary = pg_cfg.cur_primary;
pg_cfg.state_exists = true;
osd_num_t cur_primary = value["primary"].uint64_value();
int state = 0;
@@ -1065,6 +1075,10 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
}
pg_cfg.cur_primary = cur_primary;
pg_cfg.cur_state = state;
if (on_change_pg_state_hook)
{
on_change_pg_state_hook(pool_id, pg_num, prev_primary);
}
}
}
else if (key.substr(0, etcd_prefix.length()+11) == etcd_prefix+"/osd/state/")

View File

@@ -103,7 +103,7 @@ protected:
void pick_next_etcd();
public:
int etcd_keepalive_timeout = 30;
int etcd_ws_keepalive_interval = 30;
int etcd_ws_keepalive_interval = 5;
int max_etcd_attempts = 5;
int etcd_quick_timeout = 1000;
int etcd_slow_timeout = 5000;
@@ -127,6 +127,8 @@ public:
std::function<void(json11::Json::object &)> on_load_config_hook;
std::function<json11::Json()> load_pgs_checks_hook;
std::function<void(bool)> on_load_pgs_hook;
std::function<void()> on_change_pool_config_hook;
std::function<void(pool_id_t, pg_num_t, osd_num_t)> on_change_pg_state_hook;
std::function<void(pool_id_t, pg_num_t)> on_change_pg_history_hook;
std::function<void(osd_num_t)> on_change_osd_state_hook;
std::function<void()> on_reload_hook;

View File

@@ -271,7 +271,7 @@ void http_co_t::close_connection()
}
if (peer_fd >= 0)
{
tfd->set_fd_handler(peer_fd, false, NULL);
tfd->set_fd_handler(peer_fd, 0, NULL);
close(peer_fd);
peer_fd = -1;
}
@@ -314,7 +314,7 @@ void http_co_t::start_connection()
stackout();
return;
}
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
tfd->set_fd_handler(peer_fd, EPOLLIN|EPOLLOUT, [this](int peer_fd, int epoll_events)
{
this->epoll_events |= epoll_events;
handle_events();
@@ -372,7 +372,7 @@ void http_co_t::handle_connect_result()
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
tfd->set_fd_handler(peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
{
this->epoll_events |= epoll_events;
handle_events();

View File

@@ -15,6 +15,207 @@
#include "msgr_rdma.h"
#endif
#include <sys/poll.h>
#include <sys/eventfd.h>
static uint64_t one = 1;
msgr_iothread_t::msgr_iothread_t()
{
ring = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epmgr = new epoll_manager_t(ring);
submit_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
if (submit_eventfd < 0)
{
throw std::runtime_error(std::string("failed to create eventfd: ")+strerror(errno));
}
epmgr->tfd->set_fd_handler(submit_eventfd, EPOLLIN, [this](int fd, int epoll_events)
{
// Reset eventfd counter
uint64_t ctr = 0;
int r = read(submit_eventfd, &ctr, 8);
if (r < 0 && errno != EAGAIN && errno != EINTR)
{
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
}
ring->wakeup();
});
consumer.loop = [this]()
{
read_requests();
send_replies();
ring->submit();
};
ring->register_consumer(&consumer);
thread = new std::thread(&msgr_iothread_t::run, this);
}
msgr_iothread_t::~msgr_iothread_t()
{
stop();
delete thread;
delete epmgr;
delete ring;
}
void msgr_iothread_t::stop()
{
mu.lock();
if (stopped)
{
mu.unlock();
return;
}
stopped = true;
write(submit_eventfd, &one, sizeof(one));
mu.unlock();
thread->join();
ring->unregister_consumer(&consumer);
close(submit_eventfd);
}
void msgr_iothread_t::add_client(osd_client_t *cl)
{
mu.lock();
if (stopped)
{
mu.unlock();
return;
}
assert(!clients[cl->peer_fd]);
clients[cl->peer_fd] = cl;
epmgr->tfd->set_fd_handler(cl->peer_fd, EPOLLIN, [this](int peer_fd, int epoll_events)
{
// FIXME: Slight copypaste (see handle_peer_epoll)
if (epoll_events & EPOLLIN)
{
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end())
{
auto cl = cl_it->second;
cl->mu.lock();
cl->read_ready++;
if (cl->read_ready == 1)
{
read_ready_clients.push_back(peer_fd);
ring->wakeup();
}
cl->mu.unlock();
}
}
});
mu.unlock();
}
void msgr_iothread_t::remove_client(osd_client_t *cl)
{
mu.lock();
if (stopped)
{
mu.unlock();
return;
}
auto cl_it = clients.find(cl->peer_fd);
if (cl_it != clients.end() && cl_it->second == cl)
{
clients.erase(cl->peer_fd);
epmgr->tfd->set_fd_handler(cl->peer_fd, 0, NULL);
}
mu.unlock();
}
void msgr_iothread_t::wakeup_out(int peer_fd, ring_loop_t *outer_ring)
{
write_ready_mu.lock();
if (!write_ready_clients.size())
{
io_uring_sqe* sqe = outer_ring->get_sqe();
if (!sqe)
{
write(submit_eventfd, &one, sizeof(one));
}
else
{
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [](ring_data_t*){};
my_uring_prep_write(sqe, submit_eventfd, &one, sizeof(one), 0);
}
}
write_ready_clients.push_back(peer_fd);
write_ready_mu.unlock();
}
void msgr_iothread_t::read_requests()
{
// FIXME: Slight copypaste (see messenger_t::read_requests)
auto to_recv = std::move(read_ready_clients);
for (int i = 0; i < to_recv.size(); i++)
{
int peer_fd = to_recv[i];
auto cl_it = clients.find(peer_fd);
if (cl_it == clients.end())
{
continue;
}
osd_client_t *cl = cl_it->second;
cl->mu.lock();
auto ok = cl->try_recv(ring, false);
cl->mu.unlock();
if (!ok)
{
read_ready_clients.insert(read_ready_clients.end(), to_recv.begin()+i, to_recv.end());
break;
}
}
}
void msgr_iothread_t::send_replies()
{
if (stopped)
{
return;
}
write_ready_mu.lock();
auto to_send = std::move(write_ready_clients);
write_ready_mu.unlock();
for (int i = 0; i < to_send.size(); i++)
{
auto cl_it = clients.find(to_send[i]);
if (cl_it == clients.end())
{
continue;
}
auto cl = cl_it->second;
cl->mu.lock();
auto ok = cl->try_send(ring, false/*, lock*/);
cl->mu.unlock();
if (!ok)
{
// ring is full (rare but what if...)
write_ready_mu.lock();
write_ready_clients.insert(write_ready_clients.end(), to_send.begin()+i, to_send.end());
write_ready_mu.unlock();
break;
}
}
}
void msgr_iothread_t::run()
{
while (true)
{
mu.lock();
if (stopped)
{
mu.unlock();
return;
}
ring->loop();
mu.unlock();
ring->wait();
}
}
void osd_messenger_t::init()
{
#ifdef WITH_RDMA
@@ -35,7 +236,7 @@ void osd_messenger_t::init()
? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
fprintf(stderr, "[OSD %ju] RDMA initialized successfully\n", osd_num);
fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
tfd->set_fd_handler(rdma_context->channel->fd, EPOLLIN, [this](int notify_fd, int epoll_events)
{
handle_rdma_events();
});
@@ -43,6 +244,44 @@ void osd_messenger_t::init()
}
}
#endif
if (ringloop && iothread_count > 0)
{
for (int i = 0; i < iothread_count; i++)
{
auto iot = new msgr_iothread_t();
iothreads.push_back(iot);
}
immediates_eventfd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
if (immediates_eventfd < 0)
{
throw std::runtime_error(std::string("failed to create set_immediate eventfd: ")+strerror(errno));
}
tfd->set_fd_handler(immediates_eventfd, EPOLLIN, [this](int peer_fd, int epoll_events)
{
// Reset eventfd counter
uint64_t ctr = 0;
int r = read(immediates_eventfd, &ctr, 8);
if (r < 0 && errno != EAGAIN && errno != EINTR)
{
fprintf(stderr, "Error resetting eventfd: %s\n", strerror(errno));
}
while (true)
{
immediates_mu.lock();
auto to_run = std::move(immediates);
immediates_mu.unlock();
if (!to_run.size())
{
break;
}
for (auto & cb: to_run)
{
cb();
}
}
ringloop->wakeup();
});
}
keepalive_timer_id = tfd->set_timer(1000, true, [this](int)
{
auto cl_it = clients.begin();
@@ -120,6 +359,12 @@ void osd_messenger_t::init()
osd_messenger_t::~osd_messenger_t()
{
if (immediates_eventfd >= 0)
{
tfd->set_fd_handler(immediates_eventfd, 0, NULL);
close(immediates_eventfd);
immediates_eventfd = -1;
}
if (keepalive_timer_id >= 0)
{
tfd->clear_timer(keepalive_timer_id);
@@ -129,6 +374,14 @@ osd_messenger_t::~osd_messenger_t()
{
stop_client(clients.begin()->first, true, true);
}
if (iothreads.size())
{
for (auto iot: iothreads)
{
delete iot;
}
iothreads.clear();
}
#ifdef WITH_RDMA
if (rdma_context)
{
@@ -165,6 +418,10 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_max_msg = 129*1024;
this->rdma_odp = config["rdma_odp"].bool_value();
#endif
if (!osd_num)
this->iothread_count = config["client_iothread_count"].is_null() ? 4 : (uint32_t)config["client_iothread_count"].uint64_value();
else
this->iothread_count = (uint32_t)config["osd_iothread_count"].uint64_value();
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
this->receive_buffer_size = 65536;
@@ -255,6 +512,7 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
{
fprintf(stderr, "Connecting to OSD %ju at %s:%d (client %d)\n", peer_osd, peer_host, peer_port, peer_fd);
}
clients[peer_fd]->msgr = this;
clients[peer_fd]->peer_addr = addr;
clients[peer_fd]->peer_port = peer_port;
clients[peer_fd]->peer_fd = peer_fd;
@@ -262,7 +520,8 @@ void osd_messenger_t::try_connect_peer_addr(osd_num_t peer_osd, const char *peer
clients[peer_fd]->connect_timeout_id = -1;
clients[peer_fd]->osd_num = peer_osd;
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
tfd->set_fd_handler(peer_fd, true, [this](int peer_fd, int epoll_events)
clients[peer_fd]->receive_buffer_size = receive_buffer_size;
tfd->set_fd_handler(peer_fd, EPOLLIN|EPOLLOUT, [this](int peer_fd, int epoll_events)
{
// Either OUT (connected) or HUP
handle_connect_epoll(peer_fd);
@@ -303,7 +562,11 @@ void osd_messenger_t::handle_connect_epoll(int peer_fd)
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
cl->peer_state = PEER_CONNECTED;
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
if (iothreads.size())
{
iothreads[peer_fd % iothreads.size()]->add_client(cl);
}
tfd->set_fd_handler(peer_fd, iothreads.size() ? 0 : EPOLLIN, [this](int peer_fd, int epoll_events)
{
handle_peer_epoll(peer_fd, epoll_events);
});
@@ -487,7 +750,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
fprintf(stderr, "Connected to OSD %ju using RDMA\n", cl->osd_num);
}
cl->peer_state = PEER_RDMA;
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
tfd->set_fd_handler(cl->peer_fd, 0, [this](int peer_fd, int epoll_events)
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
@@ -522,13 +785,19 @@ void osd_messenger_t::accept_connections(int listen_fd)
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
clients[peer_fd] = new osd_client_t();
clients[peer_fd]->msgr = this;
clients[peer_fd]->peer_addr = addr;
clients[peer_fd]->peer_port = ntohs(((sockaddr_in*)&addr)->sin_port);
clients[peer_fd]->peer_fd = peer_fd;
clients[peer_fd]->peer_state = PEER_CONNECTED;
clients[peer_fd]->in_buf = malloc_or_die(receive_buffer_size);
clients[peer_fd]->receive_buffer_size = receive_buffer_size;
// Add FD to epoll
tfd->set_fd_handler(peer_fd, false, [this](int peer_fd, int epoll_events)
if (iothreads.size())
{
iothreads[peer_fd % iothreads.size()]->add_client(clients[peer_fd]);
}
tfd->set_fd_handler(peer_fd, iothreads.size() ? 0 : EPOLLIN, [this](int peer_fd, int epoll_events)
{
handle_peer_epoll(peer_fd, epoll_events);
});

View File

@@ -11,6 +11,7 @@
#include <map>
#include <deque>
#include <vector>
#include <mutex>
#include "malloc_or_die.h"
#include "json11/json11.hpp"
@@ -45,8 +46,13 @@ struct msgr_rdma_connection_t;
struct msgr_rdma_context_t;
#endif
struct osd_messenger_t;
struct osd_client_t
{
std::mutex mu;
osd_messenger_t *msgr = NULL;
int refs = 0;
sockaddr_storage peer_addr;
@@ -59,6 +65,7 @@ struct osd_client_t
osd_num_t osd_num = 0;
void *in_buf = NULL;
uint32_t receive_buffer_size = 0;
#ifdef WITH_RDMA
msgr_rdma_connection_t *rdma_conn = NULL;
@@ -89,6 +96,17 @@ struct osd_client_t
std::vector<msgr_sendp_t> outbox, next_outbox;
~osd_client_t();
bool try_send(ring_loop_t *ringloop, bool use_sync_send_recv);
int handle_send(int result);
bool try_recv(ring_loop_t *ringloop, bool use_sync_send_recv);
int handle_read(int result);
bool handle_read_buffer(void *curbuf, int remain);
bool handle_finished_read();
void handle_op_hdr();
bool handle_reply_hdr();
void handle_reply_ready(osd_op_t *op);
};
struct osd_wanted_peer_t
@@ -111,6 +129,53 @@ struct osd_op_stats_t
uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
};
#ifdef __MOCK__
class msgr_iothread_t;
#else
#include <thread>
#include "epoll_manager.h"
class msgr_iothread_t
{
protected:
ring_loop_t *ring = NULL;
epoll_manager_t *epmgr = NULL;
ring_consumer_t consumer;
int submit_eventfd = -1;
bool stopped = false;
std::mutex mu;
std::map<int, osd_client_t*> clients;
std::vector<int> read_ready_clients;
std::mutex write_ready_mu;
std::vector<int> write_ready_clients;
std::thread *thread = NULL;
void run();
void read_requests();
void send_replies();
public:
void handle_client_read(osd_client_t *cl, int res);
void handle_client_send(osd_client_t *cl, int res);
msgr_iothread_t();
~msgr_iothread_t();
void add_client(osd_client_t *cl);
void remove_client(osd_client_t *cl);
void wakeup_out(int peer_fd, ring_loop_t *outer_ring);
void stop();
};
#endif
struct osd_messenger_t
{
protected:
@@ -123,6 +188,7 @@ protected:
int osd_ping_timeout = 0;
int log_level = 0;
bool use_sync_send_recv = false;
int iothread_count = 0;
#ifdef WITH_RDMA
bool use_rdma = true;
@@ -134,10 +200,13 @@ protected:
bool rdma_odp = false;
#endif
std::vector<msgr_iothread_t*> iothreads;
std::vector<int> read_ready_clients;
std::vector<int> write_ready_clients;
int immediates_eventfd = -1;
std::mutex immediates_mu;
// We don't use ringloop->set_immediate here because we may have no ringloop in client :)
std::vector<std::function<void()>> set_immediate;
std::vector<std::function<void()>> immediates;
public:
timerfd_manager_t *tfd;
@@ -155,10 +224,13 @@ public:
void parse_config(const json11::Json & config);
void connect_peer(uint64_t osd_num, json11::Json peer_state);
void stop_client(int peer_fd, bool force = false, bool force_delete = false);
void stop_client_from_iothread(osd_client_t *cl);
void outbox_push(osd_op_t *cur_op);
std::function<void(osd_op_t*)> exec_op;
std::function<void(osd_num_t)> repeer_pgs;
std::function<bool(osd_client_t*, json11::Json)> check_config_hook;
void handle_client_read(osd_client_t *cl, int res);
void handle_client_send(osd_client_t *cl, int res);
void read_requests();
void send_replies();
void accept_connections(int listen_fd);
@@ -178,6 +250,9 @@ public:
void inc_op_stats(osd_op_stats_t & stats, uint64_t opcode, timespec & tv_begin, timespec & tv_end, uint64_t len);
void measure_exec(osd_op_t *cur_op);
void set_immediate(std::function<void()> cb);
void set_immediate_or_run(std::function<void()> cb);
protected:
void try_connect_peer(uint64_t osd_num);
void try_connect_peer_addr(osd_num_t peer_osd, const char *peer_host, int peer_port);
@@ -188,15 +263,7 @@ protected:
void cancel_osd_ops(osd_client_t *cl);
void cancel_op(osd_op_t *op);
bool try_send(osd_client_t *cl);
void handle_send(int result, osd_client_t *cl);
bool handle_read(int result, osd_client_t *cl);
bool handle_read_buffer(osd_client_t *cl, void *curbuf, int remain);
bool handle_finished_read(osd_client_t *cl);
void handle_op_hdr(osd_client_t *cl);
bool handle_reply_hdr(osd_client_t *cl);
void handle_reply_ready(osd_op_t *op);
void handle_immediates();
#ifdef WITH_RDMA
void try_send_rdma(osd_client_t *cl);
@@ -205,4 +272,6 @@ protected:
bool try_recv_rdma(osd_client_t *cl);
void handle_rdma_events();
#endif
friend struct osd_client_t;
};

View File

@@ -603,7 +603,7 @@ void osd_messenger_t::handle_rdma_events()
if (!is_send)
{
rc->cur_recv--;
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
if (!cl->handle_read_buffer(rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
{
// handle_read_buffer may stop the client
continue;
@@ -666,9 +666,5 @@ void osd_messenger_t::handle_rdma_events()
}
}
} while (event_count > 0);
for (auto cb: set_immediate)
{
cb();
}
set_immediate.clear();
handle_immediates();
}

View File

@@ -1,6 +1,7 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
#include <unistd.h>
#include "messenger.h"
void osd_messenger_t::read_requests()
@@ -9,63 +10,119 @@ void osd_messenger_t::read_requests()
{
int peer_fd = read_ready_clients[i];
osd_client_t *cl = clients[peer_fd];
if (cl->read_msg.msg_iovlen)
if (!cl->try_recv(ringloop, use_sync_send_recv))
{
continue;
}
if (cl->read_remaining < receive_buffer_size)
{
cl->read_iov.iov_base = cl->in_buf;
cl->read_iov.iov_len = receive_buffer_size;
cl->read_msg.msg_iov = &cl->read_iov;
cl->read_msg.msg_iovlen = 1;
}
else
{
cl->read_iov.iov_base = 0;
cl->read_iov.iov_len = cl->read_remaining;
cl->read_msg.msg_iov = cl->recv_list.get_iovec();
cl->read_msg.msg_iovlen = cl->recv_list.get_size();
}
cl->refs++;
if (ringloop && !use_sync_send_recv)
{
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
cl->read_msg.msg_iovlen = 0;
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
return;
}
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this, cl](ring_data_t *data) { handle_read(data->res, cl); };
my_uring_prep_recvmsg(sqe, peer_fd, &cl->read_msg, 0);
}
else
{
int result = recvmsg(peer_fd, &cl->read_msg, 0);
if (result < 0)
{
result = -errno;
}
handle_read(result, cl);
read_ready_clients.erase(read_ready_clients.begin(), read_ready_clients.begin() + i);
return;
}
}
read_ready_clients.clear();
if (!iothreads.size())
{
handle_immediates();
}
}
bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
bool osd_client_t::try_recv(ring_loop_t *ringloop, bool use_sync_send_recv)
{
bool ret = false;
auto cl = this;
if (cl->read_msg.msg_iovlen)
{
return true;
}
if (cl->read_remaining < cl->receive_buffer_size)
{
cl->read_iov.iov_base = cl->in_buf;
cl->read_iov.iov_len = cl->receive_buffer_size;
cl->read_msg.msg_iov = &cl->read_iov;
cl->read_msg.msg_iovlen = 1;
}
else
{
cl->read_iov.iov_base = 0;
cl->read_iov.iov_len = cl->read_remaining;
cl->read_msg.msg_iov = cl->recv_list.get_iovec();
cl->read_msg.msg_iovlen = cl->recv_list.get_size();
}
cl->refs++;
if (ringloop && !use_sync_send_recv)
{
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
cl->read_msg.msg_iovlen = 0;
return false;
}
ring_data_t* data = ((ring_data_t*)sqe->user_data);
if (msgr->iothreads.size())
{
data->callback = [this](ring_data_t *data) { msgr->iothreads[peer_fd % msgr->iothreads.size()]->handle_client_read(this, data->res); };
}
else
{
data->callback = [this](ring_data_t *data) { msgr->handle_client_read(this, data->res); };
}
my_uring_prep_recvmsg(sqe, peer_fd, &cl->read_msg, 0);
}
else
{
int result = recvmsg(peer_fd, &cl->read_msg, 0);
if (result < 0)
{
result = -errno;
}
msgr->handle_client_read(this, result);
}
return true;
}
void osd_messenger_t::handle_client_read(osd_client_t *cl, int res)
{
res = cl->handle_read(res);
if (res == -ENOENT)
{
if (!cl->refs)
delete cl;
}
else if (res == -EIO)
{
stop_client(cl->peer_fd);
}
else if (res == -EAGAIN)
{
read_ready_clients.push_back(cl->peer_fd);
}
}
void msgr_iothread_t::handle_client_read(osd_client_t *cl, int res)
{
cl->mu.lock();
res = cl->handle_read(res);
if (res == -ENOENT)
{
if (!cl->refs)
cl->msgr->set_immediate([cl]() { delete cl; });
}
cl->mu.unlock();
if (res == -EIO)
{
cl->msgr->stop_client_from_iothread(cl);
}
else if (res == -EAGAIN)
{
read_ready_clients.push_back(cl->peer_fd);
ring->wakeup();
}
}
int osd_client_t::handle_read(int result)
{
auto cl = this;
cl->read_msg.msg_iovlen = 0;
cl->refs--;
if (cl->peer_state == PEER_STOPPED)
{
if (cl->refs <= 0)
{
delete cl;
}
return false;
return -ENOENT;
}
if (result <= 0 && result != -EAGAIN && result != -EINTR)
{
@@ -74,27 +131,14 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
{
fprintf(stderr, "Client %d socket read error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
}
stop_client(cl->peer_fd);
return false;
}
if (result == -EAGAIN || result == -EINTR || result < cl->read_iov.iov_len)
{
cl->read_ready--;
if (cl->read_ready > 0)
read_ready_clients.push_back(cl->peer_fd);
}
else
{
read_ready_clients.push_back(cl->peer_fd);
return -EIO;
}
int expected = cl->read_iov.iov_len;
if (result > 0)
{
if (cl->read_iov.iov_base == cl->in_buf)
{
if (!handle_read_buffer(cl, cl->in_buf, result))
{
goto fin;
}
handle_read_buffer(cl->in_buf, result);
}
else
{
@@ -103,28 +147,25 @@ bool osd_messenger_t::handle_read(int result, osd_client_t *cl)
cl->recv_list.eat(result);
if (cl->recv_list.done >= cl->recv_list.count)
{
if (!handle_finished_read(cl))
{
goto fin;
}
handle_finished_read();
}
}
if (result >= cl->read_iov.iov_len)
{
ret = true;
}
}
fin:
for (auto cb: set_immediate)
if (result == -EAGAIN || result == -EINTR || result < expected)
{
cb();
cl->read_ready--;
assert(cl->read_ready >= 0);
}
set_immediate.clear();
return ret;
if (cl->read_ready > 0)
{
return -EAGAIN;
}
return 0;
}
bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int remain)
bool osd_client_t::handle_read_buffer(void *curbuf, int remain)
{
auto cl = this;
// Compose operation(s) from the buffer
while (remain > 0)
{
@@ -160,7 +201,7 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
}
if (cl->recv_list.done >= cl->recv_list.count)
{
if (!handle_finished_read(cl))
if (!handle_finished_read())
{
return false;
}
@@ -169,19 +210,20 @@ bool osd_messenger_t::handle_read_buffer(osd_client_t *cl, void *curbuf, int rem
return true;
}
bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
bool osd_client_t::handle_finished_read()
{
auto cl = this;
cl->recv_list.reset();
if (cl->read_state == CL_READ_HDR)
{
if (cl->read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
return handle_reply_hdr(cl);
return handle_reply_hdr();
else if (cl->read_op->req.hdr.magic == SECONDARY_OSD_OP_MAGIC)
handle_op_hdr(cl);
handle_op_hdr();
else
{
fprintf(stderr, "Received garbage: magic=%jx id=%ju opcode=%jx from %d\n", cl->read_op->req.hdr.magic, cl->read_op->req.hdr.id, cl->read_op->req.hdr.opcode, cl->peer_fd);
stop_client(cl->peer_fd);
msgr->stop_client_from_iothread(cl);
return false;
}
}
@@ -189,7 +231,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
{
// Operation is ready
cl->received_ops.push_back(cl->read_op);
set_immediate.push_back([this, op = cl->read_op]() { exec_op(op); });
msgr->set_immediate([msgr = this->msgr, op = cl->read_op, cl]() { msgr->exec_op(op); });
cl->read_op = NULL;
cl->read_state = 0;
}
@@ -207,8 +249,9 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
return true;
}
void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
void osd_client_t::handle_op_hdr()
{
auto cl = this;
osd_op_t *cur_op = cl->read_op;
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
{
@@ -285,20 +328,21 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{
// Operation is ready
cl->received_ops.push_back(cur_op);
set_immediate.push_back([this, cur_op]() { exec_op(cur_op); });
msgr->set_immediate([msgr = this->msgr, cur_op, cl]() { msgr->exec_op(cur_op); });
cl->read_op = NULL;
cl->read_state = 0;
}
}
bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
bool osd_client_t::handle_reply_hdr()
{
auto cl = this;
auto req_it = cl->sent_ops.find(cl->read_op->req.hdr.id);
if (req_it == cl->sent_ops.end())
{
// Command out of sync. Drop connection
fprintf(stderr, "Client %d command out of sync: id %ju\n", cl->peer_fd, cl->read_op->req.hdr.id);
stop_client(cl->peer_fd);
msgr->stop_client_from_iothread(cl);
return false;
}
osd_op_t *op = req_it->second;
@@ -315,7 +359,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
fprintf(stderr, "Client %d read reply of different length: expected %u+%u, got %jd+%u\n",
cl->peer_fd, expected_size, op->bitmap_len, op->reply.hdr.retval, bmp_len);
cl->sent_ops[op->req.hdr.id] = op;
stop_client(cl->peer_fd);
msgr->stop_client_from_iothread(cl);
return false;
}
if (bmp_len > 0)
@@ -391,24 +435,92 @@ reuse:
return true;
}
void osd_messenger_t::handle_reply_ready(osd_op_t *op)
void osd_client_t::handle_reply_ready(osd_op_t *op)
{
// Measure subop latency
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
stats.subop_stat_count[op->req.hdr.opcode]++;
if (!stats.subop_stat_count[op->req.hdr.opcode])
msgr->set_immediate([msgr = this->msgr, op, cl = this]()
{
// Measure subop latency
auto & stats = msgr->stats;
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
stats.subop_stat_count[op->req.hdr.opcode]++;
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
}
stats.subop_stat_sum[op->req.hdr.opcode] += (
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
);
set_immediate.push_back([op]()
{
if (!stats.subop_stat_count[op->req.hdr.opcode])
{
stats.subop_stat_count[op->req.hdr.opcode]++;
stats.subop_stat_sum[op->req.hdr.opcode] = 0;
}
stats.subop_stat_sum[op->req.hdr.opcode] += (
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
);
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
});
}
static uint64_t one = 1;
void osd_messenger_t::set_immediate(std::function<void()> cb/*, ring_loop_t *ringloop*/)
{
if (!iothreads.size())
{
immediates.push_back(cb);
return;
}
immediates_mu.lock();
bool wakeup_main_thread = !immediates.size();
immediates.push_back(cb);
immediates_mu.unlock();
if (wakeup_main_thread)
{
// io_uring_sqe* sqe = ringloop ? ringloop->get_sqe() : NULL;
// if (!sqe)
// {
write(immediates_eventfd, &one, sizeof(one));
// FIXME: Can't use ringloop here, oops
// }
// else
// {
// ring_data_t* data = ((ring_data_t*)sqe->user_data);
// data->callback = [](ring_data_t*){};
// my_uring_prep_write(sqe, immediates_eventfd, &one, sizeof(one), 0);
// }
}
}
void osd_messenger_t::set_immediate_or_run(std::function<void()> cb/*, ring_loop_t *ringloop*/)
{
if (!iothreads.size())
{
cb();
return;
}
immediates_mu.lock();
bool wakeup_main_thread = !immediates.size();
immediates.push_back(cb);
immediates_mu.unlock();
if (wakeup_main_thread)
{
// io_uring_sqe* sqe = ringloop ? ringloop->get_sqe() : NULL;
// if (!sqe)
// {
write(immediates_eventfd, &one, sizeof(one));
// FIXME: Can't use ringloop here, oops
// }
// else
// {
// ring_data_t* data = ((ring_data_t*)sqe->user_data);
// data->callback = [](ring_data_t*){};
// my_uring_prep_write(sqe, immediates_eventfd, &one, sizeof(one), 0);
// }
}
}
void osd_messenger_t::handle_immediates()
{
auto to_run = std::move(immediates);
for (auto & cb: to_run)
{
cb();
}
}

View File

@@ -15,10 +15,17 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
{
clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
}
else
else if (cur_op->op_type == OSD_OP_IN)
{
measure_exec(cur_op);
}
if (iothreads.size())
{
cl->mu.lock();
}
if (cur_op->op_type == OSD_OP_IN)
{
// Check that operation actually belongs to this client
// FIXME: Review if this is still needed
bool found = false;
for (auto it = cl->received_ops.begin(); it != cl->received_ops.end(); it++)
{
@@ -32,6 +39,10 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
if (!found)
{
delete cur_op;
if (iothreads.size())
{
cl->mu.unlock();
}
return;
}
}
@@ -39,7 +50,6 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
auto & to_outbox = cl->write_msg.msg_iovlen ? cl->next_outbox : cl->outbox;
if (cur_op->op_type == OSD_OP_IN)
{
measure_exec(cur_op);
to_send_list.push_back((iovec){ .iov_base = cur_op->reply.buf, .iov_len = OSD_PACKET_SIZE });
}
else
@@ -108,21 +118,36 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
#ifdef WITH_RDMA
if (cl->peer_state == PEER_RDMA)
{
if (iothreads.size())
{
cl->mu.unlock();
}
try_send_rdma(cl);
return;
}
#endif
if (!ringloop)
if (iothreads.size())
{
int should_wakeup = !cl->write_msg.msg_iovlen && !cl->write_state;
cl->write_state = CL_WRITE_READY;
cl->mu.unlock();
if (should_wakeup)
{
auto iot = iothreads[cl->peer_fd % iothreads.size()];
iot->wakeup_out(cl->peer_fd, ringloop);
}
}
else if (!ringloop)
{
// FIXME: It's worse because it doesn't allow batching
while (cl->outbox.size())
{
try_send(cl);
cl->try_send(NULL, true);
}
}
else
{
if ((cl->write_msg.msg_iovlen > 0 || !try_send(cl)) && (cl->write_state == 0))
if ((cl->write_msg.msg_iovlen > 0 || !cl->try_send(ringloop, use_sync_send_recv)) && (cl->write_state == 0))
{
cl->write_state = CL_WRITE_READY;
write_ready_clients.push_back(cur_op->peer_fd);
@@ -180,8 +205,9 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op)
}
}
bool osd_messenger_t::try_send(osd_client_t *cl)
bool osd_client_t::try_send(ring_loop_t *ringloop, bool use_sync_send_recv)
{
auto cl = this;
int peer_fd = cl->peer_fd;
if (!cl->send_list.size() || cl->write_msg.msg_iovlen > 0)
{
@@ -198,7 +224,14 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
cl->refs++;
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
if (msgr->iothreads.size())
{
data->callback = [this](ring_data_t *data) { msgr->iothreads[this->peer_fd % msgr->iothreads.size()]->handle_client_send(this, data->res); };
}
else
{
data->callback = [this](ring_data_t *data) { msgr->handle_client_send(this, data->res); };
}
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
}
else
@@ -211,18 +244,68 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
{
result = -errno;
}
handle_send(result, cl);
msgr->handle_client_send(this, result);
}
return true;
}
void osd_messenger_t::handle_client_send(osd_client_t *cl, int res)
{
res = cl->handle_send(res);
if (res == -ENOENT)
{
if (!cl->refs)
delete cl;
}
else if (res == -EIO)
{
stop_client(cl->peer_fd);
}
else if (res == -EAGAIN)
{
write_ready_clients.push_back(cl->peer_fd);
}
}
void msgr_iothread_t::handle_client_send(osd_client_t *cl, int res)
{
cl->mu.lock();
res = cl->handle_send(res);
if (res == -ENOENT)
{
if (!cl->refs)
cl->msgr->set_immediate([cl]() { delete cl; });
}
cl->mu.unlock();
if (res == -EIO)
{
cl->msgr->stop_client_from_iothread(cl);
}
else if (res == -EAGAIN)
{
write_ready_mu.lock();
write_ready_clients.push_back(cl->peer_fd);
write_ready_mu.unlock();
ring->wakeup();
}
}
void osd_messenger_t::send_replies()
{
if (iothreads.size())
{
return;
}
for (int i = 0; i < write_ready_clients.size(); i++)
{
int peer_fd = write_ready_clients[i];
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end() && !try_send(cl_it->second))
if (cl_it == clients.end())
{
continue;
}
auto cl = cl_it->second;
if (!cl->try_send(ringloop, use_sync_send_recv))
{
write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
return;
@@ -231,24 +314,20 @@ void osd_messenger_t::send_replies()
write_ready_clients.clear();
}
void osd_messenger_t::handle_send(int result, osd_client_t *cl)
int osd_client_t::handle_send(int result)
{
auto cl = this;
cl->write_msg.msg_iovlen = 0;
cl->refs--;
if (cl->peer_state == PEER_STOPPED)
{
if (cl->refs <= 0)
{
delete cl;
}
return;
return -ENOENT;
}
if (result < 0 && result != -EAGAIN && result != -EINTR)
{
// this is a client socket, so don't panic. just disconnect it
fprintf(stderr, "Client %d socket write error: %d (%s). Disconnecting client\n", cl->peer_fd, -result, strerror(-result));
stop_client(cl->peer_fd);
return;
return -EIO;
}
if (result >= 0)
{
@@ -261,7 +340,7 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
if (cl->outbox[done].flags & MSGR_SENDP_FREE)
{
// Reply fully sent
delete cl->outbox[done].op;
msgr->set_immediate_or_run([op = cl->outbox[done].op] { delete op; });
}
result -= iov.iov_len;
done++;
@@ -291,26 +370,35 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
{
// FIXME: Do something better than just forgetting the FD
// FIXME: Ignore pings during RDMA state transition
if (log_level > 0)
{
fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
}
cl->peer_state = PEER_RDMA;
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
msgr->set_immediate_or_run([cl = this, msgr = this->msgr, peer_fd = this->peer_fd]()
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
auto cl_it = msgr->clients.find(peer_fd);
if (cl_it == msgr->clients.end() || cl_it->second != cl)
{
handle_peer_epoll(peer_fd, epoll_events);
return;
}
if (msgr->log_level > 0)
{
fprintf(stderr, "Successfully connected with client %d using RDMA\n", peer_fd);
}
msgr->tfd->set_fd_handler(peer_fd, 0, [msgr](int peer_fd, int epoll_events)
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
{
msgr->handle_peer_epoll(peer_fd, epoll_events);
}
});
// Add the initial receive request
msgr->try_recv_rdma(cl);
});
// Add the initial receive request
try_recv_rdma(cl);
}
#endif
}
if (cl->write_state != 0)
{
write_ready_clients.push_back(cl->peer_fd);
return -EAGAIN;
}
return 0;
}

View File

@@ -11,6 +11,7 @@
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
{
cl->mu.lock();
std::vector<osd_op_t*> cancel_ops;
cancel_ops.resize(cl->sent_ops.size());
int i = 0;
@@ -20,6 +21,7 @@ void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
}
cl->sent_ops.clear();
cl->outbox.clear();
cl->mu.unlock();
for (auto op: cancel_ops)
{
cancel_op(op);
@@ -53,8 +55,10 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
return;
}
osd_client_t *cl = it->second;
cl->mu.lock();
if (cl->peer_state == PEER_CONNECTING && !force || cl->peer_state == PEER_STOPPED)
{
cl->mu.unlock();
return;
}
if (log_level > 0)
@@ -71,6 +75,7 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
// First set state to STOPPED so another stop_client() call doesn't try to free it again
cl->refs++;
cl->peer_state = PEER_STOPPED;
cl->mu.unlock();
if (cl->osd_num)
{
// ...and forget OSD peer
@@ -78,7 +83,11 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
}
#ifndef __MOCK__
// Then remove FD from the eventloop so we don't accidentally read something
tfd->set_fd_handler(peer_fd, false, NULL);
tfd->set_fd_handler(peer_fd, 0, NULL);
if (iothreads.size())
{
iothreads[peer_fd % iothreads.size()]->remove_client(cl);
}
if (cl->connect_timeout_id >= 0)
{
tfd->clear_timer(cl->connect_timeout_id);
@@ -108,17 +117,24 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
repeer_pgs(cl->osd_num);
}
// Then cancel all operations
cl->mu.lock();
if (cl->read_op)
{
if (!cl->read_op->callback)
auto op = cl->read_op;
cl->read_op = NULL;
cl->mu.unlock();
if (!op->callback)
{
delete cl->read_op;
delete op;
}
else
{
cancel_op(cl->read_op);
cancel_op(op);
}
cl->read_op = NULL;
}
else
{
cl->mu.unlock();
}
if (cl->osd_num)
{
@@ -131,11 +147,32 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
{
clients.erase(it);
}
cl->mu.lock();
cl->refs--;
if (cl->refs <= 0 || force_delete)
{
cl->mu.unlock();
delete cl;
}
else
cl->mu.unlock();
}
void osd_messenger_t::stop_client_from_iothread(osd_client_t *cl)
{
if (!iothreads.size())
{
stop_client(cl->peer_fd);
return;
}
set_immediate([this, cl, peer_fd = cl->peer_fd]()
{
auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end() && cl_it->second == cl)
{
stop_client(peer_fd);
}
});
}
osd_client_t::~osd_client_t()

View File

@@ -655,7 +655,7 @@ help:
ringloop->register_consumer(&consumer);
// Add FD to epoll
bool stop = false;
epmgr->tfd->set_fd_handler(sockfd[0], false, [this, &stop](int peer_fd, int epoll_events)
epmgr->tfd->set_fd_handler(sockfd[0], EPOLLIN, [this, &stop](int peer_fd, int epoll_events)
{
if (epoll_events & EPOLLRDHUP)
{

View File

@@ -12,6 +12,7 @@ add_library(vitastor_cli STATIC
cli_ls.cpp
cli_create.cpp
cli_modify.cpp
cli_osd_tree.cpp
cli_flatten.cpp
cli_merge.cpp
cli_rm_data.cpp

View File

@@ -118,6 +118,12 @@ static const char* help_text =
" With --dry-run only checks if deletion is possible without data loss and\n"
" redundancy degradation.\n"
"\n"
"vitastor-cli osd-tree\n"
" Show current OSD tree.\n"
"\n"
"vitastor-cli osds|ls-osd|osd-ls\n"
" Show current OSDs as list.\n"
"\n"
"vitastor-cli create-pool|pool-create <name> (-s <pg_size>|--ec <N>+<K>) -n <pg_count> [OPTIONS]\n"
" Create a pool. Required parameters:\n"
" -s|--pg_size R Number of replicas for replicated pools\n"
@@ -389,6 +395,17 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
// Allocate a new OSD number
action_cb = p->start_alloc_osd(cfg);
}
else if (cmd[0] == "osd-tree")
{
// Print OSD tree
action_cb = p->start_osd_tree(cfg);
}
else if (cmd[0] == "osds" || cmd[0] == "ls-osds" || cmd[0] == "ls-osd" || cmd[0] == "osd-ls")
{
// Print OSD list
cfg["flat"] = true;
action_cb = p->start_osd_tree(cfg);
}
else if (cmd[0] == "create-pool" || cmd[0] == "pool-create")
{
// Create a new pool

View File

@@ -7,6 +7,7 @@
#include "json11/json11.hpp"
#include "object_id.h"
#include "osd_id.h"
#include "ringloop.h"
#include <functional>
@@ -56,27 +57,31 @@ public:
friend struct snap_flattener_t;
friend struct snap_remover_t;
std::function<bool(cli_result_t &)> start_status(json11::Json);
std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json);
std::function<bool(cli_result_t &)> start_create(json11::Json);
std::function<bool(cli_result_t &)> start_describe(json11::Json);
std::function<bool(cli_result_t &)> start_fix(json11::Json);
std::function<bool(cli_result_t &)> start_ls(json11::Json);
std::function<bool(cli_result_t &)> start_create(json11::Json);
std::function<bool(cli_result_t &)> start_modify(json11::Json);
std::function<bool(cli_result_t &)> start_rm_data(json11::Json);
std::function<bool(cli_result_t &)> start_merge(json11::Json);
std::function<bool(cli_result_t &)> start_flatten(json11::Json);
std::function<bool(cli_result_t &)> start_rm(json11::Json);
std::function<bool(cli_result_t &)> start_rm_osd(json11::Json cfg);
std::function<bool(cli_result_t &)> start_alloc_osd(json11::Json cfg);
std::function<bool(cli_result_t &)> start_ls(json11::Json);
std::function<bool(cli_result_t &)> start_merge(json11::Json);
std::function<bool(cli_result_t &)> start_modify(json11::Json);
std::function<bool(cli_result_t &)> start_osd_tree(json11::Json);
std::function<bool(cli_result_t &)> start_pool_create(json11::Json);
std::function<bool(cli_result_t &)> start_pool_modify(json11::Json);
std::function<bool(cli_result_t &)> start_pool_rm(json11::Json);
std::function<bool(cli_result_t &)> start_pool_ls(json11::Json);
std::function<bool(cli_result_t &)> start_rm(json11::Json);
std::function<bool(cli_result_t &)> start_rm_data(json11::Json);
std::function<bool(cli_result_t &)> start_rm_osd(json11::Json);
std::function<bool(cli_result_t &)> start_status(json11::Json);
// Should be called like loop_and_wait(start_status(), <completion callback>)
void loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std::function<void(const cli_result_t &)> complete_cb);
void etcd_txn(json11::Json txn);
void iterate_kvs_1(json11::Json kvs, const std::string & prefix, std::function<void(uint64_t num, json11::Json)> cb);
void iterate_kvs_2(json11::Json kvs, const std::string & prefix, std::function<void(pool_id_t pool_id, uint64_t num, json11::Json)> cb);
};
std::string print_table(json11::Json items, json11::Json header, bool use_esc);

View File

@@ -72,19 +72,10 @@ struct alloc_osd_t
if (!parent->etcd_result["succeeded"].bool_value())
{
std::vector<osd_num_t> used;
for (auto kv: parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items())
parent->iterate_kvs_1(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/osd/stats/", [&](uint64_t cur_osd, json11::Json value)
{
std::string key = base64_decode(kv["key"].string_value());
osd_num_t cur_osd;
char null_byte = 0;
int scanned = sscanf(key.c_str() + parent->cli->st_cli.etcd_prefix.length(), "/osd/stats/%ju%c", &cur_osd, &null_byte);
if (scanned != 1 || !cur_osd)
{
fprintf(stderr, "Invalid key in etcd: %s\n", key.c_str());
continue;
}
used.push_back(cur_osd);
}
});
std::sort(used.begin(), used.end());
if (used[used.size()-1] == used.size())
{

View File

@@ -165,3 +165,43 @@ void cli_tool_t::loop_and_wait(std::function<bool(cli_result_t &)> loop_cb, std:
ringloop->wakeup();
});
}
void cli_tool_t::iterate_kvs_1(json11::Json kvs, const std::string & prefix, std::function<void(uint64_t, json11::Json)> cb)
{
bool is_pool = prefix == "/pool/stats/";
for (auto & kv_item: kvs.array_items())
{
auto kv = cli->st_cli.parse_etcd_kv(kv_item);
uint64_t num = 0;
char null_byte = 0;
// OSD or pool number
int scanned = sscanf(kv.key.substr(cli->st_cli.etcd_prefix.size() + prefix.size()).c_str(), "%ju%c", &num, &null_byte);
if (scanned != 1 || !num || is_pool && num >= POOL_ID_MAX)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
cb(num, kv.value);
}
}
void cli_tool_t::iterate_kvs_2(json11::Json kvs, const std::string & prefix, std::function<void(pool_id_t pool_id, uint64_t num, json11::Json)> cb)
{
bool is_inode = prefix == "/config/inode/" || prefix == "/inode/stats/";
for (auto & kv_item: kvs.array_items())
{
auto kv = cli->st_cli.parse_etcd_kv(kv_item);
pool_id_t pool_id = 0;
uint64_t num = 0;
char null_byte = 0;
// pool+pg or pool+inode
int scanned = sscanf(kv.key.substr(cli->st_cli.etcd_prefix.size() + prefix.size()).c_str(),
"%u/%ju%c", &pool_id, &num, &null_byte);
if (scanned != 2 || !pool_id || is_inode && INODE_POOL(num) || !is_inode && num >= UINT32_MAX)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
cb(pool_id, num, kv.value);
}
}

View File

@@ -479,10 +479,14 @@ struct snap_merger_t
{
if (op->retval != op->len)
{
rwo->error_code = -op->retval;
rwo->error_code = op->retval;
rwo->error_offset = op->offset;
rwo->error_read = true;
}
else
{
rwo->error_code = 0;
}
continue_rwo.push_back(rwo);
parent->ringloop->wakeup();
};
@@ -553,12 +557,15 @@ struct snap_merger_t
if (use_cas && subop->retval == -EINTR)
{
// CAS failure - reread and repeat optimistically
assert(rwo->todo == 1); // initial refcount from read_and_write
rwo->error_code = -EINTR;
rwo->start = rwo->end = 0;
rwo->op.version = 0;
rwo_read(rwo);
delete subop;
return;
}
rwo->error_code = -subop->retval;
rwo->error_code = subop->retval;
rwo->error_offset = subop->offset;
rwo->error_read = false;
}
@@ -633,7 +640,7 @@ struct snap_merger_t
{
char buf[1024];
snprintf(buf, 1024, "Error %s target at offset %jx: %s",
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(rwo->error_code));
rwo->error_read ? "reading" : "writing", rwo->error_offset, strerror(-rwo->error_code));
rwo_error = std::string(buf);
}
delete rwo;

377
src/cmd/cli_osd_tree.cpp Normal file
View File

@@ -0,0 +1,377 @@
// Copyright (c) Vitaliy Filippov, 2024
// License: VNPL-1.1 (see README.md for details)
#include <ctype.h>
#include "cli.h"
#include "cluster_client.h"
#include "epoll_manager.h"
#include "pg_states.h"
#include "str_util.h"
struct placement_osd_t
{
osd_num_t num;
std::string parent;
std::vector<std::string> tags;
uint64_t size;
uint64_t free;
bool up;
double reweight;
uint32_t block_size, bitmap_granularity, immediate_commit;
};
struct placement_node_t
{
std::string name;
std::string parent;
std::string level;
std::vector<std::string> child_nodes;
std::vector<osd_num_t> child_osds;
};
struct placement_tree_t
{
std::map<std::string, placement_node_t> nodes;
std::map<osd_num_t, placement_osd_t> osds;
};
struct osd_tree_printer_t
{
cli_tool_t *parent;
json11::Json cfg;
bool flat = false;
bool show_stats = false;
int state = 0;
cli_result_t result;
json11::Json node_placement;
std::map<uint64_t, json11::Json> osd_config;
std::map<uint64_t, json11::Json> osd_stats;
std::shared_ptr<placement_tree_t> placement_tree;
bool is_done() { return state == 100; }
void load_osd_tree()
{
if (state == 1)
goto resume_1;
parent->etcd_txn(json11::Json::object {
{ "success", json11::Json::array {
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/node_placement") },
} },
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd/") },
{ "range_end", base64_encode(parent->cli->st_cli.etcd_prefix+"/config/osd0") },
} },
},
json11::Json::object {
{ "request_range", json11::Json::object {
{ "key", base64_encode(parent->cli->st_cli.etcd_prefix+"/osd/stats/") },
{ "range_end", base64_encode(parent->cli->st_cli.etcd_prefix+"/osd/stats0") },
} },
},
} },
});
state = 1;
resume_1:
if (parent->waiting > 0)
return;
if (parent->etcd_err.err)
{
result = parent->etcd_err;
state = 100;
return;
}
for (auto & item: parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items())
{
node_placement = parent->cli->st_cli.parse_etcd_kv(item).value;
}
parent->iterate_kvs_1(parent->etcd_result["responses"][1]["response_range"]["kvs"], "/config/osd/", [&](uint64_t cur_osd, json11::Json value)
{
osd_config[cur_osd] = value;
});
parent->iterate_kvs_1(parent->etcd_result["responses"][2]["response_range"]["kvs"], "/osd/stats/", [&](uint64_t cur_osd, json11::Json value)
{
osd_stats[cur_osd] = value;
});
placement_tree = make_osd_tree(node_placement, osd_config, osd_stats);
}
std::shared_ptr<placement_tree_t> make_osd_tree(json11::Json node_placement_json,
std::map<uint64_t, json11::Json> osd_config, std::map<uint64_t, json11::Json> osd_stats)
{
auto node_placement = node_placement_json.object_items();
auto tree = std::make_shared<placement_tree_t>();
tree->nodes[""] = (placement_node_t){};
// Add non-OSD items
for (auto & kv: node_placement)
{
auto osd_num = stoull_full(kv.first);
if (!osd_num)
{
auto level = kv.second["level"].string_value();
tree->nodes[kv.first] = (placement_node_t){
.name = kv.first,
.parent = kv.second["parent"].string_value(),
.level = level == "" ? "unknown" : level,
};
}
}
// Add OSDs
for (auto & kv: osd_stats)
{
auto & osd = tree->osds[kv.first] = (placement_osd_t){
.num = kv.first,
.parent = kv.second["host"].string_value(),
.size = kv.second["size"].uint64_value(),
.free = kv.second["free"].uint64_value(),
.up = parent->cli->st_cli.peer_states.find(kv.first) != parent->cli->st_cli.peer_states.end(),
.reweight = 1,
.block_size = (uint32_t)kv.second["data_block_size"].uint64_value(),
.bitmap_granularity = (uint32_t)kv.second["bitmap_granularity"].uint64_value(),
.immediate_commit = etcd_state_client_t::parse_immediate_commit(kv.second["immediate_commit"].string_value()),
};
if (tree->nodes.find(osd.parent) == tree->nodes.end())
{
// Autocreate all hosts
tree->nodes[osd.parent] = (placement_node_t){
.name = osd.parent,
.level = "host",
};
}
auto cfg_it = osd_config.find(osd.num);
if (cfg_it != osd_config.end())
{
auto & osd_cfg = cfg_it->second;
osd.reweight = osd_cfg["reweight"].is_number() ? osd_cfg["reweight"].number_value() : 1;
if (osd_cfg["tags"].is_array())
{
for (auto & jtag: osd_cfg["tags"].array_items())
osd.tags.push_back(jtag.string_value());
}
}
auto np_it = node_placement.find(std::to_string(osd.num));
if (np_it != node_placement.end())
{
osd.parent = np_it->second["parent"].string_value();
}
tree->nodes[osd.parent].child_osds.push_back(osd.num);
}
// Fill child_nodes
for (auto & ip: tree->nodes)
{
if (tree->nodes.find(ip.second.parent) == tree->nodes.end())
{
ip.second.parent = "";
}
if (ip.first != "")
{
tree->nodes[ip.second.parent].child_nodes.push_back(ip.first);
}
}
// FIXME: Maybe filter out loops here
return tree;
}
std::string format_tree()
{
std::vector<std::string> node_seq = { "" };
std::vector<int> indents = { -1 };
std::map<std::string, bool> seen;
for (int i = 0; i < node_seq.size(); i++)
{
if (seen[node_seq[i]])
{
continue;
}
seen[node_seq[i]] = true;
auto & child_nodes = placement_tree->nodes.at(node_seq[i]).child_nodes;
if (child_nodes.size())
{
node_seq.insert(node_seq.begin()+i+1, child_nodes.begin(), child_nodes.end());
indents.insert(indents.begin()+i+1, child_nodes.size(), indents[i]+1);
}
}
json11::Json::array fmt_items;
for (int i = 1; i < node_seq.size(); i++)
{
auto & node = placement_tree->nodes.at(node_seq[i]);
if (!flat)
{
fmt_items.push_back(json11::Json::object{
{ "type", str_repeat(" ", indents[i]) + node.level },
{ "name", node.name },
});
}
std::string parent = node.name;
if (flat)
{
auto cur = &placement_tree->nodes.at(node.name);
while (cur->parent != "" && cur->parent != node.name)
{
parent = cur->parent+"/"+parent;
cur = &placement_tree->nodes.at(cur->parent);
}
}
for (uint64_t osd_num: node.child_osds)
{
auto & osd = placement_tree->osds.at(osd_num);
auto fmt = json11::Json::object{
{ "type", (flat ? "osd" : str_repeat(" ", indents[i]+1) + "osd") },
{ "name", osd.num },
{ "parent", parent },
{ "up", osd.up ? "up" : "down" },
{ "size", format_size(osd.size, false, true) },
{ "used", format_q(100.0*(osd.size - osd.free)/osd.size)+" %" },
{ "reweight", format_q(osd.reweight) },
{ "tags", implode(",", osd.tags) },
{ "block", format_size(osd.block_size, false, true) },
{ "bitmap", format_size(osd.bitmap_granularity, false, true) },
{ "commit", osd.immediate_commit == IMMEDIATE_NONE ? "none" : (osd.immediate_commit == IMMEDIATE_ALL ? "all" : "small") },
};
if (show_stats)
{
auto op_stat = osd_stats[osd_num]["op_stats"];
fmt["read_bw"] = format_size(op_stat["primary_read"]["bps"].uint64_value())+"/s";
fmt["write_bw"] = format_size(op_stat["primary_write"]["bps"].uint64_value())+"/s";
fmt["delete_bw"] = format_size(op_stat["primary_delete"]["bps"].uint64_value())+"/s";
fmt["read_iops"] = format_q(op_stat["primary_read"]["iops"].uint64_value());
fmt["write_iops"] = format_q(op_stat["primary_write"]["iops"].uint64_value());
fmt["delete_iops"] = format_q(op_stat["primary_delete"]["iops"].uint64_value());
fmt["read_lat"] = format_lat(op_stat["primary_read"]["lat"].uint64_value());
fmt["write_lat"] = format_lat(op_stat["primary_write"]["lat"].uint64_value());
fmt["delete_lat"] = format_lat(op_stat["primary_delete"]["lat"].uint64_value());
}
fmt_items.push_back(std::move(fmt));
}
}
json11::Json::array cols;
if (!flat)
{
cols.push_back(json11::Json::object{
{ "key", "type" },
{ "title", "TYPE" },
});
}
cols.push_back(json11::Json::object{
{ "key", "name" },
{ "title", flat ? "OSD" : "NAME" },
});
if (flat)
{
cols.push_back(json11::Json::object{
{ "key", "parent" },
{ "title", "PARENT" },
});
}
cols.push_back(json11::Json::object{
{ "key", "up" },
{ "title", "UP" },
});
cols.push_back(json11::Json::object{
{ "key", "size" },
{ "title", "SIZE" },
});
cols.push_back(json11::Json::object{
{ "key", "used" },
{ "title", "USED%" },
});
cols.push_back(json11::Json::object{
{ "key", "tags" },
{ "title", "TAGS" },
});
cols.push_back(json11::Json::object{
{ "key", "reweight" },
{ "title", "WEIGHT" },
});
cols.push_back(json11::Json::object{
{ "key", "block" },
{ "title", "BLOCK" },
});
cols.push_back(json11::Json::object{
{ "key", "bitmap" },
{ "title", "BITMAP" },
});
cols.push_back(json11::Json::object{
{ "key", "commit" },
{ "title", "IMM" },
});
if (show_stats)
{
cols.push_back(json11::Json::object{
{ "key", "read_bw" },
{ "title", "READ" },
});
cols.push_back(json11::Json::object{
{ "key", "read_iops" },
{ "title", "IOPS" },
});
cols.push_back(json11::Json::object{
{ "key", "read_lat" },
{ "title", "LAT" },
});
cols.push_back(json11::Json::object{
{ "key", "write_bw" },
{ "title", "WRITE" },
});
cols.push_back(json11::Json::object{
{ "key", "write_iops" },
{ "title", "IOPS" },
});
cols.push_back(json11::Json::object{
{ "key", "write_lat" },
{ "title", "LAT" },
});
cols.push_back(json11::Json::object{
{ "key", "delete_bw" },
{ "title", "DEL" },
});
cols.push_back(json11::Json::object{
{ "key", "delete_iops" },
{ "title", "IOPS" },
});
cols.push_back(json11::Json::object{
{ "key", "delete_lat" },
{ "title", "LAT" },
});
}
return print_table(fmt_items, cols, parent->color);
}
void loop()
{
if (state == 1)
goto resume_1;
resume_1:
load_osd_tree();
if (parent->waiting > 0)
return;
result.text = format_tree();
state = 100;
}
};
std::function<bool(cli_result_t &)> cli_tool_t::start_osd_tree(json11::Json cfg)
{
auto osd_tree_printer = new osd_tree_printer_t();
osd_tree_printer->parent = this;
osd_tree_printer->cfg = cfg;
osd_tree_printer->flat = cfg["flat"].bool_value();
osd_tree_printer->show_stats = cfg["long"].bool_value();
return [osd_tree_printer](cli_result_t & result)
{
osd_tree_printer->loop();
if (osd_tree_printer->is_done())
{
result = osd_tree_printer->result;
delete osd_tree_printer;
return true;
}
return false;
};
}

View File

@@ -104,37 +104,16 @@ resume_1:
{
config_pools = parent->cli->st_cli.parse_etcd_kv(config_pools).value;
}
for (auto & kv_item: space_info["responses"][0]["response_range"]["kvs"].array_items())
parent->iterate_kvs_1(space_info["responses"][0]["response_range"]["kvs"], "/pool/stats/", [&](uint64_t pool_id, json11::Json value)
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// pool ID
pool_id_t pool_id;
char null_byte = 0;
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/pool/stats/%u%c", &pool_id, &null_byte);
if (scanned != 1 || !pool_id || pool_id >= POOL_ID_MAX)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
// pool/stats/<N>
pool_stats[pool_id] = kv.value.object_items();
}
pool_stats[pool_id] = value.object_items();
});
std::map<pool_id_t, uint64_t> osd_free;
for (auto & kv_item: space_info["responses"][1]["response_range"]["kvs"].array_items())
parent->iterate_kvs_1(space_info["responses"][1]["response_range"]["kvs"], "/osd/stats/", [&](uint64_t osd_num, json11::Json value)
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// osd ID
osd_num_t osd_num;
char null_byte = 0;
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(), "/osd/stats/%ju%c", &osd_num, &null_byte);
if (scanned != 1 || !osd_num || osd_num >= POOL_ID_MAX)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
// osd/stats/<N>::free
osd_free[osd_num] = kv.value["free"].uint64_value();
}
osd_free[osd_num] = value["free"].uint64_value();
});
// Calculate max_avail for each pool
for (auto & pp: parent->cli->st_cli.pool_config)
{
@@ -254,29 +233,17 @@ resume_1:
state = 100;
return;
}
auto pg_stats = parent->etcd_result["responses"][0]["response_range"]["kvs"];
// Calculate recovery percent
std::map<pool_id_t, object_counts_t> counts;
for (auto & kv_item: pg_stats.array_items())
parent->iterate_kvs_2(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/pg/stats/",
[&](pool_id_t pool_id, uint64_t pg_num, json11::Json value)
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// pool ID & pg number
pool_id_t pool_id;
pg_num_t pg_num = 0;
char null_byte = 0;
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
"/pg/stats/%u/%u%c", &pool_id, &pg_num, &null_byte);
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
auto & cnt = counts[pool_id];
cnt.object_count += kv.value["object_count"].uint64_value();
cnt.misplaced_count += kv.value["misplaced_count"].uint64_value();
cnt.degraded_count += kv.value["degraded_count"].uint64_value();
cnt.incomplete_count += kv.value["incomplete_count"].uint64_value();
}
cnt.object_count += value["object_count"].uint64_value();
cnt.misplaced_count += value["misplaced_count"].uint64_value();
cnt.degraded_count += value["degraded_count"].uint64_value();
cnt.incomplete_count += value["incomplete_count"].uint64_value();
});
for (auto & pp: pool_stats)
{
auto & cnt = counts[pp.first];
@@ -317,35 +284,23 @@ resume_1:
state = 100;
return;
}
auto inode_stats = parent->etcd_result["responses"][0]["response_range"]["kvs"];
// Performance statistics
std::map<pool_id_t, io_stats_t> pool_io;
for (auto & kv_item: inode_stats.array_items())
parent->iterate_kvs_2(parent->etcd_result["responses"][0]["response_range"]["kvs"], "/inode/stats/",
[&](pool_id_t pool_id, uint64_t inode_num, json11::Json value)
{
auto kv = parent->cli->st_cli.parse_etcd_kv(kv_item);
// pool ID & inode number
pool_id_t pool_id;
inode_t only_inode_num;
char null_byte = 0;
int scanned = sscanf(kv.key.substr(parent->cli->st_cli.etcd_prefix.length()).c_str(),
"/inode/stats/%u/%ju%c", &pool_id, &only_inode_num, &null_byte);
if (scanned != 2 || !pool_id || pool_id >= POOL_ID_MAX || INODE_POOL(only_inode_num) != 0)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
auto & io = pool_io[pool_id];
io.read_iops += kv.value["read"]["iops"].uint64_value();
io.read_bps += kv.value["read"]["bps"].uint64_value();
io.read_lat += kv.value["read"]["lat"].uint64_value();
io.write_iops += kv.value["write"]["iops"].uint64_value();
io.write_bps += kv.value["write"]["bps"].uint64_value();
io.write_lat += kv.value["write"]["lat"].uint64_value();
io.delete_iops += kv.value["delete"]["iops"].uint64_value();
io.delete_bps += kv.value["delete"]["bps"].uint64_value();
io.delete_lat += kv.value["delete"]["lat"].uint64_value();
io.read_iops += value["read"]["iops"].uint64_value();
io.read_bps += value["read"]["bps"].uint64_value();
io.read_lat += value["read"]["lat"].uint64_value();
io.write_iops += value["write"]["iops"].uint64_value();
io.write_bps += value["write"]["bps"].uint64_value();
io.write_lat += value["write"]["lat"].uint64_value();
io.delete_iops += value["delete"]["iops"].uint64_value();
io.delete_bps += value["delete"]["bps"].uint64_value();
io.delete_lat += value["delete"]["lat"].uint64_value();
io.count++;
}
});
for (auto & pp: pool_stats)
{
auto & io = pool_io[pp.first];

View File

@@ -18,7 +18,7 @@ struct status_printer_t
cli_tool_t *parent;
int state = 0;
json11::Json::array mon_members, osd_stats;
json11::Json::array mon_members;
json11::Json agg_stats;
std::map<pool_id_t, json11::Json::object> pool_stats;
json11::Json::array etcd_states;
@@ -93,7 +93,7 @@ resume_2:
return;
}
mon_members = parent->etcd_result["responses"][0]["response_range"]["kvs"].array_items();
osd_stats = parent->etcd_result["responses"][1]["response_range"]["kvs"].array_items();
auto osd_stats = parent->etcd_result["responses"][1]["response_range"]["kvs"];
if (parent->etcd_result["responses"][2]["response_range"]["kvs"].array_items().size() > 0)
{
agg_stats = parent->cli->st_cli.parse_etcd_kv(parent->etcd_result["responses"][2]["response_range"]["kvs"][0]).value;
@@ -133,20 +133,11 @@ resume_2:
}
int osd_count = 0, osd_up = 0;
uint64_t total_raw = 0, free_raw = 0, free_down_raw = 0, down_raw = 0;
for (int i = 0; i < osd_stats.size(); i++)
parent->iterate_kvs_1(osd_stats, "/osd/stats/", [&](uint64_t stat_osd_num, json11::Json value)
{
auto kv = parent->cli->st_cli.parse_etcd_kv(osd_stats[i]);
osd_num_t stat_osd_num = 0;
char null_byte = 0;
int scanned = sscanf(kv.key.c_str() + parent->cli->st_cli.etcd_prefix.size(), "/osd/stats/%ju%c", &stat_osd_num, &null_byte);
if (scanned != 1 || !stat_osd_num)
{
fprintf(stderr, "Invalid key in etcd: %s\n", kv.key.c_str());
continue;
}
osd_count++;
auto osd_size = kv.value["size"].uint64_value();
auto osd_free = kv.value["free"].uint64_value();
auto osd_size = value["size"].uint64_value();
auto osd_free = value["free"].uint64_value();
total_raw += osd_size;
free_raw += osd_free;
if (!osd_free)
@@ -164,10 +155,10 @@ resume_2:
}
else
{
down_raw += kv.value["size"].uint64_value();
free_down_raw += kv.value["free"].uint64_value();
down_raw += value["size"].uint64_value();
free_down_raw += value["free"].uint64_value();
}
}
});
int pool_count = 0, pools_active = 0;
std::map<std::string, int> pgs_by_state;
std::string pgs_by_state_str;

View File

@@ -185,7 +185,7 @@ void kv_cli_t::run()
fcntl(0, F_SETFL, fcntl(0, F_GETFL, 0) | O_NONBLOCK);
try
{
epmgr->tfd->set_fd_handler(0, false, [this](int fd, int events)
epmgr->tfd->set_fd_handler(0, EPOLLIN, [this](int fd, int events)
{
if (events & EPOLLIN)
{
@@ -193,7 +193,7 @@ void kv_cli_t::run()
}
if (events & EPOLLRDHUP)
{
epmgr->tfd->set_fd_handler(0, false, NULL);
epmgr->tfd->set_fd_handler(0, 0, NULL);
finished = true;
}
});

View File

@@ -189,6 +189,12 @@ void nfs_proxy_t::run(json11::Json cfg)
cmd->epmgr = epmgr;
cmd->cli = cli;
watch_stats();
// Init Pseudo-FS before starting client because it depends on inode_change_hook
if (fsname == "")
{
blockfs = new block_fs_state_t();
blockfs->init(this, cfg);
}
// Load image metadata
while (!cli->is_ready())
{
@@ -199,13 +205,8 @@ void nfs_proxy_t::run(json11::Json cfg)
}
// Check default pool
check_default_pool();
// Check if we're using VitastorFS
if (fsname == "")
{
blockfs = new block_fs_state_t();
blockfs->init(this, cfg);
}
else
// Init VitastorFS after starting client because it depends on loaded inode configuration
if (fsname != "")
{
kvfs = new kv_fs_state_t();
kvfs->init(this, cfg);
@@ -242,7 +243,7 @@ void nfs_proxy_t::run(json11::Json cfg)
// Create NFS socket and add it to epoll
int nfs_socket = create_and_bind_socket(bind_address, nfs_port, 128, &listening_port);
fcntl(nfs_socket, F_SETFL, fcntl(nfs_socket, F_GETFL, 0) | O_NONBLOCK);
epmgr->tfd->set_fd_handler(nfs_socket, false, [this](int nfs_socket, int epoll_events)
epmgr->tfd->set_fd_handler(nfs_socket, EPOLLIN, [this](int nfs_socket, int epoll_events)
{
if (epoll_events & EPOLLRDHUP)
{
@@ -259,7 +260,7 @@ void nfs_proxy_t::run(json11::Json cfg)
// Create portmap socket and add it to epoll
int portmap_socket = create_and_bind_socket(bind_address, 111, 128, NULL);
fcntl(portmap_socket, F_SETFL, fcntl(portmap_socket, F_GETFL, 0) | O_NONBLOCK);
epmgr->tfd->set_fd_handler(portmap_socket, false, [this](int portmap_socket, int epoll_events)
epmgr->tfd->set_fd_handler(portmap_socket, EPOLLIN, [this](int portmap_socket, int epoll_events)
{
if (epoll_events & EPOLLRDHUP)
{
@@ -465,7 +466,7 @@ void nfs_proxy_t::do_accept(int listen_fd)
{
cli->proc_table.insert(fn);
}
epmgr->tfd->set_fd_handler(nfs_fd, true, [cli](int nfs_fd, int epoll_events)
epmgr->tfd->set_fd_handler(nfs_fd, EPOLLIN|EPOLLOUT, [cli](int nfs_fd, int epoll_events)
{
// Handle incoming event
if (epoll_events & EPOLLRDHUP)
@@ -722,7 +723,7 @@ void nfs_client_t::stop()
stopped = true;
if (refs <= 0)
{
parent->epmgr->tfd->set_fd_handler(nfs_fd, true, NULL);
parent->epmgr->tfd->set_fd_handler(nfs_fd, 0, NULL);
close(nfs_fd);
delete this;
}

View File

@@ -141,6 +141,14 @@ void osd_t::parse_config(bool init)
config = msgr.merge_configs(cli_config, file_config, etcd_global_config, etcd_osd_config);
if (config.find("log_level") == this->config.end())
config["log_level"] = 1;
if (init)
{
// OSD number
osd_num = config["osd_num"].uint64_value();
if (!osd_num)
throw std::runtime_error("osd_num is required in the configuration");
msgr.osd_num = osd_num;
}
if (bs)
{
auto bs_cfg = json_to_bs(config);
@@ -150,11 +158,6 @@ void osd_t::parse_config(bool init)
msgr.parse_config(config);
if (init)
{
// OSD number
osd_num = config["osd_num"].uint64_value();
if (!osd_num)
throw std::runtime_error("osd_num is required in the configuration");
msgr.osd_num = osd_num;
// Vital Blockstore parameters
bs_block_size = config["block_size"].uint64_value();
if (!bs_block_size)
@@ -361,7 +364,7 @@ void osd_t::bind_socket()
listen_fd = create_and_bind_socket(bind_address, bind_port, listen_backlog, &listening_port);
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
epmgr->set_fd_handler(listen_fd, EPOLLIN, [this](int fd, int events)
{
msgr.accept_connections(listen_fd);
});

View File

@@ -199,12 +199,14 @@ class osd_t
ring_consumer_t consumer;
// op statistics
osd_op_stats_t prev_stats;
osd_op_stats_t prev_stats, prev_report_stats;
timespec report_stats_ts;
std::map<uint64_t, inode_stats_t> inode_stats;
std::map<uint64_t, timespec> vanishing_inodes;
const char* recovery_stat_names[2] = { "degraded", "misplaced" };
recovery_stat_t recovery_stat[2];
recovery_stat_t recovery_print_prev[2];
recovery_stat_t recovery_report_prev[2];
// recovery auto-tuning
int rtune_timer_id = -1;
@@ -252,6 +254,7 @@ class osd_t
bool check_peer_config(osd_client_t *cl, json11::Json conf);
void repeer_pgs(osd_num_t osd_num);
void start_pg_peering(pg_t & pg);
void drop_dirty_pg_connections(pool_pg_num_t pg);
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
void discard_list_subop(osd_op_t *list_op);
bool stop_pg(pg_t & pg);

View File

@@ -180,6 +180,12 @@ json11::Json osd_t::get_statistics()
json11::Json::object st;
timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
uint64_t ts_diff = 0;
if (report_stats_ts.tv_sec != 0)
ts_diff = (ts.tv_sec - report_stats_ts.tv_sec + (ts.tv_nsec - report_stats_ts.tv_nsec) / 1000000000);
if (!ts_diff)
ts_diff = 1;
report_stats_ts = ts;
char time_str[50] = { 0 };
sprintf(time_str, "%jd.%03ld", (uint64_t)ts.tv_sec, ts.tv_nsec/1000000);
st["time"] = time_str;
@@ -196,33 +202,50 @@ json11::Json osd_t::get_statistics()
json11::Json::object op_stats, subop_stats;
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
{
auto n = (msgr.stats.op_stat_count[i] - prev_report_stats.op_stat_count[i]);
op_stats[osd_op_names[i]] = json11::Json::object {
{ "count", msgr.stats.op_stat_count[i] },
{ "usec", msgr.stats.op_stat_sum[i] },
{ "bytes", msgr.stats.op_stat_bytes[i] },
{ "lat", (msgr.stats.op_stat_sum[i] - prev_report_stats.op_stat_sum[i]) / (n < 1 ? 1 : n) },
{ "bps", (msgr.stats.op_stat_bytes[i] - prev_report_stats.op_stat_bytes[i]) / ts_diff },
{ "iops", n / ts_diff },
};
}
for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
{
auto n = (msgr.stats.subop_stat_count[i] - prev_report_stats.subop_stat_count[i]);
subop_stats[osd_op_names[i]] = json11::Json::object {
{ "count", msgr.stats.subop_stat_count[i] },
{ "usec", msgr.stats.subop_stat_sum[i] },
{ "lat", (msgr.stats.subop_stat_sum[i] - prev_report_stats.subop_stat_sum[i]) / (n < 1 ? 1 : n) },
{ "iops", n / ts_diff },
};
}
st["op_stats"] = op_stats;
st["subop_stats"] = subop_stats;
auto n0 = recovery_stat[0].count - recovery_report_prev[0].count;
auto n1 = recovery_stat[1].count - recovery_report_prev[1].count;
st["recovery_stats"] = json11::Json::object {
{ recovery_stat_names[0], json11::Json::object {
{ "count", recovery_stat[0].count },
{ "bytes", recovery_stat[0].bytes },
{ "usec", recovery_stat[0].usec },
{ "lat", (recovery_stat[0].usec - recovery_report_prev[0].usec) / (n0 < 1 ? 1 : n0) },
{ "bps", (recovery_stat[0].bytes - recovery_report_prev[0].bytes) / ts_diff },
{ "iops", n0 / ts_diff },
} },
{ recovery_stat_names[1], json11::Json::object {
{ "count", recovery_stat[1].count },
{ "bytes", recovery_stat[1].bytes },
{ "usec", recovery_stat[1].usec },
{ "lat", (recovery_stat[1].usec - recovery_report_prev[1].usec) / (n1 < 1 ? 1 : n1) },
{ "bps", (recovery_stat[1].bytes - recovery_report_prev[1].bytes) / ts_diff },
{ "iops", n1 / ts_diff },
} },
};
prev_report_stats = msgr.stats;
memcpy(recovery_report_prev, recovery_stat, sizeof(recovery_stat));
return st;
}

View File

@@ -168,20 +168,15 @@ void osd_t::reset_pg(pg_t & pg)
dirty_pgs.erase({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
}
// Repeer on each connect/disconnect peer event
void osd_t::start_pg_peering(pg_t & pg)
// Drop connections of clients who have this PG in dirty_pgs
void osd_t::drop_dirty_pg_connections(pool_pg_num_t pg)
{
pg.state = PG_PEERING;
this->peering_state |= OSD_PEERING_PGS;
reset_pg(pg);
report_pg_state(pg);
// Drop connections of clients who have this PG in dirty_pgs
if (immediate_commit != IMMEDIATE_ALL)
{
std::vector<int> to_stop;
for (auto & cp: msgr.clients)
{
if (cp.second->dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) != cp.second->dirty_pgs.end())
if (cp.second->dirty_pgs.find(pg) != cp.second->dirty_pgs.end())
{
to_stop.push_back(cp.first);
}
@@ -191,6 +186,16 @@ void osd_t::start_pg_peering(pg_t & pg)
msgr.stop_client(peer_fd);
}
}
}
// Repeer on each connect/disconnect peer event
void osd_t::start_pg_peering(pg_t & pg)
{
pg.state = PG_PEERING;
this->peering_state |= OSD_PEERING_PGS;
reset_pg(pg);
report_pg_state(pg);
drop_dirty_pg_connections({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
// Try to connect with current peers if they're up, but we don't have connections to them
// Otherwise we may erroneously decide that the pg is incomplete :-)
for (auto pg_osd: pg.all_peers)
@@ -460,6 +465,7 @@ bool osd_t::stop_pg(pg_t & pg)
{
return false;
}
drop_dirty_pg_connections({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)))
{
finish_stop_pg(pg);

View File

@@ -247,6 +247,7 @@ resume_8:
finish:
if (cur_op->peer_fd)
{
// FIXME: Do it before executing sync
auto it = msgr.clients.find(cur_op->peer_fd);
if (it != msgr.clients.end())
it->second->dirty_pgs.clear();

View File

@@ -43,7 +43,7 @@ int main(int narg, char *args[])
// Accept new connections
int listen_fd = create_and_bind_socket("0.0.0.0", 11203, 128, NULL);
fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
epmgr->set_fd_handler(listen_fd, false, [listen_fd, msgr](int fd, int events)
epmgr->set_fd_handler(listen_fd, EPOLLIN, [listen_fd, msgr](int fd, int events)
{
msgr->accept_connections(listen_fd);
});

View File

@@ -43,8 +43,7 @@ void configure_single_pg_pool(cluster_client_t *cli)
},
});
cli->st_cli.on_load_pgs_hook(true);
std::map<std::string, etcd_kv_t> changes;
cli->st_cli.on_change_hook(changes);
cli->st_cli.on_change_pool_config_hook();
}
int *test_write(cluster_client_t *cli, uint64_t offset, uint64_t len, uint8_t c, std::function<void()> cb = NULL, bool instant = false)
@@ -281,7 +280,8 @@ void test1()
uint8_t c = offset < 0xE000 ? 0x56 : (offset < 0x10000 ? 0x57 : 0x58);
if (((uint8_t*)op->iov.buf[buf_idx].iov_base)[i] != c)
{
printf("Write replay: mismatch at %ju\n", offset-op->req.rw.offset);
printf("Write replay: mismatch at %ju (expected %02x, have %02x)\n", offset-op->req.rw.offset,
c, ((uint8_t*)op->iov.buf[buf_idx].iov_base)[i]);
goto fail;
}
}
@@ -290,9 +290,9 @@ void test1()
assert(offset == op->req.rw.offset+op->req.rw.len);
replay_ops.push_back(op);
}
if (replay_start != 0 || replay_end != 0x14000)
if (replay_start != 0 || replay_end != 0x10000)
{
printf("Write replay: range mismatch: %jx-%jx\n", replay_start, replay_end);
printf("Write replay: range mismatch: 0x%jx-0x%jx (expected 0-0x10000)\n", replay_start, replay_end);
assert(0);
}
for (auto op: replay_ops)
@@ -320,8 +320,6 @@ void test1()
check_disconnected(cli, 1);
pretend_connected(cli, 1);
check_op_count(cli, 1, 1);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
check_op_count(cli, 1, 1);
can_complete(r1);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
check_completed(r1);
@@ -341,7 +339,7 @@ void test1()
pretend_connected(cli, 1);
cli->continue_ops(true);
check_op_count(cli, 1, 1);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x2000), 0);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0, 0x1000), 0);
check_op_count(cli, 1, 1);
can_complete(r2);
pretend_op_completed(cli, find_op(cli, 1, OSD_OP_WRITE, 0x1000, 0x1000), 0);

View File

@@ -21,7 +21,7 @@ epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
}
tfd = new timerfd_manager_t([this](int fd, bool wr, std::function<void(int, int)> handler) { set_fd_handler(fd, wr, handler); });
tfd = new timerfd_manager_t([this](int fd, int events, std::function<void(int, int)> handler) { set_fd_handler(fd, events, handler); });
if (ringloop)
{
@@ -54,14 +54,14 @@ int epoll_manager_t::get_fd()
return epoll_fd;
}
void epoll_manager_t::set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler)
void epoll_manager_t::set_fd_handler(int fd, int events, std::function<void(int, int)> handler)
{
if (handler != NULL)
{
bool exists = epoll_handlers.find(fd) != epoll_handlers.end();
epoll_event ev;
ev.data.fd = fd;
ev.events = (wr ? EPOLLOUT : 0) | EPOLLIN | EPOLLRDHUP | EPOLLET;
ev.events = events | EPOLLRDHUP | EPOLLET;
if (epoll_ctl(epoll_fd, exists ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, fd, &ev) < 0)
{
if (errno == ENOENT)

View File

@@ -3,6 +3,8 @@
#pragma once
#include <sys/epoll.h>
#include <map>
#include "ringloop.h"
@@ -21,7 +23,7 @@ public:
epoll_manager_t(ring_loop_t *ringloop);
~epoll_manager_t();
int get_fd();
void set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler);
void set_fd_handler(int fd, int events, std::function<void(int, int)> handler);
void handle_events(int timeout);
timerfd_manager_t *tfd;

View File

@@ -32,12 +32,22 @@ static inline void my_uring_prep_readv(struct io_uring_sqe *sqe, int fd, const s
my_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset);
}
static inline void my_uring_prep_read(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, off_t offset)
{
my_uring_prep_rw(IORING_OP_READ, sqe, fd, buf, nbytes, offset);
}
static inline void my_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, off_t offset, int buf_index)
{
my_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset);
sqe->buf_index = buf_index;
}
static inline void my_uring_prep_write(struct io_uring_sqe *sqe, int fd, void *buf, unsigned nbytes, off_t offset)
{
my_uring_prep_rw(IORING_OP_WRITE, sqe, fd, buf, nbytes, offset);
}
static inline void my_uring_prep_writev(struct io_uring_sqe *sqe, int fd, const struct iovec *iovecs, unsigned nr_vecs, off_t offset)
{
my_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset);

View File

@@ -151,10 +151,11 @@ static uint64_t size_thresh[] = { (uint64_t)1024*1024*1024*1024, (uint64_t)1024*
static uint64_t size_thresh_d[] = { (uint64_t)1000000000000, (uint64_t)1000000000, (uint64_t)1000000, (uint64_t)1000, 0 };
static const int size_thresh_n = sizeof(size_thresh)/sizeof(size_thresh[0]);
static const char *size_unit = "TGMKB";
static const char *size_unit_ns = "TGMk ";
std::string format_size(uint64_t size, bool nobytes)
std::string format_size(uint64_t size, bool nobytes, bool nospace)
{
uint64_t *thr = nobytes ? size_thresh_d : size_thresh;
uint64_t *thr = (nobytes ? size_thresh_d : size_thresh);
char buf[256];
for (int i = 0; i < size_thresh_n; i++)
{
@@ -165,9 +166,19 @@ std::string format_size(uint64_t size, bool nobytes)
assert(l < sizeof(buf)-2);
if (buf[l-1] == '0')
l -= 2;
buf[l] = i == size_thresh_n-1 && nobytes ? 0 : ' ';
buf[l+1] = i == size_thresh_n-1 && nobytes ? 0 : size_unit[i];
buf[l+2] = 0;
if (i == size_thresh_n-1 && nobytes)
buf[l] = 0;
else if (nospace)
{
buf[l] = size_unit_ns[i];
buf[l+1] = 0;
}
else
{
buf[l] = ' ';
buf[l+1] = size_unit[i];
buf[l+2] = 0;
}
break;
}
}

View File

@@ -16,7 +16,7 @@ std::string strtolower(const std::string & in);
std::string trim(const std::string & in, const char *rm_chars = " \n\r\t");
std::string str_replace(const std::string & in, const std::string & needle, const std::string & replacement);
uint64_t stoull_full(const std::string & str, int base = 0);
std::string format_size(uint64_t size, bool nobytes = false);
std::string format_size(uint64_t size, bool nobytes = false, bool nospace = false);
void print_help(const char *help_text, std::string exe_name, std::string cmd, bool all);
uint64_t parse_time(std::string time_str, bool *ok = NULL);
std::string read_all_fd(int fd);

View File

@@ -11,7 +11,7 @@
#include <stdexcept>
#include "timerfd_manager.h"
timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler)
timerfd_manager_t::timerfd_manager_t(std::function<void(int, int, std::function<void(int, int)>)> set_fd_handler)
{
this->set_fd_handler = set_fd_handler;
wait_state = 0;
@@ -20,7 +20,7 @@ timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function
{
throw std::runtime_error(std::string("timerfd_create: ") + strerror(errno));
}
set_fd_handler(timerfd, false, [this](int fd, int events)
set_fd_handler(timerfd, EPOLLIN, [this](int fd, int events)
{
handle_readable();
});
@@ -28,7 +28,7 @@ timerfd_manager_t::timerfd_manager_t(std::function<void(int, bool, std::function
timerfd_manager_t::~timerfd_manager_t()
{
set_fd_handler(timerfd, false, NULL);
set_fd_handler(timerfd, 0, NULL);
close(timerfd);
}

View File

@@ -30,9 +30,9 @@ class timerfd_manager_t
void trigger_nearest();
void handle_readable();
public:
std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler;
std::function<void(int, int, std::function<void(int, int)>)> set_fd_handler;
timerfd_manager_t(std::function<void(int, bool, std::function<void(int, int)>)> set_fd_handler);
timerfd_manager_t(std::function<void(int, int, std::function<void(int, int)>)> set_fd_handler);
~timerfd_manager_t();
int set_timer(uint64_t millis, bool repeat, std::function<void(int)> callback);
int set_timer_us(uint64_t micros, bool repeat, std::function<void(int)> callback);

View File

@@ -20,7 +20,6 @@ cd `dirname $0`/..
trap 'kill -9 $(jobs -p)' EXIT
ANTIETCD=1
ETCD=${ETCD:-etcd}
ETCD_IP=${ETCD_IP:-127.0.0.1}
ETCD_PORT=${ETCD_PORT:-12379}
@@ -33,20 +32,14 @@ if [ "$KEEP_DATA" = "" ]; then
fi
ETCD_URL="http://$ETCD_IP:$ETCD_PORT"
ETCD_CLUSTER="etcd1=http://$ETCD_IP:$((ETCD_PORT+1))"
for i in $(seq 2 $ETCD_COUNT); do
ETCD_URL="$ETCD_URL,http://$ETCD_IP:$((ETCD_PORT+2*i-2))"
ETCD_CLUSTER="$ETCD_CLUSTER,etcd$i=http://$ETCD_IP:$((ETCD_PORT+2*i-1))"
done
ETCDCTL="${ETCD}ctl --endpoints=$ETCD_URL --dial-timeout=5s --command-timeout=10s"
start_etcd()
{
if [[ "$ANTIETCD" -eq "1" ]]; then
start_antietcd $*
else
start__etcd $*
fi
}
start__etcd()
{
local i=$1
local t=/run/user/$(id -u)
@@ -60,65 +53,15 @@ start__etcd()
eval ETCD${i}_PID=$!
}
start_etcd_cluster()
{
ETCD_CLUSTER="etcd1=http://$ETCD_IP:$((ETCD_PORT+1))"
for i in $(seq 2 $ETCD_COUNT); do
ETCD_CLUSTER="$ETCD_CLUSTER,etcd$i=http://$ETCD_IP:$((ETCD_PORT+2*i-1))"
done
for i in $(seq 1 $ETCD_COUNT); do
start__etcd $i
done
ETCDCTL="${ETCD}ctl --endpoints=$ETCD_URL --dial-timeout=5s --command-timeout=10s"
for i in {1..30}; do
${ETCD}ctl --endpoints=$ETCD_URL --dial-timeout=1s --command-timeout=1s member list >/dev/null && break
if [[ $i = 30 ]]; then
format_error "Failed to start etcd"
fi
done
}
start_antietcd()
{
local i=$1
local t=/run/user/$(id -u)
findmnt $t >/dev/null || (sudo mkdir -p $t && sudo mount -t tmpfs tmpfs $t)
local persist=""
if [[ -n "$ANTIETCD_PERSISTENCE" ]]; then
persist="--data ./testdata/antietcd$i.json.gz --persist_interval 500"
for i in $(seq 1 $ETCD_COUNT); do
start_etcd $i
done
for i in {1..30}; do
${ETCD}ctl --endpoints=$ETCD_URL --dial-timeout=1s --command-timeout=1s member list >/dev/null && break
if [[ $i = 30 ]]; then
format_error "Failed to start etcd"
fi
local cluster=""
if [[ $ETCD_COUNT -gt 1 ]]; then
cluster="--node_id etcd$i --cluster_key abcdef --cluster $ETCD_CLUSTER"
fi
nodejs mon/tinyraft/antietcd-app.js $persist --port $((ETCD_PORT+2*i-2)) $cluster &>./testdata/antietcd$i.log &
eval ETCD${i}_PID=$!
}
start_antietcd_cluster()
{
ETCD_CLUSTER="etcd1=http://$ETCD_IP:$((ETCD_PORT))"
for i in $(seq 2 $ETCD_COUNT); do
ETCD_CLUSTER="$ETCD_CLUSTER,etcd$i=http://$ETCD_IP:$((ETCD_PORT+2*i-2))"
done
for i in $(seq 1 $ETCD_COUNT); do
start_antietcd $i
done
ETCDCTL="nodejs mon/tinyraft/anticli.js -e $ETCD_URL"
for i in {1..30}; do
nodejs mon/tinyraft/anticli.js -e "$ETCD_URL" get --prefix / && break
if [[ $i = 30 ]]; then
format_error "Failed to start antietcd"
fi
sleep 1
done
}
if [[ "$ANTIETCD" -eq "1" ]]; then
start_antietcd_cluster
else
start_etcd_cluster
fi
done
echo leak:fio >> testdata/lsan-suppress.txt
echo leak:tcmalloc >> testdata/lsan-suppress.txt

View File

@@ -54,7 +54,7 @@ for i in $(seq 1 $OSD_COUNT); do
start_osd $i
done
(while true; do set +e; node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1; if [[ $? -ne 2 ]]; then break; fi; done) >>./testdata/mon.log 2>&1 &
node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 >>./testdata/mon.log 2>&1 &
MON_PID=$!
if [ "$SCHEME" = "ec" ]; then

View File

@@ -15,7 +15,7 @@ for i in $(seq 1 $OSD_COUNT); do
eval OSD${i}_PID=$!
done
(while true; do node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 || true; done) >>./testdata/mon.log 2>&1 &
node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 >>./testdata/mon.log 2>&1 &
MON_PID=$!
sleep 3