Compare commits

...

46 Commits

Author SHA1 Message Date
f5e5686540 Experimental: Support RDMA devices without ODP by stupidly calling ibv_reg_mr on every output buffer :)
Some checks failed
Test / test_rm (push) Successful in 18s
Test / test_snapshot_ec (push) Successful in 35s
Test / test_interrupted_rebalance_ec (push) Successful in 1m53s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 49s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m16s
Test / test_snapshot_chain_ec (push) Successful in 3m2s
Test / test_rebalance_verify_imm (push) Successful in 3m22s
Test / test_write (push) Successful in 34s
Test / test_rebalance_verify (push) Successful in 4m0s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m47s
Test / test_rebalance_verify_ec (push) Successful in 4m56s
Test / test_write_xor (push) Failing after 3m9s
Test / test_heal_pg_size_2 (push) Successful in 3m43s
Test / test_heal_csum_32k_dmj (push) Successful in 5m20s
Test / test_heal_ec (push) Successful in 5m43s
Test / test_heal_csum_32k_dj (push) Successful in 6m6s
Test / test_heal_csum_32k (push) Successful in 6m23s
Test / test_scrub (push) Successful in 1m24s
Test / test_scrub_zero_osd_2 (push) Successful in 1m6s
Test / test_scrub_xor (push) Successful in 1m9s
Test / test_heal_csum_4k_dj (push) Successful in 6m17s
Test / test_heal_csum_4k_dmj (push) Successful in 6m19s
Test / test_scrub_ec (push) Successful in 49s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 52s
Test / test_scrub_pg_size_3 (push) Successful in 1m9s
Test / test_heal_csum_4k (push) Successful in 5m57s
2023-11-11 11:47:07 +03:00
12a6bed2d5 Return the new accidentally rolled back json11 commit ("allow trailing comma")
Some checks failed
Test / test_snapshot_ec (push) Successful in 28s
Test / test_move_reappear (push) Successful in 19s
Test / test_interrupted_rebalance_ec (push) Successful in 1m51s
Test / test_rm (push) Successful in 18s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m22s
Test / test_rebalance_verify_imm (push) Successful in 3m4s
Test / test_rebalance_verify (push) Successful in 3m48s
Test / test_write (push) Successful in 39s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m14s
Test / test_rebalance_verify_ec (push) Successful in 4m2s
Test / test_heal_pg_size_2 (push) Successful in 3m44s
Test / test_heal_csum_32k_dmj (push) Successful in 4m43s
Test / test_heal_ec (push) Successful in 6m47s
Test / test_heal_csum_32k_dj (push) Successful in 5m57s
Test / test_heal_csum_32k (push) Successful in 6m11s
Test / test_scrub (push) Successful in 1m10s
Test / test_scrub_zero_osd_2 (push) Successful in 1m17s
Test / test_heal_csum_4k_dmj (push) Successful in 6m19s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m10s
Test / test_scrub_pg_size_3 (push) Successful in 1m47s
Test / test_heal_csum_4k_dj (push) Successful in 6m3s
Test / test_heal_csum_4k (push) Successful in 6m3s
Test / test_scrub_ec (push) Successful in 28s
Test / test_scrub_xor (push) Successful in 23s
Test / test_snapshot_chain_ec (push) Successful in 2m25s
Test / test_write_xor (push) Failing after 3m6s
2023-11-07 15:49:23 +03:00
5524dbdab7 Release 1.2.0
Some checks failed
Test / test_snapshot_ec (push) Successful in 25s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m18s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 18s
Test / test_snapshot_chain (push) Successful in 2m13s
Test / test_snapshot_chain_ec (push) Successful in 2m57s
Test / test_rebalance_verify_imm (push) Successful in 2m51s
Test / test_write (push) Successful in 38s
Test / test_rebalance_verify (push) Successful in 3m39s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec (push) Successful in 3m56s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m6s
Test / test_heal_pg_size_2 (push) Successful in 3m43s
Test / test_heal_csum_32k_dmj (push) Successful in 4m35s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 5m50s
Test / test_heal_csum_4k_dmj (push) Successful in 5m44s
Test / test_scrub_zero_osd_2 (push) Successful in 57s
Test / test_scrub (push) Successful in 1m0s
Test / test_scrub_xor (push) Successful in 1m5s
Test / test_heal_csum_4k_dj (push) Successful in 5m9s
Test / test_scrub_pg_size_3 (push) Successful in 1m38s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 54s
Test / test_scrub_ec (push) Successful in 52s
Test / test_heal_csum_4k (push) Successful in 5m8s
Test / test_heal_ec (push) Successful in 3m17s
Test / test_write_xor (push) Successful in 35s
Test / test_move_reappear (push) Failing after 48s
New features:

- Implement CSI volume expansion
- Implement CSI volume snapshots
- CSI driver now requires Kubernetes >= 1.20

Bug fixes:

- Important bug fix for EC: fix EC n+k, k>=2 read recovery in ISA-L version returning
  incorrect data when reading at least the second chunk out of multiple missing chunks
  without reading the first one. All users of EC n+k, k>=2 should upgrade as soon as
  possible, and upgrade should be conducted with downtime: first stop all clients
  (VMs/containers), then all OSDs, then upgrade and restart everything.
- Fix unstable statistics aggregation in monitor (affecting vitastor-cli status and df)
- Make udev not wait for OSDs to start during boot
- Do not report negative numbers of offline PGs in vitastor-cli status when changing PG count
- Report both old and new PG counts in vitastor-cli df when changing it
- Fix OSDs sometimes not starting with "The code only supports journal versions 1 and 2,
  but it is 2 on disk" error after upgrading from pre-1.0 versions and letting OSDs run
  for some time
- Fix monitors sometimes returning old PG count back after OSD configuration changes
- Make monitor PG changes more stable and timeout errors less probable
2023-11-05 01:48:57 +03:00
cd3dec06ac Remove spaces from old->new PG count in df
All checks were successful
Test / test_interrupted_rebalance_ec (push) Successful in 1m50s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 14s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_snapshot_down (push) Successful in 29s
Test / test_snapshot_down_ec (push) Successful in 32s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m34s
Test / test_rebalance_verify_imm (push) Successful in 3m9s
Test / test_rebalance_verify (push) Successful in 4m9s
Test / test_write (push) Successful in 40s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m15s
Test / test_rebalance_verify_ec (push) Successful in 4m29s
Test / test_heal_pg_size_2 (push) Successful in 3m21s
Test / test_heal_csum_32k_dmj (push) Successful in 5m38s
Test / test_heal_ec (push) Successful in 6m14s
Test / test_heal_csum_32k_dj (push) Successful in 6m22s
Test / test_heal_csum_32k (push) Successful in 6m40s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 1m12s
Test / test_scrub_xor (push) Successful in 1m16s
Test / test_heal_csum_4k_dj (push) Successful in 6m4s
Test / test_heal_csum_4k_dmj (push) Successful in 6m34s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m4s
Test / test_heal_csum_4k (push) Successful in 5m37s
Test / test_scrub_ec (push) Successful in 43s
Test / test_scrub_pg_size_3 (push) Successful in 1m14s
Test / test_write_xor (push) Successful in 1m11s
Test / test_snapshot_chain_ec (push) Successful in 2m43s
2023-11-05 01:45:45 +03:00
371d79e059 Document vitastor-csi features 2023-11-05 01:05:26 +03:00
0e888e6c60 Prevent spamming etcd with last_clean_pgs update requests
All checks were successful
Test / test_snapshot_ec (push) Successful in 34s
Test / test_interrupted_rebalance_ec (push) Successful in 1m49s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 14s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m23s
Test / test_snapshot_chain_ec (push) Successful in 3m4s
Test / test_rebalance_verify_imm (push) Successful in 2m54s
Test / test_rebalance_verify (push) Successful in 3m48s
Test / test_write (push) Successful in 35s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 55s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m15s
Test / test_rebalance_verify_ec (push) Successful in 5m3s
Test / test_heal_pg_size_2 (push) Successful in 3m59s
Test / test_heal_ec (push) Successful in 4m56s
Test / test_heal_csum_32k_dmj (push) Successful in 5m48s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 6m35s
Test / test_scrub (push) Successful in 1m14s
Test / test_heal_csum_4k_dmj (push) Successful in 6m54s
Test / test_scrub_zero_osd_2 (push) Successful in 1m2s
Test / test_scrub_xor (push) Successful in 49s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m9s
Test / test_scrub_pg_size_3 (push) Successful in 1m54s
Test / test_heal_csum_4k_dj (push) Successful in 6m17s
Test / test_heal_csum_4k (push) Successful in 6m18s
Test / test_scrub_ec (push) Successful in 37s
2023-11-05 00:12:00 +03:00
408c21d8f0 Scale last_clean_pgs PG count even if current PGs already contain the new number of PGs
Some checks failed
Test / test_interrupted_rebalance_ec (push) Successful in 1m49s
Test / test_snapshot_ec (push) Successful in 31s
Test / test_rm (push) Successful in 13s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 50s
Test / test_snapshot_down_ec (push) Successful in 22s
Test / test_splitbrain (push) Successful in 20s
Test / test_snapshot_chain (push) Successful in 2m15s
Test / test_snapshot_chain_ec (push) Successful in 2m56s
Test / test_rebalance_verify_imm (push) Successful in 2m59s
Test / test_write (push) Successful in 34s
Test / test_rebalance_verify (push) Successful in 3m44s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 52s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m5s
Test / test_rebalance_verify_ec (push) Successful in 5m1s
Test / test_heal_pg_size_2 (push) Successful in 4m1s
Test / test_heal_ec (push) Successful in 5m3s
Test / test_heal_csum_32k_dmj (push) Successful in 5m13s
Test / test_heal_csum_32k_dj (push) Successful in 5m37s
Test / test_heal_csum_32k (push) Successful in 6m19s
Test / test_scrub (push) Successful in 1m11s
Test / test_heal_csum_4k_dmj (push) Successful in 6m13s
Test / test_scrub_zero_osd_2 (push) Successful in 1m5s
Test / test_scrub_xor (push) Successful in 48s
Test / test_heal_csum_4k_dj (push) Successful in 6m11s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m10s
Test / test_scrub_pg_size_3 (push) Successful in 1m29s
Test / test_heal_csum_4k (push) Successful in 6m9s
Test / test_scrub_ec (push) Successful in 35s
2023-11-04 23:45:59 +03:00
43cb9ae212 Prevent multiple parallel recheck_pgs in case of timeouts
Some checks failed
Test / test_snapshot_ec (push) Successful in 37s
Test / test_minsize_1 (push) Successful in 13s
Test / test_rm (push) Successful in 12s
Test / test_move_reappear (push) Successful in 17s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m25s
Test / test_snapshot_chain_ec (push) Failing after 3m7s
Test / test_rebalance_verify_imm (push) Successful in 3m0s
Test / test_rebalance_verify (push) Successful in 3m54s
Test / test_write (push) Successful in 34s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 52s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m6s
Test / test_rebalance_verify_ec (push) Successful in 5m10s
Test / test_heal_pg_size_2 (push) Successful in 4m1s
Test / test_heal_ec (push) Successful in 4m21s
Test / test_heal_csum_32k_dmj (push) Successful in 5m10s
Test / test_heal_csum_32k_dj (push) Successful in 5m51s
Test / test_heal_csum_32k (push) Successful in 6m54s
Test / test_heal_csum_4k_dmj (push) Successful in 6m38s
Test / test_scrub (push) Successful in 1m9s
Test / test_scrub_zero_osd_2 (push) Successful in 1m2s
Test / test_scrub_xor (push) Successful in 43s
Test / test_heal_csum_4k_dj (push) Successful in 6m24s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m16s
Test / test_scrub_pg_size_3 (push) Successful in 1m38s
Test / test_scrub_ec (push) Successful in 37s
Test / test_heal_csum_4k (push) Successful in 6m2s
2023-11-04 20:59:56 +03:00
e15b6e7805 Fix "cannot be narrowed" in clang
Some checks failed
Test / test_snapshot_ec (push) Successful in 44s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m10s
Test / test_rm (push) Successful in 16s
Test / test_move_reappear (push) Failing after 51s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m32s
Test / test_snapshot_chain_ec (push) Successful in 3m2s
Test / test_rebalance_verify_imm (push) Successful in 3m0s
Test / test_write (push) Successful in 33s
Test / test_rebalance_verify (push) Successful in 3m53s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m23s
Test / test_rebalance_verify_ec (push) Successful in 4m11s
Test / test_write_xor (push) Failing after 3m12s
Test / test_heal_pg_size_2 (push) Successful in 3m47s
Test / test_heal_csum_32k_dmj (push) Successful in 5m17s
Test / test_heal_ec (push) Successful in 5m34s
Test / test_heal_csum_32k_dj (push) Successful in 6m43s
Test / test_heal_csum_32k (push) Successful in 6m30s
Test / test_scrub (push) Successful in 1m18s
Test / test_scrub_zero_osd_2 (push) Successful in 1m11s
Test / test_heal_csum_4k_dmj (push) Successful in 6m24s
Test / test_heal_csum_4k_dj (push) Successful in 6m23s
Test / test_scrub_xor (push) Successful in 54s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m1s
Test / test_scrub_ec (push) Successful in 54s
Test / test_scrub_pg_size_3 (push) Successful in 1m25s
Test / test_heal_csum_4k (push) Successful in 6m10s
2023-11-04 18:14:44 +03:00
31017d8412 Allow to start with V2 journal with header size from V1, as incorrectly updated by previous versions 2023-11-04 18:13:42 +03:00
4819854064 Fix OSDs incorrectly updating journal superblock after upgrade to 1.x from pre-1.x and refusing to start after it
Some checks failed
Test / test_interrupted_rebalance_imm (push) Successful in 3m38s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 47s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 21s
Test / test_snapshot_chain (push) Successful in 2m31s
Test / test_snapshot_chain_ec (push) Successful in 3m7s
Test / test_rebalance_verify_imm (push) Successful in 2m54s
Test / test_write (push) Successful in 32s
Test / test_rebalance_verify (push) Successful in 3m46s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 37s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m56s
Test / test_rebalance_verify_ec (push) Successful in 5m0s
Test / test_heal_pg_size_2 (push) Failing after 4m18s
Test / test_heal_ec (push) Successful in 5m3s
Test / test_heal_csum_32k_dmj (push) Successful in 5m19s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 6m37s
Test / test_heal_csum_4k_dmj (push) Successful in 6m46s
Test / test_scrub (push) Successful in 1m5s
Test / test_scrub_zero_osd_2 (push) Successful in 48s
Test / test_scrub_xor (push) Successful in 45s
Test / test_heal_csum_4k_dj (push) Successful in 6m37s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m17s
Test / test_scrub_pg_size_3 (push) Successful in 1m40s
Test / test_scrub_ec (push) Successful in 34s
Test / test_heal_csum_4k (push) Successful in 7m13s
2023-11-04 15:02:24 +03:00
1f509cca77 Fix unused capture warnings and void* arithmetic (clang)
Some checks failed
Test / test_minsize_1 (push) Successful in 14s
Test / test_snapshot_ec (push) Successful in 40s
Test / test_rm (push) Successful in 16s
Test / test_move_reappear (push) Successful in 18s
Test / test_snapshot_down (push) Successful in 31s
Test / test_snapshot_down_ec (push) Successful in 33s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m31s
Test / test_snapshot_chain_ec (push) Failing after 3m7s
Test / test_rebalance_verify_imm (push) Successful in 3m6s
Test / test_write (push) Successful in 39s
Test / test_rebalance_verify (push) Successful in 4m7s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m23s
Test / test_rebalance_verify_ec (push) Successful in 4m20s
Test / test_write_xor (push) Failing after 3m9s
Test / test_heal_pg_size_2 (push) Successful in 3m55s
Test / test_heal_csum_32k_dmj (push) Successful in 4m44s
Test / test_heal_csum_32k_dj (push) Successful in 6m8s
Test / test_heal_csum_32k (push) Successful in 5m58s
Test / test_heal_ec (push) Failing after 10m16s
Test / test_heal_csum_4k_dmj (push) Successful in 5m57s
Test / test_scrub (push) Successful in 1m8s
Test / test_scrub_zero_osd_2 (push) Successful in 59s
Test / test_scrub_xor (push) Successful in 47s
Test / test_heal_csum_4k_dj (push) Successful in 5m30s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m7s
Test / test_scrub_pg_size_3 (push) Successful in 1m34s
Test / test_heal_csum_4k (push) Successful in 5m21s
Test / test_scrub_ec (push) Successful in 43s
2023-11-04 14:55:12 +03:00
aa8e8e8271 Add version info to --help output
Some checks failed
Test / test_minsize_1 (push) Successful in 16s
Test / test_snapshot_ec (push) Successful in 39s
Test / test_move_reappear (push) Successful in 21s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 32s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m31s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_rebalance_verify_imm (push) Successful in 2m47s
Test / test_rebalance_verify (push) Successful in 3m38s
Test / test_write (push) Successful in 38s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m19s
Test / test_rebalance_verify_ec (push) Successful in 4m6s
Test / test_write_xor (push) Failing after 3m10s
Test / test_heal_pg_size_2 (push) Successful in 3m54s
Test / test_heal_csum_32k_dmj (push) Successful in 5m6s
Test / test_heal_ec (push) Successful in 5m48s
Test / test_heal_csum_32k_dj (push) Successful in 6m22s
Test / test_heal_csum_32k (push) Successful in 6m30s
Test / test_scrub (push) Successful in 1m18s
Test / test_scrub_zero_osd_2 (push) Successful in 1m12s
Test / test_heal_csum_4k_dmj (push) Successful in 6m38s
Test / test_heal_csum_4k_dj (push) Successful in 6m14s
Test / test_scrub_xor (push) Successful in 1m0s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 56s
Test / test_scrub_ec (push) Successful in 51s
Test / test_scrub_pg_size_3 (push) Successful in 1m20s
Test / test_heal_csum_4k (push) Successful in 5m58s
2023-11-04 13:32:12 +03:00
4d79e531c5 Do not print "-X offline" in status when changing pool PG count, print it in df instead
Some checks failed
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m20s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_ec (push) Successful in 36s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 48s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 21s
Test / test_snapshot_chain (push) Successful in 2m21s
Test / test_snapshot_chain_ec (push) Successful in 3m1s
Test / test_rebalance_verify_imm (push) Successful in 3m6s
Test / test_write (push) Successful in 35s
Test / test_rebalance_verify (push) Successful in 3m46s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec (push) Successful in 4m7s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m23s
Test / test_write_xor (push) Failing after 3m8s
Test / test_heal_pg_size_2 (push) Successful in 4m15s
Test / test_heal_csum_32k_dmj (push) Successful in 4m39s
Test / test_heal_ec (push) Successful in 6m35s
Test / test_heal_csum_32k_dj (push) Successful in 6m5s
Test / test_heal_csum_32k (push) Successful in 6m45s
Test / test_scrub (push) Successful in 1m8s
Test / test_heal_csum_4k_dmj (push) Successful in 6m24s
Test / test_scrub_zero_osd_2 (push) Successful in 1m6s
Test / test_scrub_xor (push) Successful in 41s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m18s
Test / test_heal_csum_4k_dj (push) Successful in 6m29s
Test / test_scrub_pg_size_3 (push) Successful in 1m34s
Test / test_heal_csum_4k (push) Successful in 6m7s
Test / test_scrub_ec (push) Successful in 30s
2023-11-04 13:12:13 +03:00
30dff8893f Fix ISA-L version EC recovery with first missing data chunk not being read
Some checks failed
Test / test_snapshot (push) Successful in 44s
Test / test_snapshot_ec (push) Successful in 28s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m20s
Test / test_snapshot_chain_ec (push) Successful in 3m1s
Test / test_rebalance_verify_imm (push) Successful in 2m49s
Test / test_rebalance_verify (push) Successful in 3m37s
Test / test_write (push) Successful in 42s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 54s
Test / test_rebalance_verify_ec (push) Successful in 4m55s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m13s
Test / test_heal_pg_size_2 (push) Successful in 4m4s
Test / test_heal_ec (push) Successful in 5m2s
Test / test_heal_csum_32k_dmj (push) Failing after 5m54s
Test / test_heal_csum_32k_dj (push) Successful in 6m6s
Test / test_heal_csum_32k (push) Successful in 6m59s
Test / test_scrub (push) Successful in 1m16s
Test / test_heal_csum_4k_dmj (push) Successful in 6m56s
Test / test_scrub_xor (push) Successful in 51s
Test / test_scrub_zero_osd_2 (push) Successful in 1m1s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m25s
Test / test_heal_csum_4k (push) Successful in 6m9s
Test / test_heal_csum_4k_dj (push) Successful in 6m33s
Test / test_scrub_pg_size_3 (push) Successful in 1m37s
Test / test_scrub_ec (push) Successful in 26s
(Yes, all EC n + k with k >= 2 users should upgrade as soon as possible)
2023-11-04 01:34:18 +03:00
becf14a705 Add a test for EC with multiple missing data chunks, but without recovery of first of them 2023-11-04 01:34:18 +03:00
64388788c1 Implement CSI volume expansion
Some checks failed
Test / test_snapshot_ec (push) Successful in 35s
Test / test_minsize_1 (push) Successful in 16s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 22s
Test / test_move_reappear (push) Failing after 50s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 19s
Test / test_snapshot_chain (push) Successful in 2m24s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_rebalance_verify_imm (push) Successful in 3m15s
Test / test_write (push) Successful in 41s
Test / test_rebalance_verify (push) Successful in 4m13s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 50s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m28s
Test / test_rebalance_verify_ec (push) Successful in 5m30s
Test / test_heal_pg_size_2 (push) Successful in 4m5s
Test / test_heal_ec (push) Successful in 4m57s
Test / test_heal_csum_32k_dmj (push) Successful in 6m13s
Test / test_heal_csum_32k_dj (push) Successful in 6m10s
Test / test_heal_csum_32k (push) Successful in 6m40s
Test / test_heal_csum_4k_dmj (push) Successful in 6m24s
Test / test_scrub (push) Successful in 1m7s
Test / test_scrub_xor (push) Successful in 47s
Test / test_scrub_zero_osd_2 (push) Successful in 53s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m25s
Test / test_scrub_pg_size_3 (push) Successful in 1m59s
Test / test_heal_csum_4k_dj (push) Successful in 6m10s
Test / test_heal_csum_4k (push) Successful in 6m2s
Test / test_scrub_ec (push) Successful in 40s
2023-11-01 12:46:20 +03:00
37653abe4b Implement CSI volume snapshots 2023-11-01 12:46:20 +03:00
7c054c6f10 Add "id" to df --json output 2023-11-01 12:46:16 +03:00
bb7709e824 Support listening on non-127.0.0.1 in tests 2023-11-01 12:45:27 +03:00
ebeace5a2d Add cmake and pkg-config to debian build depends 2023-11-01 12:45:27 +03:00
a378789f10 Rollback erroneous go.mod changes in 1.1.0 O:-)
All checks were successful
Test / test_snapshot_ec (push) Successful in 30s
Test / test_move_reappear (push) Successful in 19s
Test / test_interrupted_rebalance_ec (push) Successful in 1m52s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 32s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m23s
Test / test_snapshot_chain_ec (push) Successful in 3m2s
Test / test_rebalance_verify_imm (push) Successful in 3m1s
Test / test_rebalance_verify (push) Successful in 3m49s
Test / test_write (push) Successful in 41s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 43s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m15s
Test / test_rebalance_verify_ec (push) Successful in 5m9s
Test / test_heal_pg_size_2 (push) Successful in 4m13s
Test / test_heal_ec (push) Successful in 4m31s
Test / test_heal_csum_32k_dmj (push) Successful in 5m59s
Test / test_heal_csum_32k_dj (push) Successful in 6m14s
Test / test_heal_csum_32k (push) Successful in 6m47s
Test / test_heal_csum_4k_dmj (push) Successful in 6m47s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 1m0s
Test / test_scrub_xor (push) Successful in 52s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m19s
Test / test_scrub_pg_size_3 (push) Successful in 1m52s
Test / test_heal_csum_4k_dj (push) Successful in 6m2s
Test / test_heal_csum_4k (push) Successful in 5m46s
Test / test_scrub_ec (push) Successful in 25s
2023-10-30 18:47:48 +03:00
1fe678e57b Add --no-block to udev rule
Some checks failed
Test / test_minsize_1 (push) Successful in 13s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m0s
Test / test_move_reappear (push) Successful in 21s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 32s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m29s
Test / test_snapshot_chain_ec (push) Successful in 3m4s
Test / test_rebalance_verify_ec_imm (push) Failing after 18s
Test / test_write (push) Successful in 29s
Test / test_rebalance_verify_imm (push) Successful in 2m53s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify (push) Successful in 3m46s
Test / test_write_xor (push) Failing after 3m6s
Test / test_rebalance_verify_ec (push) Successful in 5m1s
Test / test_heal_pg_size_2 (push) Successful in 4m50s
Test / test_heal_ec (push) Successful in 4m34s
Test / test_heal_csum_32k_dmj (push) Successful in 5m5s
Test / test_heal_csum_32k_dj (push) Successful in 5m57s
Test / test_heal_csum_32k (push) Successful in 6m56s
Test / test_heal_csum_4k_dmj (push) Successful in 7m28s
Test / test_scrub (push) Successful in 1m10s
Test / test_scrub_zero_osd_2 (push) Successful in 57s
Test / test_scrub_xor (push) Successful in 53s
Test / test_heal_csum_4k_dj (push) Successful in 6m34s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m8s
Test / test_scrub_pg_size_3 (push) Successful in 1m37s
Test / test_scrub_ec (push) Successful in 41s
Test / test_heal_csum_4k (push) Successful in 6m6s
2023-10-30 12:18:29 +03:00
2e592a2f22 Fix undefined variable "timeout"
Some checks failed
Test / test_snapshot_ec (push) Successful in 44s
Test / test_rm (push) Successful in 17s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m9s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 51s
Test / test_snapshot_down_ec (push) Successful in 25s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m26s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_rebalance_verify_imm (push) Successful in 3m2s
Test / test_write (push) Successful in 35s
Test / test_rebalance_verify (push) Successful in 3m56s
Test / test_write_no_same (push) Successful in 12s
Test / test_write_xor (push) Successful in 38s
Test / test_rebalance_verify_ec (push) Successful in 5m2s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m13s
Test / test_heal_pg_size_2 (push) Successful in 4m17s
Test / test_heal_ec (push) Successful in 5m2s
Test / test_heal_csum_32k_dmj (push) Successful in 5m43s
Test / test_heal_csum_32k_dj (push) Successful in 5m36s
Test / test_heal_csum_32k (push) Successful in 7m4s
Test / test_heal_csum_4k_dmj (push) Successful in 6m47s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 59s
Test / test_scrub_pg_size_3 (push) Successful in 1m26s
Test / test_heal_csum_4k_dj (push) Successful in 6m32s
Test / test_heal_csum_4k (push) Successful in 6m31s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 27s
Test / test_scrub_ec (push) Successful in 25s
Test / test_scrub_xor (push) Failing after 3m7s
2023-10-29 01:30:55 +03:00
b92f644e3a Fix statistics aggregation, calculate inode stats by first deriving per-OSD stats, too 2023-10-29 01:30:55 +03:00
890ea3dbc0 Forgot to add new parameter page to README 2023-10-28 13:39:53 +03:00
06630369bf Plans++ 2023-10-28 13:38:04 +03:00
b4740acf62 Fix operations paused for 0.5-1 second when it happens that io_uring submit is not triggered
Some checks failed
Test / test_snapshot (push) Successful in 24s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_minsize_1 (push) Successful in 15s
Test / test_rm (push) Successful in 17s
Test / test_move_reappear (push) Failing after 48s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m30s
Test / test_snapshot_chain_ec (push) Successful in 3m4s
Test / test_rebalance_verify_imm (push) Successful in 2m41s
Test / test_write (push) Successful in 48s
Test / test_rebalance_verify (push) Successful in 3m42s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m17s
Test / test_rebalance_verify_ec (push) Successful in 4m11s
Test / test_write_xor (push) Failing after 3m8s
Test / test_heal_pg_size_2 (push) Successful in 3m40s
Test / test_heal_csum_32k_dmj (push) Successful in 5m9s
Test / test_heal_ec (push) Successful in 6m31s
Test / test_heal_csum_32k_dj (push) Successful in 6m30s
Test / test_heal_csum_32k (push) Successful in 6m22s
Test / test_scrub (push) Successful in 1m14s
Test / test_scrub_zero_osd_2 (push) Successful in 1m20s
Test / test_heal_csum_4k_dmj (push) Successful in 6m23s
Test / test_scrub_xor (push) Successful in 1m4s
Test / test_heal_csum_4k_dj (push) Successful in 6m2s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 59s
Test / test_scrub_ec (push) Successful in 50s
Test / test_scrub_pg_size_3 (push) Successful in 1m35s
Test / test_heal_csum_4k (push) Successful in 6m1s
2023-10-28 13:18:21 +03:00
eae81bbda6 Fix typo 2023-10-28 01:09:20 +03:00
8222e3c77d Release 1.1.0
Some checks failed
Test / test_interrupted_rebalance_ec (push) Successful in 1m49s
Test / test_snapshot_ec (push) Successful in 38s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 49s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m25s
Test / test_snapshot_chain_ec (push) Successful in 3m5s
Test / test_rebalance_verify_imm (push) Successful in 2m51s
Test / test_write (push) Successful in 34s
Test / test_rebalance_verify (push) Successful in 3m38s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 50s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m3s
Test / test_rebalance_verify_ec (push) Successful in 5m0s
Test / test_heal_pg_size_2 (push) Successful in 4m2s
Test / test_heal_ec (push) Successful in 4m49s
Test / test_heal_csum_32k_dmj (push) Successful in 5m27s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 6m57s
Test / test_heal_csum_4k_dmj (push) Successful in 6m50s
Test / test_scrub (push) Successful in 1m12s
Test / test_scrub_xor (push) Successful in 48s
Test / test_scrub_zero_osd_2 (push) Successful in 54s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m14s
Test / test_heal_csum_4k_dj (push) Successful in 6m32s
Test / test_scrub_pg_size_3 (push) Successful in 1m38s
Test / test_heal_csum_4k (push) Successful in 6m20s
Test / test_scrub_ec (push) Successful in 27s
New features:

- Implement [client writeback cache](docs/config/client.en.md#client_enable_writeback)
- Add the third I/O mode: [O_DIRECT|O_SYNC](docs/config/osd.en.md#data_io) (good for Optane)
- Reduce load on etcd by splitting OSD lease and statistics reporting intervals:
  [etcd_stats_interval](docs/config/osd.en.md#etcd_stats_interval) (default 30 sec)
- Make MON automatically filter OSDs by layout (block_size/immediate_commit/bitmap_granularity)
  to prevent "refusing to start PGs of this pool" errors on misconfiguration
- Support running fio benchmarks on systems without io_uring
- Make QEMU driver compatible with QEMU 8.1
- Document usage of [vhost-user-blk](docs/usage/qemu.en.md#vhost-user-blk)

Bug fixes:

- Fix resizing disks in QEMU driver (for example, in Proxmox)
- Fix "unexpected result" in Proxmox driver by making CLI flush output on exit
- Remove unneeded block_size mismatch warnings on pools without matching PGs
- Fix possible segfault in vitastor-cli ls -l (usually with deleted pools)
- Fix QEMU driver compatibility with systems without io_uring
- Fix monitor eating 100% CPU when etcd is down (caused by infinite retries)
- Fix potential incorrect write processing with snapshots (not caught in tests
  but could probably lead to client hangs)
- Fix buffer insertion in cluster_client (not caught in tests but could
  probably lead to incorrect writes in rare cases)
- Fix rare OSD crash during sync operation processing
- Fix a reenterability issue in cluster_client not reproducible in QEMU/fio,
  but reproducible with the currently developed K/V database implementation
- Fix deletion of the first modified object - OSDs could crash if you modified
  the same object a lot of times, then deleted it, and then modified it again
- Fix the fio_sec_osd test tool
2023-10-28 00:33:06 +03:00
29cbe70e74 Bump qemu version to vitastor4 2023-10-28 00:33:06 +03:00
a883e79507 Make docs to add etcd_stats_interval 2023-10-27 14:09:26 +03:00
be7e76f849 Split etcd_stats_interval out of etcd_report_interval
All checks were successful
Test / test_interrupted_rebalance_ec (push) Successful in 1m46s
Test / test_snapshot_ec (push) Successful in 36s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 29s
Test / test_snapshot_down_ec (push) Successful in 30s
Test / test_splitbrain (push) Successful in 26s
Test / test_snapshot_chain (push) Successful in 2m15s
Test / test_snapshot_chain_ec (push) Successful in 2m57s
Test / test_rebalance_verify_imm (push) Successful in 2m29s
Test / test_rebalance_verify (push) Successful in 3m40s
Test / test_write (push) Successful in 1m0s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 50s
Test / test_rebalance_verify_ec (push) Successful in 4m58s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m14s
Test / test_heal_pg_size_2 (push) Successful in 4m21s
Test / test_heal_ec (push) Successful in 4m5s
Test / test_heal_csum_32k_dmj (push) Successful in 5m36s
Test / test_heal_csum_32k_dj (push) Successful in 6m28s
Test / test_heal_csum_32k (push) Successful in 6m38s
Test / test_heal_csum_4k_dmj (push) Successful in 6m46s
Test / test_scrub_zero_osd_2 (push) Successful in 59s
Test / test_scrub (push) Successful in 1m16s
Test / test_scrub_xor (push) Successful in 53s
Test / test_scrub_pg_size_3 (push) Successful in 1m57s
Test / test_heal_csum_4k_dj (push) Successful in 6m18s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m7s
Test / test_heal_csum_4k (push) Successful in 5m43s
Test / test_scrub_ec (push) Successful in 32s
2023-10-27 01:26:26 +03:00
6fd2cf5df6 Add documentation for the write-back cache 2023-10-27 01:26:26 +03:00
294a754c9e Allow write-back by default in NBD & NFS 2023-10-27 01:26:26 +03:00
8bfea6e7de Support vitastor_c_create_epoll() in fio driver
All checks were successful
Test / test_snapshot_ec (push) Successful in 29s
Test / test_move_reappear (push) Successful in 21s
Test / test_rm (push) Successful in 21s
Test / test_snapshot_down (push) Successful in 27s
Test / test_interrupted_rebalance_ec (push) Successful in 2m29s
Test / test_splitbrain (push) Successful in 20s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m35s
Test / test_rebalance_verify (push) Successful in 3m22s
Test / test_rebalance_verify_imm (push) Successful in 3m22s
Test / test_write (push) Successful in 32s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec (push) Successful in 3m56s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m51s
Test / test_heal_pg_size_2 (push) Successful in 3m21s
Test / test_heal_ec (push) Successful in 5m45s
Test / test_heal_csum_32k_dmj (push) Successful in 5m43s
Test / test_heal_csum_32k_dj (push) Successful in 6m10s
Test / test_heal_csum_32k (push) Successful in 6m29s
Test / test_heal_csum_4k_dmj (push) Successful in 6m22s
Test / test_heal_csum_4k_dj (push) Successful in 5m51s
Test / test_scrub (push) Successful in 36s
Test / test_scrub_zero_osd_2 (push) Successful in 34s
Test / test_heal_csum_4k (push) Successful in 5m35s
Test / test_scrub_xor (push) Successful in 35s
Test / test_write_xor (push) Successful in 1m31s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 31s
Test / test_scrub_pg_size_3 (push) Successful in 42s
Test / test_scrub_ec (push) Successful in 20s
Test / test_snapshot_chain_ec (push) Successful in 1m23s
2023-10-26 22:57:36 +03:00
bac9e34836 Allow to create vitastor_c with plain epoll without uring :-) 2023-10-26 22:57:36 +03:00
8aa4d492c1 Allow to use epoll_manager without ringloop 2023-10-26 22:57:36 +03:00
9336ee5476 Correctly free manual "small vector" in cluster_client %-) 2023-10-26 22:57:36 +03:00
ad30b11519 Add the missing ringloop creation check to vitastor_c_create_uring_json()
Some checks failed
Test / test_minsize_1 (push) Failing after 1s
Test / test_move_reappear (push) Failing after 1s
Test / test_rm (push) Failing after 0s
Test / test_snapshot_chain (push) Failing after 0s
Test / test_snapshot_chain_ec (push) Failing after 1s
Test / test_snapshot_down (push) Failing after 1s
Test / test_snapshot_down_ec (push) Failing after 0s
Test / test_splitbrain (push) Failing after 1s
Test / test_rebalance_verify (push) Failing after 1s
Test / test_rebalance_verify_imm (push) Failing after 1s
Test / test_rebalance_verify_ec (push) Failing after 1s
Test / test_rebalance_verify_ec_imm (push) Failing after 1s
Test / test_write (push) Failing after 1s
Test / test_write_xor (push) Failing after 1s
Test / test_write_no_same (push) Failing after 1s
Test / test_heal_pg_size_2 (push) Failing after 1s
Test / test_heal_ec (push) Failing after 1s
Test / test_heal_csum_32k_dmj (push) Failing after 1s
Test / test_heal_csum_32k_dj (push) Failing after 1s
Test / test_heal_csum_32k (push) Failing after 1s
Test / test_heal_csum_4k_dmj (push) Failing after 1s
Test / test_heal_csum_4k_dj (push) Failing after 1s
Test / test_heal_csum_4k (push) Failing after 1s
Test / test_scrub (push) Failing after 1s
Test / test_scrub_zero_osd_2 (push) Failing after 1s
Test / test_scrub_xor (push) Failing after 1s
Test / test_scrub_pg_size_3 (push) Failing after 1s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Failing after 0s
Test / test_scrub_ec (push) Failing after 1s
Test / build (push) Successful in 2m35s
2023-10-26 18:07:23 +03:00
a061246997 Do not attempt to initialize QEMU driver via vitastor_c_create_qemu_uring()
Some checks failed
Test / test_snapshot_ec (push) Successful in 38s
Test / test_minsize_1 (push) Successful in 18s
Test / test_rm (push) Successful in 17s
Test / test_snapshot_down (push) Failing after 26s
Test / test_move_reappear (push) Failing after 52s
Test / test_snapshot_down_ec (push) Failing after 27s
Test / test_splitbrain (push) Successful in 25s
Test / test_snapshot_chain (push) Successful in 2m28s
Test / test_snapshot_chain_ec (push) Successful in 3m6s
Test / test_rebalance_verify (push) Successful in 3m30s
Test / test_rebalance_verify_imm (push) Successful in 3m29s
Test / test_rebalance_verify_ec (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_scrub_ec (push) Has been cancelled
Test / test_rebalance_verify_ec_imm (push) Has been cancelled
Test / test_write (push) Has been cancelled
It doesn't add any compatibility because vitastor_c_uring_register_eventfd()
is added in the same VITASTOR_C_API_VERSION 2.
2023-10-26 17:46:19 +03:00
5066e35a49 Fix write-over-delete failing for the very first entry in dirty_db
Some checks failed
Test / test_minsize_1 (push) Successful in 17s
Test / test_snapshot_ec (push) Successful in 38s
Test / test_rm (push) Successful in 17s
Test / test_snapshot_down (push) Failing after 27s
Test / test_move_reappear (push) Failing after 53s
Test / test_snapshot_down_ec (push) Failing after 27s
Test / test_splitbrain (push) Successful in 25s
Test / test_snapshot_chain (push) Successful in 2m28s
Test / test_snapshot_chain_ec (push) Successful in 2m59s
Test / test_rebalance_verify (push) Successful in 3m22s
Test / test_rebalance_verify_imm (push) Successful in 3m21s
Test / test_write (push) Successful in 42s
Test / test_write_xor (push) Successful in 48s
Test / test_write_no_same (push) Successful in 14s
Test / test_rebalance_verify_ec (push) Successful in 4m41s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m8s
Test / test_heal_pg_size_2 (push) Successful in 4m17s
Test / test_heal_ec (push) Successful in 4m33s
Test / test_heal_csum_32k_dmj (push) Successful in 5m23s
Test / test_heal_csum_32k_dj (push) Successful in 6m18s
Test / test_heal_csum_32k (push) Successful in 6m44s
Test / test_heal_csum_4k_dmj (push) Successful in 6m53s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 1m5s
Test / test_scrub_xor (push) Successful in 1m1s
Test / test_heal_csum_4k_dj (push) Successful in 6m58s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m2s
Test / test_scrub_pg_size_3 (push) Successful in 1m41s
Test / test_scrub_ec (push) Successful in 37s
Test / test_heal_csum_4k (push) Successful in 6m4s
2023-10-21 17:00:14 +03:00
93dc31f3fc Fix possible segfault in vitastor-cli ls -l
Some checks failed
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m21s
Test / test_snapshot_ec (push) Successful in 32s
Test / test_rm (push) Successful in 17s
Test / test_snapshot_down (push) Failing after 26s
Test / test_move_reappear (push) Failing after 52s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_down_ec (push) Failing after 26s
Test / test_snapshot_chain (push) Successful in 2m22s
Test / test_snapshot_chain_ec (push) Successful in 2m44s
Test / test_rebalance_verify (push) Successful in 3m20s
Test / test_rebalance_verify_imm (push) Successful in 3m20s
Test / test_write (push) Successful in 36s
Test / test_write_no_same (push) Successful in 14s
Test / test_rebalance_verify_ec (push) Successful in 3m59s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m49s
Test / test_write_xor (push) Failing after 3m11s
Test / test_heal_pg_size_2 (push) Failing after 4m40s
Test / test_heal_csum_32k_dmj (push) Successful in 5m13s
Test / test_heal_ec (push) Successful in 5m32s
Test / test_heal_csum_32k_dj (push) Successful in 6m1s
Test / test_heal_csum_32k (push) Successful in 6m54s
Test / test_scrub (push) Successful in 1m28s
Test / test_heal_csum_4k_dmj (push) Successful in 7m1s
Test / test_scrub_zero_osd_2 (push) Successful in 1m7s
Test / test_heal_csum_4k_dj (push) Successful in 7m25s
Test / test_scrub_xor (push) Successful in 59s
Test / test_heal_csum_4k (push) Successful in 6m39s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 59s
Test / test_scrub_pg_size_3 (push) Successful in 1m6s
Test / test_scrub_ec (push) Successful in 35s
2023-10-18 11:11:41 +03:00
f245b56176 Fix another possible reenterability issue in cluster_client
Some checks failed
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m9s
Test / test_snapshot_ec (push) Successful in 42s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Failing after 27s
Test / test_move_reappear (push) Failing after 50s
Test / test_snapshot_down_ec (push) Failing after 26s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m26s
Test / test_snapshot_chain_ec (push) Successful in 2m58s
Test / test_rebalance_verify (push) Successful in 3m26s
Test / test_rebalance_verify_imm (push) Successful in 3m27s
Test / test_write (push) Successful in 42s
Test / test_write_xor (push) Successful in 51s
Test / test_write_no_same (push) Successful in 16s
Test / test_rebalance_verify_ec (push) Successful in 4m56s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m11s
Test / test_heal_pg_size_2 (push) Successful in 4m18s
Test / test_heal_ec (push) Successful in 5m5s
Test / test_heal_csum_32k_dmj (push) Successful in 5m7s
Test / test_heal_csum_32k_dj (push) Successful in 6m14s
Test / test_heal_csum_32k (push) Successful in 6m54s
Test / test_heal_csum_4k_dmj (push) Successful in 6m55s
Test / test_scrub (push) Successful in 1m23s
Test / test_scrub_zero_osd_2 (push) Successful in 1m8s
Test / test_scrub_xor (push) Successful in 1m0s
Test / test_heal_csum_4k_dj (push) Successful in 7m15s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m7s
Test / test_scrub_pg_size_3 (push) Successful in 1m40s
Test / test_scrub_ec (push) Successful in 42s
Test / test_heal_csum_4k (push) Successful in 6m17s
Non-reproducible in QEMU/FIO, only caught during K/V DB debugging
2023-10-08 11:02:53 +03:00
befca06f18 Support any OSD count in test_heal 2023-10-08 11:02:53 +03:00
fbf0263625 Add qemu-storage-daemon to documentation 2023-09-16 18:40:52 +03:00
84 changed files with 1390 additions and 369 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "1.0.0")
set(VERSION "1.2.0")
add_subdirectory(src)

View File

@@ -50,6 +50,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
- Параметры
- [Общие](docs/config/common.ru.md)
- [Сетевые](docs/config/network.ru.md)
- [Клиентский код](docs/config/client.en.md)
- [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
- [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
- [Прочие параметры OSD](docs/config/osd.ru.md)

View File

@@ -50,6 +50,7 @@ Read more details below in the documentation.
- Parameter Reference
- [Common](docs/config/common.en.md)
- [Network](docs/config/network.en.md)
- [Client](docs/config/client.en.md)
- [Global Disk Layout](docs/config/layout-cluster.en.md)
- [OSD Disk Layout](docs/config/layout-osd.en.md)
- [OSD Runtime Parameters](docs/config/osd.en.md)

View File

@@ -1,4 +1,4 @@
VERSION ?= v1.0.0
VERSION ?= v1.2.0
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.0.0
image: vitalif/vitastor-csi:v1.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -35,10 +35,13 @@ rules:
verbs: ["get", "list", "watch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshots"]
verbs: ["get", "list"]
verbs: ["get", "list", "patch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshots/status"]
verbs: ["get", "list", "patch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotcontents"]
verbs: ["create", "get", "list", "watch", "update", "delete"]
verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotclasses"]
verbs: ["get", "list", "watch"]
@@ -53,7 +56,7 @@ rules:
verbs: ["get", "list", "watch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotcontents/status"]
verbs: ["update"]
verbs: ["update", "patch"]
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get"]

View File

@@ -23,6 +23,11 @@ metadata:
name: csi-vitastor-provisioner
spec:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
maxSurge: 0
selector:
matchLabels:
app: csi-vitastor-provisioner
@@ -46,7 +51,7 @@ spec:
priorityClassName: system-cluster-critical
containers:
- name: csi-provisioner
image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
args:
- "--csi-address=$(ADDRESS)"
- "--v=5"
@@ -116,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.0.0
image: vitalif/vitastor-csi:v1.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -17,3 +17,4 @@ parameters:
# multiple etcdUrls may be specified, delimited by comma
#etcdUrl: "http://192.168.7.2:2379"
#etcdPrefix: "/vitastor"
allowVolumeExpansion: true

View File

@@ -0,0 +1,7 @@
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshotClass
metadata:
name: vitastor-snapclass
driver: csi.vitastor.io
deletionPolicy: Delete
parameters:

View File

@@ -0,0 +1,16 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: test-vitastor-clone
spec:
storageClassName: vitastor
dataSource:
name: snap1
kind: VolumeSnapshot
apiGroup: snapshot.storage.k8s.io
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi

View File

@@ -0,0 +1,8 @@
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
name: snap1
spec:
volumeSnapshotClassName: vitastor-snapclass
source:
persistentVolumeClaimName: test-vitastor-pvc

View File

@@ -9,6 +9,7 @@ require (
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/grpc v1.33.1
google.golang.org/protobuf v1.24.0
k8s.io/klog v1.0.0
k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
)

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.0.0"
vitastorCSIDriverVersion = "1.2.0"
)
// Config struct fills the parameters of request or user input

View File

@@ -20,6 +20,7 @@ import (
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/timestamppb"
"github.com/container-storage-interface/spec/lib/go/csi"
)
@@ -45,6 +46,7 @@ type InodeConfig struct
ParentPool uint64 `json:"parent_pool,omitempty"`
ParentId uint64 `json:"parent_id,omitempty"`
Readonly bool `json:"readonly,omitempty"`
CreateTs uint64 `json:"create_ts,omitempty"`
}
type ControllerServer struct
@@ -178,27 +180,43 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
// Support creation from snapshot
var src *csi.VolumeContentSource
if (req.VolumeContentSource.GetSnapshot() != nil)
{
snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
if (snapId != "")
{
snapVars := make(map[string]string)
err := json.Unmarshal([]byte(snapId), &snapVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"])
src = &csi.VolumeContentSource{
Type: &csi.VolumeContentSource_Snapshot{
Snapshot: &csi.VolumeContentSource_SnapshotSource{
SnapshotId: snapId,
},
},
}
}
}
// Create image using vitastor-cli
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
_, err := invokeCLI(ctxVars, args)
if (err != nil)
{
if (strings.Index(err.Error(), "already exists") > 0)
{
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
inodeCfg, err := invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
var inodeCfg []InodeConfig
err = json.Unmarshal(stat, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
}
if (len(inodeCfg) == 0)
{
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
}
if (inodeCfg[0].Size < uint64(volSize))
{
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
@@ -217,6 +235,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
// Ugly, but VolumeContext isn't passed to DeleteVolume :-(
VolumeId: string(volumeIdJson),
CapacityBytes: volSize,
ContentSource: src,
},
}, nil
}
@@ -230,15 +249,15 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := ctxVars["name"]
volName := volVars["name"]
ctxVars, _, _ = GetConnectionParams(ctxVars)
ctxVars, _, _ := GetConnectionParams(volVars)
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
if (err != nil)
@@ -344,6 +363,8 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
// TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
} {
controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
}
@@ -353,28 +374,214 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
}, nil
}
func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
{
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
if (err != nil)
{
return nil, err
}
var inodeCfg []InodeConfig
err = json.Unmarshal(stat, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
}
if (expectExist && len(inodeCfg) == 0)
{
return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
}
return inodeCfg, nil
}
// CreateSnapshot create snapshot of an existing PV
func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
{
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
}
if (req.SourceVolumeId == "" || req.Name == "")
{
return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
}
// snapshot name
snapName := req.Name
// req.VolumeId is an ugly json string in our case :)
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := ctxVars["name"]
// Create image using vitastor-cli
_, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
if (err != nil && strings.Index(err.Error(), "already exists") <= 0)
{
return nil, err
}
// Check created snapshot
inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
if (err != nil)
{
return nil, err
}
// Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
ctxVars["snapshot"] = snapName
snapIdJson, _ := json.Marshal(ctxVars)
return &csi.CreateSnapshotResponse{
Snapshot: &csi.Snapshot{
SizeBytes: int64(inodeCfg[0].Size),
SnapshotId: string(snapIdJson),
SourceVolumeId: req.SourceVolumeId,
CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
ReadyToUse: true,
},
}, nil
}
// DeleteSnapshot delete provided snapshot of a PV
func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
{
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
}
if (req.SnapshotId == "")
{
return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
}
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
}
volName := volVars["name"]
snapName := volVars["snapshot"]
ctxVars, _, _ := GetConnectionParams(volVars)
_, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
if (err != nil)
{
return nil, err
}
return &csi.DeleteSnapshotResponse{}, nil
}
// ListSnapshots list the snapshots of a PV
func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
{
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
ctxVars, _, _ := GetConnectionParams(volVars)
inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
if (err != nil)
{
return nil, err
}
resp := &csi.ListSnapshotsResponse{}
for _, ino := range inodeCfg
{
snapName := ino.Name[len(volName)+1:]
if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
{
}
else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
{
volVars["snapshot"] = snapName
snapIdJson, _ := json.Marshal(volVars)
resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
Snapshot: &csi.Snapshot{
SizeBytes: int64(ino.Size),
SnapshotId: string(snapIdJson),
SourceVolumeId: req.SourceVolumeId,
CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
ReadyToUse: true,
},
})
}
else
{
resp.NextToken = snapName
break
}
}
return resp, nil
}
// ControllerExpandVolume resizes a volume
// ControllerExpandVolume increases the size of a volume
func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
{
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
{
return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
}
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
ctxVars, _, _ := GetConnectionParams(volVars)
inodeCfg, err := invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
{
sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096
_, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
if (err != nil)
{
return nil, err
}
inodeCfg, err = invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
}
return &csi.ControllerExpandVolumeResponse{
CapacityBytes: int64(inodeCfg[0].Size),
NodeExpansionRequired: false,
}, nil
}
// ControllerGetVolume get volume info

View File

@@ -49,6 +49,13 @@ func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.Ge
},
},
},
{
Type: &csi.PluginCapability_VolumeExpansion_{
VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
},
},
},
},
}, nil
}

View File

@@ -70,10 +70,10 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that it's not already mounted
_, error := mount.IsNotMountPoint(ns.mounter, targetPath)
if (error != nil)
_, err := mount.IsNotMountPoint(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(error))
if (os.IsNotExist(err))
{
if (isBlock)
{
@@ -102,12 +102,12 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
}
else
{
return nil, status.Error(codes.Internal, error.Error())
return nil, status.Error(codes.Internal, err.Error())
}
}
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
@@ -147,70 +147,74 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
}
devicePath := strings.TrimSpace(stdoutStr)
// Check existing format
diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
existingFormat, err := diskMounter.GetDiskFormat(devicePath)
if (err != nil)
{
klog.Errorf("failed to get disk format for path %s, error: %v", err)
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, err
}
// Format the device (ext4 or xfs)
fsType := req.GetVolumeCapability().GetMount().GetFsType()
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
opt = append(opt, "_netdev")
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
!Contains(opt, "ro"))
{
opt = append(opt, "ro")
}
if (fsType == "xfs")
{
opt = append(opt, "nouuid")
}
readOnly := Contains(opt, "ro")
if (existingFormat == "" && !readOnly)
{
args := []string{}
switch fsType
{
case "ext4":
args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
case "xfs":
args = []string{"-K", devicePath}
}
if (len(args) > 0)
{
cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
if (cmdErr != nil)
{
klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, cmdErr.Error())
}
}
}
if (isBlock)
{
opt = append(opt, "bind")
err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
}
else
{
// Check existing format
existingFormat, err := diskMounter.GetDiskFormat(devicePath)
if (err != nil)
{
klog.Errorf("failed to get disk format for path %s, error: %v", err)
goto unmap
}
// Format the device (ext4 or xfs)
fsType := req.GetVolumeCapability().GetMount().GetFsType()
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
opt = append(opt, "_netdev")
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
!Contains(opt, "ro"))
{
opt = append(opt, "ro")
}
if (fsType == "xfs")
{
opt = append(opt, "nouuid")
}
readOnly := Contains(opt, "ro")
if (existingFormat == "" && !readOnly)
{
var cmdOut []byte
switch fsType
{
case "ext4":
args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
case "xfs":
cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
}
if (err != nil)
{
klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
goto unmap
}
}
err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
// Try to run online resize on mount.
// FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
if (err == nil && existingFormat != "" && !readOnly)
{
var cmdOut []byte
switch (fsType)
{
case "ext4":
cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
case "xfs":
cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
}
if (err != nil)
{
klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
goto unmap
}
}
}
if (err != nil)
{
@@ -218,15 +222,18 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
"failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
devicePath, targetPath, volName, err,
)
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, err.Error())
goto unmap
}
return &csi.NodePublishVolumeResponse{}, nil
unmap:
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, err.Error())
}
// NodeUnpublishVolume unmounts the volume from the target path

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (1.0.0-1) unstable; urgency=medium
vitastor (1.2.0-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (1.0.0-1) unstable; urgency=medium
vitastor (1.2.0-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

2
debian/control vendored
View File

@@ -2,7 +2,7 @@ Source: vitastor
Section: admin
Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
Standards-Version: 4.5.0
Homepage: https://vitastor.io/
Rules-Requires-Root: no

View File

@@ -54,7 +54,8 @@ RUN set -e; \
quilt add block/vitastor.c; \
cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
quilt refresh; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor3; \
V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/qemu-$REL/qemu-*/

View File

@@ -35,8 +35,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-1.0.0; \
cd vitastor-1.0.0; \
cp -r /root/vitastor vitastor-1.2.0; \
cd vitastor-1.2.0; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.0.0.orig.tar.xz vitastor-1.0.0; \
cd vitastor-1.0.0; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.2.0.orig.tar.xz vitastor-1.2.0; \
cd vitastor-1.2.0; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -33,6 +33,7 @@ In the future, additional configuration methods may be added:
- [Common](config/common.en.md)
- [Network](config/network.en.md)
- [Client](config/client.en.md)
- [Global Disk Layout](config/layout-cluster.en.md)
- [OSD Disk Layout](config/layout-osd.en.md)
- [OSD Runtime Parameters](config/osd.en.md)

View File

@@ -36,6 +36,7 @@
- [Общие](config/common.ru.md)
- [Сеть](config/network.ru.md)
- [Клиентский код](config/client.ru.md)
- [Глобальные дисковые параметры](config/layout-cluster.ru.md)
- [Дисковые параметры OSD](config/layout-osd.ru.md)
- [Прочие параметры OSD](config/osd.ru.md)

103
docs/config/client.en.md Normal file
View File

@@ -0,0 +1,103 @@
[Documentation](../../README.md#documentation) → [Configuration](../config.en.md) → Client Parameters
-----
[Читать на русском](client.ru.md)
# Client Parameters
These parameters apply only to clients and affect their interaction with
the cluster.
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
## client_max_dirty_bytes
- Type: integer
- Default: 33554432
- Can be changed online: yes
Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
## client_max_dirty_ops
- Type: integer
- Default: 1024
- Can be changed online: yes
Same as client_max_dirty_bytes, but instead of total size, limits the number
of uncommitted write operations.
## client_enable_writeback
- Type: boolean
- Default: false
- Can be changed online: yes
This parameter enables client-side write buffering. This means that write
requests are accumulated in memory for a short time before being sent to
a Vitastor cluster which allows to send them in parallel and increase
performance of some applications. Writes are buffered until client forces
a flush with fsync() or until the amount of buffered writes exceeds the
limit.
Write buffering significantly increases performance of some applications,
for example, CrystalDiskMark under Windows (LOL :-D), but also any other
applications if they do writes in one of two non-optimal ways: either if
they do a lot of small (4 kb or so) sequential writes, or if they do a lot
of small random writes, but without any parallelism or asynchrony, and also
without calling fsync().
With write buffering enabled, you can expect around 22000 T1Q1 random write
iops in QEMU more or less regardless of the quality of your SSDs, and this
number is in fact bound by QEMU itself rather than Vitastor (check it
yourself by adding a "driver=null-co" disk in QEMU). Without write
buffering, the current record is 9900 iops, but the number is usually
even lower with non-ideal hardware, for example, it may be 5000 iops.
Even when this parameter is enabled, write buffering isn't enabled until
the client explicitly allows it, because enabling it without the client
being aware of the fact that his writes may be buffered may lead to data
loss. Because of this, older versions of clients don't support write
buffering at all, newer versions of the QEMU driver allow write buffering
only if it's enabled in disk settings with `-blockdev cache.direct=false`,
and newer versions of FIO only allow write buffering if you don't specify
`-direct=1`. NBD and NFS drivers allow write buffering by default.
You can overcome this restriction too with the `client_writeback_allowed`
parameter, but you shouldn't do that unless you **really** know what you
are doing.
## client_max_buffered_bytes
- Type: integer
- Default: 33554432
- Can be changed online: yes
Maximum total size of buffered writes which triggers write-back when reached.
## client_max_buffered_ops
- Type: integer
- Default: 1024
- Can be changed online: yes
Maximum number of buffered writes which triggers write-back when reached.
Multiple consecutive modified data regions are counted as 1 write here.
## client_max_writeback_iodepth
- Type: integer
- Default: 256
- Can be changed online: yes
Maximum number of parallel writes when flushing buffered data to the server.

103
docs/config/client.ru.md Normal file
View File

@@ -0,0 +1,103 @@
[Документация](../../README-ru.md#документация) → [Конфигурация](../config.ru.md) → Параметры клиентского кода
-----
[Read in English](client.en.md)
# Параметры клиентского кода
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
затрагивают логику их работы с кластером.
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
## client_max_dirty_bytes
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
## client_max_dirty_ops
- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да
Аналогично client_max_dirty_bytes, но ограничивает количество
незафиксированных операций записи вместо их общего объёма.
## client_enable_writeback
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
означает, что операции записи отправляются на кластер Vitastor не сразу, а
могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
пока клиент не вызовет fsync.
Буферизация значительно повышает производительность некоторых приложений,
например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
есть, например, отправляя 128 операций записи в разные места диска, но не
все сразу с помощью асинхронного I/O, а по одной.
В QEMU с буферизацией записи можно ожидать показателя примерно 22000
операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
в секунду.
При этом, даже если данный параметр включён, буферизация не включается, если
явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
буферизуются, это может приводить к потере данных. Поэтому в старых версиях
клиентских драйверов буферизация записи не включается вообще, в новых
версиях QEMU-драйвера включается, только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
но делать так не надо, если только вы не уверены в том, что делаете, на все
100%. :-)
## client_max_buffered_bytes
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
Максимальный общий размер буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер.
## client_max_buffered_ops
- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да
Максимальное количество буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер. При этом несколько
последовательных изменённых областей здесь считаются 1 записью.
## client_max_writeback_iodepth
- Тип: целое число
- Значение по умолчанию: 256
- Можно менять на лету: да
Максимальное число параллельных операций записи при сбросе буферов на сервер.

View File

@@ -30,7 +30,6 @@ between clients, OSDs and etcd.
- [etcd_slow_timeout](#etcd_slow_timeout)
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [client_dirty_limit](#client_dirty_limit)
## tcp_header_buffer_size
@@ -240,17 +239,3 @@ etcd_report_interval to guarantee that keepalive actually works.
etcd websocket ping interval required to keep the connection alive and
detect disconnections quickly.
## client_dirty_limit
- Type: integer
- Default: 33554432
- Can be changed online: yes
Without immediate_commit=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
This parameter doesn't affect OSDs themselves.

View File

@@ -30,7 +30,6 @@
- [etcd_slow_timeout](#etcd_slow_timeout)
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [client_dirty_limit](#client_dirty_limit)
## tcp_header_buffer_size
@@ -251,17 +250,3 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
- Можно менять на лету: да
Интервал проверки живости вебсокет-подключений к etcd.
## client_dirty_limit
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
При работе без immediate_commit=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
Параметр не влияет на сами OSD.

View File

@@ -11,6 +11,7 @@ initialization and can be changed - either with an OSD restart or, for some of
them, even without restarting by updating configuration in etcd.
- [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
- [run_primary](#run_primary)
- [osd_network](#osd_network)
- [bind_address](#bind_address)
@@ -56,11 +57,21 @@ them, even without restarting by updating configuration in etcd.
- Type: seconds
- Default: 5
Interval at which OSDs report their state to etcd. Affects OSD lease time
Interval at which OSDs report their liveness to etcd. Affects OSD lease time
and thus the failover speed. Lease time is equal to this parameter value
plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
that every OSD always refreshes its lease in time.
## etcd_stats_interval
- Type: seconds
- Default: 30
Interval at which OSDs report their statistics to etcd. Highly affects the
imposed load on etcd, because statistics include a key for every OSD and
for every PG. At the same time, low statistic intervals make `vitastor-cli`
statistics more responsive.
## run_primary
- Type: boolean

View File

@@ -12,6 +12,7 @@
изменения конфигурации в etcd.
- [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
- [run_primary](#run_primary)
- [osd_network](#osd_network)
- [bind_address](#bind_address)
@@ -57,11 +58,21 @@
- Тип: секунды
- Значение по умолчанию: 5
Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому на скорость переключения
Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому - на скорость переключения
при падении OSD. Время lease равняется значению этого параметра плюс
max_etcd_attempts * etcd_quick_timeout.
## etcd_stats_interval
- Тип: секунды
- Значение по умолчанию: 30
Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
каждый OSD и на каждую PG. В то же время низкий интервал делает
статистику, печатаемую `vitastor-cli`, отзывчивей.
## run_primary
- Тип: булево (да/нет)

View File

@@ -0,0 +1,4 @@
# Client Parameters
These parameters apply only to clients and affect their interaction with
the cluster.

View File

@@ -0,0 +1,4 @@
# Параметры клиентского кода
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
затрагивают логику их работы с кластером.

124
docs/config/src/client.yml Normal file
View File

@@ -0,0 +1,124 @@
- name: client_max_dirty_bytes
type: int
default: 33554432
online: true
info: |
Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
info_ru: |
При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
- name: client_max_dirty_ops
type: int
default: 1024
online: true
info: |
Same as client_max_dirty_bytes, but instead of total size, limits the number
of uncommitted write operations.
info_ru: |
Аналогично client_max_dirty_bytes, но ограничивает количество
незафиксированных операций записи вместо их общего объёма.
- name: client_enable_writeback
type: bool
default: false
online: true
info: |
This parameter enables client-side write buffering. This means that write
requests are accumulated in memory for a short time before being sent to
a Vitastor cluster which allows to send them in parallel and increase
performance of some applications. Writes are buffered until client forces
a flush with fsync() or until the amount of buffered writes exceeds the
limit.
Write buffering significantly increases performance of some applications,
for example, CrystalDiskMark under Windows (LOL :-D), but also any other
applications if they do writes in one of two non-optimal ways: either if
they do a lot of small (4 kb or so) sequential writes, or if they do a lot
of small random writes, but without any parallelism or asynchrony, and also
without calling fsync().
With write buffering enabled, you can expect around 22000 T1Q1 random write
iops in QEMU more or less regardless of the quality of your SSDs, and this
number is in fact bound by QEMU itself rather than Vitastor (check it
yourself by adding a "driver=null-co" disk in QEMU). Without write
buffering, the current record is 9900 iops, but the number is usually
even lower with non-ideal hardware, for example, it may be 5000 iops.
Even when this parameter is enabled, write buffering isn't enabled until
the client explicitly allows it, because enabling it without the client
being aware of the fact that his writes may be buffered may lead to data
loss. Because of this, older versions of clients don't support write
buffering at all, newer versions of the QEMU driver allow write buffering
only if it's enabled in disk settings with `-blockdev cache.direct=false`,
and newer versions of FIO only allow write buffering if you don't specify
`-direct=1`. NBD and NFS drivers allow write buffering by default.
You can overcome this restriction too with the `client_writeback_allowed`
parameter, but you shouldn't do that unless you **really** know what you
are doing.
info_ru: |
Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
означает, что операции записи отправляются на кластер Vitastor не сразу, а
могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
пока клиент не вызовет fsync.
Буферизация значительно повышает производительность некоторых приложений,
например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
есть, например, отправляя 128 операций записи в разные места диска, но не
все сразу с помощью асинхронного I/O, а по одной.
В QEMU с буферизацией записи можно ожидать показателя примерно 22000
операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
в секунду.
При этом, даже если данный параметр включён, буферизация не включается, если
явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
буферизуются, это может приводить к потере данных. Поэтому в старых версиях
клиентских драйверов буферизация записи не включается вообще, в новых
версиях QEMU-драйвера включается, только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
но делать так не надо, если только вы не уверены в том, что делаете, на все
100%. :-)
- name: client_max_buffered_bytes
type: int
default: 33554432
online: true
info: |
Maximum total size of buffered writes which triggers write-back when reached.
info_ru: |
Максимальный общий размер буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер.
- name: client_max_buffered_ops
type: int
default: 1024
online: true
info: |
Maximum number of buffered writes which triggers write-back when reached.
Multiple consecutive modified data regions are counted as 1 write here.
info_ru: |
Максимальное количество буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер. При этом несколько
последовательных изменённых областей здесь считаются 1 записью.
- name: client_max_writeback_iodepth
type: int
default: 256
online: true
info: |
Maximum number of parallel writes when flushing buffered data to the server.
info_ru: |
Максимальное число параллельных операций записи при сбросе буферов на сервер.

View File

@@ -28,6 +28,8 @@
{{../../config/network.en.md|indent=2}}
{{../../config/client.en.md|indent=2}}
{{../../config/layout-cluster.en.md|indent=2}}
{{../../config/layout-osd.en.md|indent=2}}

View File

@@ -28,6 +28,8 @@
{{../../config/network.ru.md|indent=2}}
{{../../config/client.ru.md|indent=2}}
{{../../config/layout-cluster.ru.md|indent=2}}
{{../../config/layout-osd.ru.md|indent=2}}

View File

@@ -259,23 +259,3 @@
detect disconnections quickly.
info_ru: |
Интервал проверки живости вебсокет-подключений к etcd.
- name: client_dirty_limit
type: int
default: 33554432
online: true
info: |
Without immediate_commit=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
This parameter doesn't affect OSDs themselves.
info_ru: |
При работе без immediate_commit=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
Параметр не влияет на сами OSD.

View File

@@ -2,15 +2,28 @@
type: sec
default: 5
info: |
Interval at which OSDs report their state to etcd. Affects OSD lease time
Interval at which OSDs report their liveness to etcd. Affects OSD lease time
and thus the failover speed. Lease time is equal to this parameter value
plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
that every OSD always refreshes its lease in time.
info_ru: |
Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому на скорость переключения
Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому - на скорость переключения
при падении OSD. Время lease равняется значению этого параметра плюс
max_etcd_attempts * etcd_quick_timeout.
- name: etcd_stats_interval
type: sec
default: 30
info: |
Interval at which OSDs report their statistics to etcd. Highly affects the
imposed load on etcd, because statistics include a key for every OSD and
for every PG. At the same time, low statistic intervals make `vitastor-cli`
statistics more responsive.
info_ru: |
Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
каждый OSD и на каждую PG. В то же время низкий интервал делает
статистику, печатаемую `vitastor-cli`, отзывчивей.
- name: run_primary
type: bool
default: true

View File

@@ -17,4 +17,15 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
for i in ./???-*.yaml; do kubectl apply -f $i; done
```
After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
After that you'll be able to create PersistentVolumes.
## Features
Vitastor CSI supports:
- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
- Volume expansion
- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).

View File

@@ -17,4 +17,15 @@
for i in ./???-*.yaml; do kubectl apply -f $i; done
```
После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
После этого вы сможете создавать PersistentVolume.
## Возможности
CSI-плагин Vitastor поддерживает:
- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
- Расширение размера томов
- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).

View File

@@ -31,6 +31,7 @@
- [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
- [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
- [Checksums](../config/layout-osd.en.md#data_csum_type)
- [Client write-back cache](../config/client.en.md#client_enable_writeback)
## Plugins and tools
@@ -50,13 +51,15 @@
The following features are planned for the future:
- File system
- Control plane optimisation
- Other administrative tools
- Web GUI
- OpenNebula plugin
- iSCSI proxy
- iSCSI and NVMeoF gateways
- Multi-threaded client
- Faster failover
- S3
- Tiered storage (SSD caching)
- NVDIMM support
- Compression (possibly)
- Read caching using system page cache (possibly)

View File

@@ -33,6 +33,7 @@
- [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
- [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
- [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
## Драйверы и инструменты
@@ -50,12 +51,15 @@
## Планы развития
- Файловая система
- Оптимизация слоя управления
- Другие инструменты администрирования
- Web-интерфейс
- Плагин для OpenNebula
- iSCSI-прокси
- iSCSI и NVMeoF прокси
- Многопоточный клиент
- Более быстрое переключение при отказах
- S3
- Поддержка SSD-кэширования (tiered storage)
- Поддержка NVDIMM
- Возможно, сжатие

View File

@@ -34,6 +34,20 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
-vnc 0.0.0.0:0
```
With a separate I/O thread:
```
qemu-system-x86_64 -enable-kvm -m 1024 \
-object iothread,id=vitastor1 \
-blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
"cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
id=virtio-disk0,bootindex=1,write-cache=off' \
-vnc 0.0.0.0:0
```
You can also specify inode ID, pool and size manually instead of `:image=<IMAGE>` option: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
## qemu-img
For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as filename.
@@ -84,6 +98,29 @@ This can be used for backups. Just note that exporting an image that is currentl
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
on a live VM.
## vhost-user-blk
QEMU, starting with 6.0, includes support for attaching disks via a separate
userspace worker process, called `vhost-user-blk`. It usually has slightly (20-30 us)
lower latency.
Example commands to use it with Vitastor:
```
qemu-storage-daemon \
--daemonize \
--blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
--export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
-object memory-backend-memfd,id=mem,size=2G,share=on \
-chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
-device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
-vnc 0.0.0.0:0
```
memfd memory-backend is crucial, vhost-user-blk does not work without it.
## VDUSE
Linux kernel, starting with version 5.15, supports a new interface for attaching virtual disks

View File

@@ -36,6 +36,18 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
-vnc 0.0.0.0:0
```
С отдельным потоком ввода-вывода:
```
qemu-system-x86_64 -enable-kvm -m 1024 \
-object iothread,id=vitastor1 \
-blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
"cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
id=virtio-disk0,bootindex=1,write-cache=off' \
-vnc 0.0.0.0:0
```
Вместо `:image=<IMAGE>` также можно указывать номер инода, пул и размер: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
## qemu-img
@@ -88,6 +100,29 @@ qemu-img rebase -u -b '' testimg.qcow2
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
## vhost-user-blk
QEMU, начиная с 6.0, позволяет подключать диски через отдельный рабочий процесс.
Этот метод подключения называется `vhost-user-blk` и обычно имеет чуть меньшую
задержку (ниже на 20-30 микросекунд, чем при обычном методе).
Пример команд для использования vhost-user-blk с Vitastor:
```
qemu-storage-daemon \
--daemonize \
--blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
--export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
-object memory-backend-memfd,id=mem,size=2G,share=on \
-chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
-device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
-vnc 0.0.0.0:0
```
Здесь критична опция memory-backend-memfd, vhost-user-blk без неё не работает.
## VDUSE
В Linux, начиная с версии ядра 5.15, доступен новый интерфейс для подключения виртуальных дисков

View File

@@ -3,5 +3,5 @@ SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de130599
IMPORT{program}="/usr/bin/vitastor-disk udev $devnode", \
SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"

View File

@@ -99,6 +99,7 @@ const etcd_tree = {
etcd_ws_keepalive_interval: 30, // seconds
// osd
etcd_report_interval: 5, // seconds
etcd_stats_interval: 30, // seconds
run_primary: true,
osd_network: null, // "192.168.7.0/24" or an array of masks
bind_address: "0.0.0.0",
@@ -396,12 +397,13 @@ class Mon
this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
this.prev_stats = { osd_stats: {}, osd_diff: {} };
this.signals_set = false;
this.stat_time = Date.now();
this.ws = null;
this.ws_alive = false;
this.ws_keepalive_timer = null;
this.on_stop_cb = () => this.on_stop(0).catch(console.error);
this.recheck_pgs_active = false;
}
parse_etcd_addresses(addrs)
@@ -551,9 +553,9 @@ class Mon
const cur_addr = this.pick_next_etcd();
const base = 'ws'+cur_addr.substr(4);
let now = Date.now();
if (tried[base] && now-tried[base] < timeout)
if (tried[base] && now-tried[base] < this.etcd_start_timeout)
{
await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
await new Promise(ok => setTimeout(ok, this.etcd_start_timeout-(now-tried[base])));
now = Date.now();
}
tried[base] = now;
@@ -691,8 +693,27 @@ class Mon
});
}
// Schedule save_last_clean() to to run after a small timeout (1s) (to not spam etcd)
schedule_save_last_clean()
{
if (!this.save_last_clean_timer)
{
this.save_last_clean_timer = setTimeout(() =>
{
this.save_last_clean_timer = null;
this.save_last_clean().catch(this.die);
}, this.config.mon_change_timeout || 1000);
}
}
async save_last_clean()
{
if (this.save_last_clean_running)
{
this.schedule_save_last_clean();
return;
}
this.save_last_clean_running = true;
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
const new_clean_pgs = { items: {} };
next_pool:
@@ -729,6 +750,7 @@ class Mon
value: b64(JSON.stringify(this.state.history.last_clean_pgs))
} } ],
}, this.etcd_start_timeout, 0);
this.save_last_clean_running = false;
}
get_mon_state()
@@ -1202,6 +1224,12 @@ class Mon
async recheck_pgs()
{
if (this.recheck_pgs_active)
{
this.schedule_recheck();
return;
}
this.recheck_pgs_active = true;
// Take configuration and state, check it against the stored configuration hash
// Recalculate PGs and save them to etcd if the configuration is changed
// FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
@@ -1223,6 +1251,7 @@ class Mon
// Pool deleted. Delete all PGs, but first stop them.
if (!await this.stop_all_pgs(pool_id))
{
this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
@@ -1291,9 +1320,16 @@ class Mon
// PG count changed. Need to bring all PGs down.
if (!await this.stop_all_pgs(pool_id))
{
this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
}
if (prev_pgs.length != pool_cfg.pg_count)
{
// Scale PG count
// Do it even if old_pg_count is already equal to pool_cfg.pg_count,
// because last_clean_pgs may still contain the old number of PGs
const new_pg_history = [];
PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
pg_history = new_pg_history;
@@ -1395,6 +1431,7 @@ class Mon
await this.save_pg_config(new_config_pgs);
}
}
this.recheck_pgs_active = false;
}
async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
@@ -1444,7 +1481,6 @@ class Mon
}
// Schedule a recheck to run after a small timeout (1s)
// If already scheduled, cancel previous timer and schedule it again
// This is required for multiple change events to trigger at most 1 recheck in 1s
schedule_recheck()
{
@@ -1458,15 +1494,15 @@ class Mon
}
}
derive_osd_stats(st, prev)
derive_osd_stats(st, prev, prev_diff)
{
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
if (!st || !st.time || prev && (prev.time || this.stat_time/1000) >= st.time)
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
if (!st || !st.time || !prev || prev.time >= st.time)
{
return diff;
return prev_diff || diff;
}
const timediff = BigInt(st.time*1000 - (prev && prev.time*1000 || this.stat_time));
const timediff = BigInt(st.time*1000 - prev.time*1000);
for (const op in st.op_stats||{})
{
const pr = prev && prev.op_stats && prev.op_stats[op];
@@ -1498,25 +1534,47 @@ class Mon
if (n > 0)
diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
}
for (const pool_id in st.inode_stats||{})
{
const pool_diff = diff.inode_stats[pool_id] = {};
for (const inode_num in st.inode_stats[pool_id])
{
const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
for (const op of [ 'read', 'write', 'delete' ])
{
const c = st.inode_stats[pool_id][inode_num][op];
const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
inode_diff[op] = {
bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
iops: n*1000n/timediff,
lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
};
}
}
}
return diff;
}
sum_op_stats(timestamp, prev_stats)
sum_op_stats()
{
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
if (!prev_stats || prev_stats.timestamp >= timestamp)
for (const osd in this.state.osd.stats)
{
return sum_diff;
const cur = { ...this.state.osd.stats[osd], inode_stats: this.state.osd.inodestats[osd]||{} };
this.prev_stats.osd_diff[osd] = this.derive_osd_stats(
cur, this.prev_stats.osd_stats[osd], this.prev_stats.osd_diff[osd]
);
this.prev_stats.osd_stats[osd] = cur;
}
const tm = BigInt(timestamp - (prev_stats.timestamp || 0));
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
// Sum derived values instead of deriving summed
for (const osd in this.state.osd.stats)
{
const derived = this.derive_osd_stats(this.state.osd.stats[osd],
this.prev_stats && this.prev_stats.osd_stats && this.prev_stats.osd_stats[osd]);
for (const type in derived)
const derived = this.prev_stats.osd_diff[osd];
for (const type in sum_diff)
{
for (const op in derived[type])
for (const op in derived[type]||{})
{
for (const k in derived[type][op])
{
@@ -1573,14 +1631,14 @@ class Mon
return { object_counts, object_bytes };
}
sum_inode_stats(prev_stats, timestamp, prev_timestamp)
sum_inode_stats()
{
const inode_stats = {};
const inode_stub = () => ({
raw_used: 0n,
read: { count: 0n, usec: 0n, bytes: 0n },
write: { count: 0n, usec: 0n, bytes: 0n },
delete: { count: 0n, usec: 0n, bytes: 0n },
read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
});
const seen_pools = {};
for (const pool_id in this.state.config.pools)
@@ -1632,11 +1690,25 @@ class Mon
}
}
}
if (prev_stats && prev_timestamp >= timestamp)
for (const osd in this.prev_stats.osd_diff)
{
prev_stats = null;
for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
{
for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
{
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
for (const op of [ 'read', 'write', 'delete' ])
{
const op_diff = this.prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
const op_st = inode_stats[pool_id][inode_num][op];
op_st.bps += op_diff.bps;
op_st.iops += op_diff.iops;
op_st.lat += op_diff.lat;
op_st.n_osd = (op_st.n_osd || 0) + 1;
}
}
}
}
const tm = prev_stats ? BigInt(timestamp - prev_timestamp) : 0;
for (const pool_id in inode_stats)
{
for (const inode_num in inode_stats[pool_id])
@@ -1645,11 +1717,12 @@ class Mon
for (const op of [ 'read', 'write', 'delete' ])
{
const op_st = inode_stats[pool_id][inode_num][op];
const prev_st = prev_stats && prev_stats[pool_id] && prev_stats[pool_id][inode_num] && prev_stats[pool_id][inode_num][op];
op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0;
op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0;
op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0;
if (op_st.bps > 0 || op_st.iops > 0 || op_st.lat > 0)
if (op_st.n_osd)
{
op_st.lat /= BigInt(op_st.n_osd);
delete op_st.n_osd;
}
if (op_st.bps > 0 || op_st.iops > 0)
nonzero = true;
}
if (!nonzero && (!this.state.config.inode[pool_id] || !this.state.config.inode[pool_id][inode_num]))
@@ -1682,15 +1755,9 @@ class Mon
async update_total_stats()
{
const txn = [];
const timestamp = Date.now();
const { object_counts, object_bytes } = this.sum_object_counts();
let stats = this.sum_op_stats(timestamp, this.prev_stats);
let { inode_stats, seen_pools } = this.sum_inode_stats(
this.prev_stats ? this.prev_stats.inode_stats : null,
timestamp, this.prev_stats ? this.prev_stats.timestamp : null
);
this.prev_stats = { timestamp, inode_stats, osd_stats: { ...this.state.osd.stats } };
this.stat_time = Date.now();
let stats = this.sum_op_stats();
let { inode_stats, seen_pools } = this.sum_inode_stats();
stats.object_counts = object_counts;
stats.object_bytes = object_bytes;
stats = this.serialize_bigints(stats);

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "1.0.0",
"version": "1.2.0",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '1.0.0'
VERSION = '1.2.0'
LOG = logging.getLogger(__name__)

View File

@@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-1.0.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.0.0$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-1.2.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.2.0$(rpm --eval '%dist').tar.gz *

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.0.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.2.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.0.0
Version: 1.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.0.0.el7.tar.gz
Source0: vitastor-1.2.0.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.0.0.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.2.0.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.0.0
Version: 1.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.0.0.el8.tar.gz
Source0: vitastor-1.2.0.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.0.0.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.2.0.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.0.0
Version: 1.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.0.0.el9.tar.gz
Source0: vitastor-1.2.0.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="1.0.0")
add_definitions(-DVERSION="1.2.0")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)

View File

@@ -1372,7 +1372,8 @@ bool journal_flusher_co::trim_journal(int wait_base)
? (uint32_t)JE_START_V1_SIZE : (uint32_t)JE_START_V2_SIZE),
.reserved = 0,
.journal_start = new_trim_pos,
.version = JOURNAL_VERSION_V2,
.version = (uint64_t)(!bs->dsk.data_csum_type && ((journal_entry_start*)flusher->journal_superblock)->version == JOURNAL_VERSION_V1
? JOURNAL_VERSION_V1 : JOURNAL_VERSION_V2),
.data_csum_type = bs->dsk.data_csum_type,
.csum_block_size = bs->dsk.csum_block_size,
};

View File

@@ -553,7 +553,7 @@ resume_1:
}
if (je_start->size == JE_START_V0_SIZE ||
(je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE))
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
{
fprintf(
stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
@@ -562,7 +562,8 @@ resume_1:
);
exit(1);
}
if (je_start->version == JOURNAL_VERSION_V1)
if (je_start->version == JOURNAL_VERSION_V1 ||
je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
{
je_start->data_csum_type = 0;
je_start->csum_block_size = 0;

View File

@@ -21,7 +21,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
dyn = calloc_or_die(1, dyn_size+sizeof(int));
*((int*)dyn) = 1;
}
uint8_t *dyn_ptr = (uint8_t*)(alloc_dyn_data ? dyn+sizeof(int) : &dyn);
uint8_t *dyn_ptr = (alloc_dyn_data ? (uint8_t*)dyn+sizeof(int) : (uint8_t*)&dyn);
uint64_t version = 1;
if (dirty_db.size() > 0)
{
@@ -289,13 +289,18 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
printf("Restoring %lx:%lx version: v%lu -> v%lu\n", op->oid.inode, op->oid.stripe, op->version, PRIV(op)->real_version);
#endif
auto prev_it = dirty_it;
prev_it--;
if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
if (prev_it != dirty_db.begin())
{
// Original version is still invalid
// All subsequent writes to the same object must be canceled too
cancel_all_writes(op, dirty_it, -EEXIST);
return 2;
prev_it--;
if (prev_it->first.oid == op->oid && prev_it->first.version >= PRIV(op)->real_version)
{
// Original version is still invalid
// All subsequent writes to the same object must be canceled too
printf("Tried to write %lx:%lx v%lu after delete (old version v%lu), but already have v%lu\n",
op->oid.inode, op->oid.stripe, PRIV(op)->real_version, op->version, prev_it->first.version);
cancel_all_writes(op, dirty_it, -EEXIST);
return 2;
}
}
op->version = PRIV(op)->real_version;
PRIV(op)->real_version = 0;

View File

@@ -17,7 +17,7 @@
static const char *exe_name = NULL;
static const char* help_text =
"Vitastor command-line tool\n"
"Vitastor command-line tool " VERSION "\n"
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"

View File

@@ -109,7 +109,7 @@ resume_1:
}
for (auto pg_per_pair: pg_per_osd)
{
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.pg_count / pg_per_pair.second;
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
if (pool_avail > pg_free)
{
pool_avail = pg_free;
@@ -124,8 +124,10 @@ resume_1:
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
}
pool_stats[pool_cfg.id] = json11::Json::object {
{ "id", (uint64_t)pool_cfg.id },
{ "name", pool_cfg.name },
{ "pg_count", pool_cfg.pg_count },
{ "real_pg_count", pool_cfg.real_pg_count },
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
@@ -176,7 +178,7 @@ resume_1:
{ "title", "SCHEME" },
});
cols.push_back(json11::Json::object{
{ "key", "pg_count" },
{ "key", "pg_count_fmt" },
{ "title", "PGS" },
});
cols.push_back(json11::Json::object{
@@ -205,6 +207,9 @@ resume_1:
double raw_to = kv.second["raw_to_usable"].number_value();
if (raw_to < 0.000001 && raw_to > -0.000001)
raw_to = 1;
kv.second["pg_count_fmt"] = kv.second["real_pg_count"] == kv.second["pg_count"]
? kv.second["real_pg_count"].as_string()
: kv.second["real_pg_count"].as_string()+"->"+kv.second["pg_count"].as_string();
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());

View File

@@ -174,7 +174,7 @@ resume_1:
{ "size", 0 },
{ "readonly", false },
{ "pool_id", (uint64_t)INODE_POOL(inode_num) },
{ "pool_name", pool_it == parent->cli->st_cli.pool_config.end()
{ "pool_name", pool_it != parent->cli->st_cli.pool_config.end()
? (pool_it->second.name == "" ? "<Unnamed>" : pool_it->second.name) : "?" },
{ "inode_num", INODE_NO_POOL(inode_num) },
{ "inode_id", inode_num },

View File

@@ -158,12 +158,7 @@ resume_2:
for (auto & pool_pair: parent->cli->st_cli.pool_config)
{
auto & pool_cfg = pool_pair.second;
bool active = true;
if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
{
active = false;
pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
}
bool active = pool_cfg.real_pg_count > 0;
pool_count++;
for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
{

View File

@@ -64,7 +64,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
cluster_client_t::~cluster_client_t()
{
msgr.repeer_pgs = [this](osd_num_t){};
msgr.repeer_pgs = [](osd_num_t){};
if (ringloop)
{
ringloop->unregister_consumer(&consumer);
@@ -169,46 +169,52 @@ void cluster_client_t::calc_wait(cluster_op_t *op)
void cluster_client_t::inc_wait(uint64_t opcode, uint64_t flags, cluster_op_t *next, int inc)
{
if (opcode == OSD_OP_WRITE)
if (opcode != OSD_OP_WRITE && opcode != OSD_OP_SYNC)
{
while (next)
{
auto n2 = next->next;
if (next->opcode == OSD_OP_SYNC && (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback) ||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER))
{
next->prev_wait += inc;
assert(next->prev_wait >= 0);
if (!next->prev_wait)
{
if (next->opcode == OSD_OP_SYNC)
continue_sync(next);
else
continue_rw(next);
}
}
next = n2;
}
return;
}
else if (opcode == OSD_OP_SYNC)
cluster_op_t *bh_ops_local[32], **bh_ops = bh_ops_local;
int bh_op_count = 0, bh_op_max = 32;
while (next)
{
while (next)
auto n2 = next->next;
if (opcode == OSD_OP_WRITE
? (next->opcode == OSD_OP_SYNC && (!(flags & OP_IMMEDIATE_COMMIT) || enable_writeback) ||
next->opcode == OSD_OP_WRITE && (flags & OP_FLUSH_BUFFER) && !(next->flags & OP_FLUSH_BUFFER))
: (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE))
{
auto n2 = next->next;
if (next->opcode == OSD_OP_SYNC || next->opcode == OSD_OP_WRITE)
next->prev_wait += inc;
assert(next->prev_wait >= 0);
if (!next->prev_wait)
{
next->prev_wait += inc;
assert(next->prev_wait >= 0);
if (!next->prev_wait)
// Kind of std::vector with local "small vector optimisation"
if (bh_op_count >= bh_op_max)
{
if (next->opcode == OSD_OP_SYNC)
continue_sync(next);
else
continue_rw(next);
bh_op_max *= 2;
cluster_op_t **n = (cluster_op_t**)malloc_or_die(sizeof(cluster_op_t*) * bh_op_max);
memcpy(n, bh_ops, sizeof(cluster_op_t*) * bh_op_count);
if (bh_ops != bh_ops_local)
{
free(bh_ops);
}
bh_ops = n;
}
bh_ops[bh_op_count++] = next;
}
next = n2;
}
next = n2;
}
for (int i = 0; i < bh_op_count; i++)
{
cluster_op_t *next = bh_ops[i];
if (next->opcode == OSD_OP_SYNC)
continue_sync(next);
else
continue_rw(next);
}
if (bh_ops != bh_ops_local)
{
free(bh_ops);
}
}
@@ -448,7 +454,7 @@ bool cluster_client_t::flush()
wb->start_writebacks(this, 0);
cluster_op_t *sync = new cluster_op_t;
sync->opcode = OSD_OP_SYNC;
sync->callback = [this](cluster_op_t *sync)
sync->callback = [](cluster_op_t *sync)
{
delete sync;
};
@@ -459,7 +465,7 @@ bool cluster_client_t::flush()
bool sync_done = false;
cluster_op_t *sync = new cluster_op_t;
sync->opcode = OSD_OP_SYNC;
sync->callback = [this, &sync_done](cluster_op_t *sync)
sync->callback = [&sync_done](cluster_op_t *sync)
{
delete sync;
sync_done = true;

View File

@@ -263,7 +263,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
}
assert(calc_len == op->len);
writebacks_active++;
op->callback = [this, cli, flush_id](cluster_op_t* op)
op->callback = [this, flush_id](cluster_op_t* op)
{
// Buffer flushes should be always retried, regardless of the error,
// so they should never result in an error here
@@ -383,7 +383,7 @@ static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t
auto begin = (cur_offset < offset ? offset : cur_offset);
auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
memcpy(
v.iov_base + begin - cur_offset,
(uint8_t*)v.iov_base + begin - cur_offset,
buf + (cur_offset <= offset ? 0 : cur_offset-offset),
end - begin
);

View File

@@ -5,7 +5,7 @@
#include "str_util.h"
static const char *help_text =
"Vitastor disk management tool\n"
"Vitastor disk management tool " VERSION "\n"
"(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"

View File

@@ -23,19 +23,24 @@ epoll_manager_t::epoll_manager_t(ring_loop_t *ringloop)
tfd = new timerfd_manager_t([this](int fd, bool wr, std::function<void(int, int)> handler) { set_fd_handler(fd, wr, handler); });
consumer.loop = [this]()
if (ringloop)
{
if (pending)
handle_epoll_events();
};
ringloop->register_consumer(&consumer);
handle_epoll_events();
consumer.loop = [this]()
{
if (pending)
handle_uring_event();
};
ringloop->register_consumer(&consumer);
handle_uring_event();
}
}
epoll_manager_t::~epoll_manager_t()
{
ringloop->unregister_consumer(&consumer);
if (ringloop)
{
ringloop->unregister_consumer(&consumer);
}
if (tfd)
{
delete tfd;
@@ -44,6 +49,11 @@ epoll_manager_t::~epoll_manager_t()
close(epoll_fd);
}
int epoll_manager_t::get_fd()
{
return epoll_fd;
}
void epoll_manager_t::set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler)
{
if (handler != NULL)
@@ -75,7 +85,7 @@ void epoll_manager_t::set_fd_handler(int fd, bool wr, std::function<void(int, in
}
}
void epoll_manager_t::handle_epoll_events()
void epoll_manager_t::handle_uring_event()
{
io_uring_sqe *sqe = ringloop->get_sqe();
if (!sqe)
@@ -95,14 +105,20 @@ void epoll_manager_t::handle_epoll_events()
{
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
}
handle_epoll_events();
handle_uring_event();
};
ringloop->submit();
handle_events(0);
}
void epoll_manager_t::handle_events(int timeout)
{
int nfds;
epoll_event events[MAX_EPOLL_EVENTS];
do
{
nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, timeout);
timeout = 0;
for (int i = 0; i < nfds; i++)
{
auto cb_it = epoll_handlers.find(events[i].data.fd);

View File

@@ -15,11 +15,14 @@ class epoll_manager_t
ring_consumer_t consumer;
ring_loop_t *ringloop;
std::map<int, std::function<void(int, int)>> epoll_handlers;
void handle_uring_event();
public:
epoll_manager_t(ring_loop_t *ringloop);
~epoll_manager_t();
int get_fd();
void set_fd_handler(int fd, bool wr, std::function<void(int, int)> handler);
void handle_epoll_events();
void handle_events(int timeout);
timerfd_manager_t *tfd;
};

View File

@@ -32,6 +32,7 @@
struct sec_data
{
vitastor_c *cli = NULL;
bool epoll_based = false;
void *watch = NULL;
bool last_sync = false;
/* The list of completed io_u structs. */
@@ -58,6 +59,7 @@ struct sec_options
int rdma_port_num = 0;
int rdma_gid_index = 0;
int rdma_mtu = 0;
int no_io_uring = 0;
};
static struct fio_option options[] = {
@@ -193,6 +195,16 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "no_io_uring",
.lname = "Disable io_uring",
.type = FIO_OPT_BOOL,
.off1 = offsetof(struct sec_options, no_io_uring),
.help = "Use epoll and plain sendmsg/recvmsg instead of io_uring (slower)",
.def = "0",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = NULL,
},
@@ -281,7 +293,17 @@ static int sec_setup(struct thread_data *td)
opt_push(options, "log_level", std::to_string(o->cluster_log).c_str());
// allow writeback caching if -direct is not set
opt_push(options, "client_writeback_allowed", td->o.odirect ? "0" : "1");
bsd->cli = vitastor_c_create_uring_json((const char**)options.data(), options.size());
bsd->cli = o->no_io_uring ? NULL : vitastor_c_create_uring_json((const char**)options.data(), options.size());
bsd->epoll_based = false;
if (!bsd->cli)
{
if (o->no_io_uring)
fprintf(stderr, "vitastor: io_uring disabled - I/O will be slower\n");
else
fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno));
bsd->cli = vitastor_c_create_epoll_json((const char**)options.data(), options.size());
bsd->epoll_based = true;
}
for (auto opt: options)
free(opt);
options.clear();
@@ -289,12 +311,24 @@ static int sec_setup(struct thread_data *td)
{
bsd->watch = NULL;
vitastor_c_watch_inode(bsd->cli, o->image, watch_callback, bsd);
while (true)
if (!bsd->epoll_based)
{
vitastor_c_uring_handle_events(bsd->cli);
if (bsd->watch)
break;
vitastor_c_uring_wait_events(bsd->cli);
while (true)
{
vitastor_c_uring_handle_events(bsd->cli);
if (bsd->watch)
break;
vitastor_c_uring_wait_events(bsd->cli);
}
}
else
{
while (true)
{
if (bsd->watch)
break;
vitastor_c_epoll_handle_events(bsd->cli, 1000);
}
}
td->files[0]->real_file_size = vitastor_c_inode_get_size(bsd->watch);
if (!vitastor_c_inode_get_num(bsd->watch) ||
@@ -437,12 +471,24 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t)
{
sec_data *bsd = (sec_data*)td->io_ops_data;
while (true)
if (!bsd->epoll_based)
{
vitastor_c_uring_handle_events(bsd->cli);
if (bsd->completed.size() >= min)
break;
vitastor_c_uring_wait_events(bsd->cli);
while (true)
{
vitastor_c_uring_handle_events(bsd->cli);
if (bsd->completed.size() >= min)
break;
vitastor_c_uring_wait_events(bsd->cli);
}
}
else
{
while (true)
{
if (bsd->completed.size() >= min)
break;
vitastor_c_epoll_handle_events(bsd->cli, 1000);
}
}
return bsd->completed.size();
}

View File

@@ -47,8 +47,25 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
if (qp)
ibv_destroy_qp(qp);
if (recv_buffers.size())
{
for (auto b: recv_buffers)
free(b);
{
if (b.mr)
ibv_dereg_mr(b.mr);
free(b.buf);
}
recv_buffers.clear();
}
if (send_mrs.size())
{
for (auto & mr: send_mrs)
{
if (mr)
ibv_dereg_mr(mr);
mr = NULL;
}
send_mrs.clear();
}
}
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
@@ -141,16 +158,22 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
{
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
goto cleanup;
ctx->odp = false;
if (log_level > 0)
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable. Falling back to slower version without ODP\n");
}
else
ctx->odp = true;
}
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
if (ctx->odp)
{
fprintf(stderr, "Couldn't register RDMA memory region\n");
goto cleanup;
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
{
fprintf(stderr, "Couldn't register RDMA memory region\n");
goto cleanup;
}
}
ctx->channel = ibv_create_comp_channel(ctx->context);
@@ -376,7 +399,6 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
ibv_sge sge[rc->max_sge];
while (rc->send_pos < cl->send_list.size())
{
iovec & iov = cl->send_list[rc->send_pos];
if (op_size >= rc->max_msg || op_sge >= rc->max_sge)
{
rc->send_sizes.push_back(op_size);
@@ -388,12 +410,23 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
break;
}
}
iovec & iov = cl->send_list[rc->send_pos];
if (!rdma_context->odp && rc->send_mrs.size() <= rc->send_pos)
{
ibv_mr *mr = ibv_reg_mr(rdma_context->pd, iov.iov_base, iov.iov_len, IBV_ACCESS_LOCAL_WRITE);
if (!mr)
{
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
exit(1);
}
rc->send_mrs.push_back(mr);
}
uint32_t len = (uint32_t)(op_size+iov.iov_len-rc->send_buf_pos < rc->max_msg
? iov.iov_len-rc->send_buf_pos : rc->max_msg-op_size);
sge[op_sge++] = {
.addr = (uintptr_t)((uint8_t*)iov.iov_base+rc->send_buf_pos),
.length = len,
.lkey = rc->ctx->mr->lkey,
.lkey = rdma_context->odp ? rc->ctx->mr->lkey : rc->send_mrs[rc->send_pos]->lkey,
};
op_size += len;
rc->send_buf_pos += len;
@@ -411,12 +444,12 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
return true;
}
static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
static void try_recv_rdma_wr(osd_client_t *cl, msgr_rdma_buf_t b)
{
ibv_sge sge = {
.addr = (uintptr_t)buf,
.addr = (uintptr_t)b.buf,
.length = (uint32_t)cl->rdma_conn->max_msg,
.lkey = cl->rdma_conn->ctx->mr->lkey,
.lkey = !b.mr ? cl->rdma_conn->ctx->mr->lkey : b.mr->lkey,
};
ibv_recv_wr *bad_wr = NULL;
ibv_recv_wr wr = {
@@ -438,9 +471,19 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
auto rc = cl->rdma_conn;
while (rc->cur_recv < rc->max_recv)
{
void *buf = malloc_or_die(rc->max_msg);
rc->recv_buffers.push_back(buf);
try_recv_rdma_wr(cl, buf);
msgr_rdma_buf_t b;
b.buf = malloc_or_die(rc->max_msg);
if (!rdma_context->odp)
{
b.mr = ibv_reg_mr(rdma_context->pd, b.buf, rc->max_msg, IBV_ACCESS_LOCAL_WRITE);
if (!b.mr)
{
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
exit(1);
}
}
rc->recv_buffers.push_back(b);
try_recv_rdma_wr(cl, b);
}
return true;
}
@@ -492,7 +535,7 @@ void osd_messenger_t::handle_rdma_events()
if (!is_send)
{
rc->cur_recv--;
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
{
// handle_read_buffer may stop the client
continue;
@@ -527,6 +570,11 @@ void osd_messenger_t::handle_rdma_events()
rc->send_pos -= send_pos;
for (int i = 0; i < send_pos; i++)
{
if (!rdma_context->odp)
{
ibv_dereg_mr(rc->send_mrs[i]);
rc->send_mrs[i] = NULL;
}
if (cl->outbox[i].flags & MSGR_SENDP_FREE)
{
// Reply fully sent
@@ -535,6 +583,8 @@ void osd_messenger_t::handle_rdma_events()
}
if (send_pos > 0)
{
if (!rdma_context->odp)
rc->send_mrs.erase(rc->send_mrs.begin(), rc->send_mrs.begin()+send_pos);
cl->send_list.erase(cl->send_list.begin(), cl->send_list.begin()+send_pos);
cl->outbox.erase(cl->outbox.begin(), cl->outbox.begin()+send_pos);
}

View File

@@ -23,6 +23,7 @@ struct msgr_rdma_context_t
ibv_device *dev = NULL;
ibv_device_attr_ex attrx;
ibv_pd *pd = NULL;
bool odp = false;
ibv_mr *mr = NULL;
ibv_comp_channel *channel = NULL;
ibv_cq *cq = NULL;
@@ -39,6 +40,12 @@ struct msgr_rdma_context_t
~msgr_rdma_context_t();
};
struct msgr_rdma_buf_t
{
void *buf = NULL;
ibv_mr *mr = NULL;
};
struct msgr_rdma_connection_t
{
msgr_rdma_context_t *ctx = NULL;
@@ -50,8 +57,9 @@ struct msgr_rdma_connection_t
int send_pos = 0, send_buf_pos = 0;
int next_recv_buf = 0;
std::vector<void*> recv_buffers;
std::vector<msgr_rdma_buf_t> recv_buffers;
std::vector<uint64_t> send_sizes;
std::vector<ibv_mr*> send_mrs;
~msgr_rdma_connection_t();
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);

View File

@@ -119,9 +119,9 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
try_send(cl);
}
}
else if (cl->write_msg.msg_iovlen > 0 || !try_send(cl))
else
{
if (cl->write_state == 0)
if ((cl->write_msg.msg_iovlen > 0 || !try_send(cl)) && (cl->write_state == 0))
{
cl->write_state = CL_WRITE_READY;
write_ready_clients.push_back(cur_op->peer_fd);

View File

@@ -216,6 +216,14 @@ public:
{
nbd_timeout = cfg["nbd_timeout"].uint64_value();
}
if (cfg["client_writeback_allowed"].is_null())
{
// NBD is always aware of fsync, so we allow write-back cache
// by default if it's enabled
auto obj = cfg.object_items();
obj["client_writeback_allowed"] = true;
cfg = obj;
}
// Create client
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);

View File

@@ -65,8 +65,9 @@ json11::Json::object nfs_proxy_t::parse_args(int narg, const char *args[])
" --pool <POOL> use <POOL> as default pool for new files (images)\n"
" --foreground 1 stay in foreground, do not daemonize\n"
"\n"
"NFS proxy is stateless if you use immediate_commit=all in your cluster, so\n"
"you can freely use multiple NFS proxies with L3 load balancing in this case.\n"
"NFS proxy is stateless if you use immediate_commit=all in your cluster and if\n"
"you do not use client_enable_writeback=true, so you can freely use multiple\n"
"NFS proxies with L3 load balancing in this case.\n"
"\n"
"Example start and mount commands for a custom NFS port:\n"
" %s --etcd_address 192.168.5.10:2379 --portmap 0 --port 2050 --pool testpool\n"
@@ -114,6 +115,14 @@ void nfs_proxy_t::run(json11::Json cfg)
if (name_prefix.size())
name_prefix += "/";
}
if (cfg["client_writeback_allowed"].is_null())
{
// NFS is always aware of fsync, so we allow write-back cache
// by default if it's enabled
auto obj = cfg.object_items();
obj["client_writeback_allowed"] = true;
cfg = obj;
}
// Create client
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);

View File

@@ -160,6 +160,9 @@ void osd_t::parse_config(bool init)
etcd_report_interval = config["etcd_report_interval"].uint64_value();
if (etcd_report_interval <= 0)
etcd_report_interval = 5;
etcd_stats_interval = config["etcd_stats_interval"].uint64_value();
if (etcd_stats_interval <= 0)
etcd_stats_interval = 30;
readonly = json_is_true(config["readonly"]);
run_primary = !json_is_false(config["run_primary"]);
allow_test_ops = json_is_true(config["allow_test_ops"]);

View File

@@ -93,6 +93,7 @@ class osd_t
json11::Json::object cli_config, file_config, etcd_global_config, etcd_osd_config, config;
int etcd_report_interval = 5;
int etcd_stats_interval = 30;
bool readonly = false;
osd_num_t osd_num = 1; // OSD numbers start with 1

View File

@@ -429,14 +429,18 @@ void osd_t::acquire_lease()
create_osd_state();
});
printf(
"[OSD %lu] reporting to etcd at %s every %d seconds\n", this->osd_num,
"[OSD %lu] reporting to etcd at %s every %d seconds (statistics every %d seconds)\n", this->osd_num,
(config["etcd_address"].is_string() ? config["etcd_address"].string_value() : config["etcd_address"].dump()).c_str(),
etcd_report_interval
etcd_report_interval, etcd_stats_interval
);
tfd->set_timer(etcd_report_interval*1000, true, [this](int timer_id)
{
renew_lease(false);
});
tfd->set_timer(etcd_stats_interval*1000, true, [this](int timer_id)
{
report_statistics();
});
}
// Report "up" state once, then keep it alive using the lease
@@ -541,7 +545,6 @@ void osd_t::renew_lease(bool reload)
else
{
etcd_failed_attempts = 0;
report_statistics();
// Reload PGs
if (reload && run_primary)
{

View File

@@ -19,6 +19,14 @@ static void handle_sigint(int sig)
exit(0);
}
static const char* help_text =
"Vitastor OSD (block object storage daemon) " VERSION "\n"
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
"\n"
"OSDs are usually started by vitastor-disk.\n"
"Manual usage: vitastor-osd [--option value] ...\n"
;
int main(int narg, char *args[])
{
setvbuf(stdout, NULL, _IONBF, 0);
@@ -37,6 +45,16 @@ int main(int narg, char *args[])
char *opt = args[i]+2;
config[std::string(opt)] = std::string(args[++i]);
}
else if (!strcmp(args[i], "--help"))
{
printf("%s", help_text);
return 0;
}
}
if (!config.size())
{
printf("%s", help_text);
return 1;
}
signal(SIGINT, handle_sigint);
signal(SIGTERM, handle_sigint);

View File

@@ -239,8 +239,9 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
{
int edd = 0;
int erased[pg_size];
// we should distinguish stripes which are not at all and missing stripes
for (int i = 0; i < pg_size; i++)
erased[i] = (stripes[i].read_end == 0 || stripes[i].missing ? 1 : 0);
erased[i] = (stripes[i].read_end == 0 ? 2 : (stripes[i].missing ? 1 : 0));
for (int i = 0; i < pg_minsize; i++)
if (stripes[i].read_end != 0 && stripes[i].missing)
edd++;
@@ -253,7 +254,7 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
#ifdef WITH_ISAL
int smrow = 0;
uint8_t *submatrix = (uint8_t*)malloc_or_die(pg_minsize*pg_minsize*2);
for (int i = 0; i < pg_size; i++)
for (int i = 0; i < pg_size && smrow < pg_minsize; i++)
{
if (!erased[i])
{
@@ -279,7 +280,7 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
smrow = 0;
for (int i = 0; i < pg_minsize; i++)
{
if (erased[i])
if (erased[i] == 1)
{
memcpy(submatrix + pg_minsize*smrow, submatrix + (pg_minsize+i)*pg_minsize, pg_minsize);
smrow++;

View File

@@ -29,6 +29,7 @@ void test15(bool second);
void test16();
void test_recover_22_d2();
void test_ec43_error_bruteforce();
void test_recover_53_d5();
int main(int narg, char *args[])
{
@@ -67,6 +68,8 @@ int main(int narg, char *args[])
test_recover_22_d2();
// Error bruteforce
test_ec43_error_bruteforce();
// Test 19
test_recover_53_d5();
// End
printf("all ok\n");
return 0;
@@ -1112,7 +1115,7 @@ void test_recover_22_d2()
/***
EC 4+2 error location bruteforce
18. EC 4+2 error location bruteforce
***/
@@ -1178,3 +1181,66 @@ void test_ec43_error_bruteforce()
free(write_buf);
use_ec(7, 4, false);
}
/***
19. EC 5+3 recover 5th data block but not 4th
***/
void test_recover_53_d5()
{
const int bmp = 128*1024 / 4096 / 8;
use_ec(8, 5, true);
osd_num_t osd_set[8] = { 1, 2, 3, 0, 0, 6, 7, 8 };
osd_rmw_stripe_t stripes[8] = {};
unsigned bitmaps[8] = { 0 };
// Read 512+128K
split_stripes(5, 128*1024, 512*1024, 128*1024, stripes);
assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
assert(stripes[4].req_start == 0 && stripes[4].req_end == 128*1024);
uint8_t *data_buf = (uint8_t*)malloc_or_die(128*1024*8);
for (int i = 0; i < 8; i++)
{
stripes[i].read_start = stripes[i].req_start;
stripes[i].read_end = stripes[i].req_end;
stripes[i].read_buf = data_buf + i*128*1024;
stripes[i].bmp_buf = bitmaps + i;
}
// Read using parity
assert(extend_missing_stripes(stripes, osd_set, 5, 8) == 0);
assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
assert(stripes[4].read_start == 0 && stripes[4].read_end == 128*1024);
assert(stripes[5].read_start == 0 && stripes[5].read_end == 128*1024);
assert(stripes[6].read_start == 0 && stripes[6].read_end == 128*1024);
assert(stripes[7].read_start == 0 && stripes[7].read_end == 0);
bitmaps[0] = 0xffffffff;
bitmaps[1] = 0xffffffff;
bitmaps[2] = 0xffffffff;
bitmaps[3] = 0;
bitmaps[4] = 0;
bitmaps[5] = 0xffffffff;
bitmaps[6] = 0x64646464;
bitmaps[7] = 0;
set_pattern(stripes[0].read_buf, 128*1024, 0x70a549add9a2280a);
set_pattern(stripes[1].read_buf, 128*1024, 0xa70a549add9a2280);
set_pattern(stripes[2].read_buf, 128*1024, 0x0a70a549add9a228);
set_pattern(stripes[3].read_buf, 128*1024, 0); // 0x80a70a549add9a22
set_pattern(stripes[4].read_buf, 128*1024, 0); // 0x280a70a549add9a2
set_pattern(stripes[5].read_buf, 128*1024, 0x7572c28f7a91eb22); // xor
set_pattern(stripes[6].read_buf, 128*1024, 0xb4542b32a560fe26); // 2nd EC chunk
set_pattern(stripes[7].read_buf, 128*1024, 0);
// Reconstruct
reconstruct_stripes_ec(stripes, 8, 5, bmp);
check_pattern(stripes[4].read_buf, 128*1024, 0x280a70a549add9a2);
assert(bitmaps[4] == 0xFFFFFFFF);
free(data_buf);
// Done
use_ec(8, 5, false);
}

View File

@@ -388,6 +388,7 @@ static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandl
);
}
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
typedef struct str_array
{
const char **items;
@@ -424,6 +425,7 @@ static void strarray_free(str_array *a)
a->items = NULL;
a->len = a->alloc = 0;
}
#endif
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
{
@@ -443,6 +445,7 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
client->rdma_gid_index = qdict_get_try_int(options, "rdma-gid-index", 0);
client->rdma_mtu = qdict_get_try_int(options, "rdma-mtu", 0);
client->ctx = bdrv_get_aio_context(bs);
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
str_array opt = {};
strarray_push_kv(&opt, "config_path", qdict_get_try_str(options, "config-path"));
strarray_push_kv(&opt, "etcd_address", qdict_get_try_str(options, "etcd-host"));
@@ -472,40 +475,15 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
// Writeback cache is unusable without io_uring because the client can't correctly flush on exit
fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower%s\n",
strerror(errno), (flags & BDRV_O_NOCACHE ? "" : " and writeback cache will be disabled"));
#endif
client->uring_eventfd = -1;
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
client->proxy = vitastor_c_create_qemu_uring(
vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
);
if (!client->proxy)
{
fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower\n", strerror(errno));
client->uring_eventfd = -1;
client->proxy = vitastor_c_create_qemu(
vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
);
}
else
{
client->uring_eventfd = vitastor_c_uring_register_eventfd(client->proxy);
if (client->uring_eventfd < 0)
{
fprintf(stderr, "vitastor: failed to create io_uring eventfd: %s\n", strerror(errno));
error_setg(errp, "failed to create io_uring eventfd");
vitastor_close(bs);
return -1;
}
universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, vitastor_uring_handler, NULL, client);
}
#else
client->proxy = vitastor_c_create_qemu(
vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
);
#endif
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
}
#endif
image = client->image = g_strdup(qdict_get_try_str(options, "image"));
client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
// Get image metadata (size and readonly flag) or just wait until the client is ready

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 1.0.0
Version: 1.2.0
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -164,6 +164,15 @@ int vitastor_c_uring_register_eventfd(vitastor_c *client)
vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
{
ring_loop_t *ringloop = NULL;
try
{
ringloop = new ring_loop_t(512);
}
catch (std::exception & e)
{
return NULL;
}
json11::Json::object cfg;
for (int i = 0; i < options_len-1; i += 2)
{
@@ -171,18 +180,32 @@ vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
}
json11::Json cfg_json(cfg);
vitastor_c *self = new vitastor_c;
self->ringloop = new ring_loop_t(512);
self->ringloop = ringloop;
self->epmgr = new epoll_manager_t(self->ringloop);
self->cli = new cluster_client_t(self->ringloop, self->epmgr->tfd, cfg_json);
return self;
}
vitastor_c *vitastor_c_create_epoll_json(const char **options, int options_len)
{
json11::Json::object cfg;
for (int i = 0; i < options_len-1; i += 2)
{
cfg[options[i]] = std::string(options[i+1]);
}
json11::Json cfg_json(cfg);
vitastor_c *self = new vitastor_c;
self->epmgr = new epoll_manager_t(NULL);
self->cli = new cluster_client_t(NULL, self->epmgr->tfd, cfg_json);
return self;
}
void vitastor_c_destroy(vitastor_c *client)
{
delete client->cli;
if (client->epmgr)
delete client->epmgr;
else
else if (client->tfd)
delete client->tfd;
if (client->ringloop)
delete client->ringloop;
@@ -220,6 +243,16 @@ int vitastor_c_uring_has_work(vitastor_c *client)
return client->ringloop->has_work();
}
int vitastor_c_epoll_get_fd(vitastor_c *client)
{
return !client->ringloop && client->epmgr ? client->epmgr->get_fd() : -1;
}
void vitastor_c_epoll_handle_events(vitastor_c *client, int timeout)
{
return client->epmgr->handle_events(timeout);
}
void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque)
{

View File

@@ -7,7 +7,7 @@
#define VITASTOR_QEMU_PROXY_H
// C API wrapper version
#define VITASTOR_C_API_VERSION 2
#define VITASTOR_C_API_VERSION 3
#ifndef POOL_ID_BITS
#define POOL_ID_BITS 16
@@ -40,6 +40,7 @@ vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, v
vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_host, const char *etcd_prefix,
int use_rdma, const char *rdma_device, int rdma_port_num, int rdma_gid_index, int rdma_mtu, int log_level);
vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len);
vitastor_c *vitastor_c_create_epoll_json(const char **options, int options_len);
void vitastor_c_destroy(vitastor_c *client);
int vitastor_c_is_ready(vitastor_c *client);
int vitastor_c_uring_register_eventfd(vitastor_c *client);
@@ -47,6 +48,8 @@ void vitastor_c_uring_wait_ready(vitastor_c *client);
void vitastor_c_uring_handle_events(vitastor_c *client);
void vitastor_c_uring_wait_events(vitastor_c *client);
int vitastor_c_uring_has_work(vitastor_c *client);
int vitastor_c_epoll_get_fd(vitastor_c *client);
void vitastor_c_epoll_handle_events(vitastor_c *client, int timeout);
void vitastor_c_read(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len,
struct iovec *iov, int iovcnt, VitastorReadHandler cb, void *opaque);
void vitastor_c_write(vitastor_c *client, uint64_t inode, uint64_t offset, uint64_t len, uint64_t check_version,

View File

@@ -18,10 +18,10 @@ else
fi
if [ "$IMMEDIATE_COMMIT" != "" ]; then
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10"
NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10 --etcd_stats_interval 5"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"immediate_commit":"all","client_enable_writeback":true}'
else
NO_SAME="--journal_sector_buffer_count 1024 --log_level 10"
NO_SAME="--journal_sector_buffer_count 1024 --log_level 10 --etcd_stats_interval 5"
$ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"osd_out_time":1,"client_enable_writeback":true}'
fi
@@ -29,7 +29,7 @@ start_osd_on()
{
local i=$1
local dev=$2
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL \
build/src/vitastor-osd --osd_num $i --bind_address $ETCD_IP $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL \
$(build/src/vitastor-disk simple-offsets --format options $OFFSET_ARGS $dev $OFFSET_ARGS 2>/dev/null) \
>>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$!

View File

@@ -7,7 +7,7 @@ if [[ "$SCHEME" = "ec" ]]; then
PG_DATA_SIZE=${PG_DATA_SIZE:-2}
PG_MINSIZE=${PG_MINSIZE:-3}
fi
OSD_COUNT=7
OSD_COUNT=${OSD_COUNT:-7}
PG_COUNT=32
. `dirname $0`/run_3osds.sh
check_qemu
@@ -29,7 +29,7 @@ kill_osds()
kill -9 $OSD1_PID
$ETCDCTL del /vitastor/osd/state/1
for i in 2 3 4 5 6 7; do
for i in $(seq 2 $OSD_COUNT); do
sleep 15
echo Killing OSD $i and starting OSD $((i-1))
p=OSD${i}_PID
@@ -40,8 +40,8 @@ kill_osds()
done
sleep 5
echo Starting OSD 7
start_osd 7
echo Starting OSD $OSD_COUNT
start_osd $OSD_COUNT
sleep 5
}

View File

@@ -7,7 +7,7 @@ OSD_COUNT=5
OSD_ARGS="$OSD_ARGS"
for i in $(seq 1 $OSD_COUNT); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 --etcd_stats_interval 5 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-disk simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) >>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$!
done