Compare commits

...

155 Commits

Author SHA1 Message Date
5524dbdab7 Release 1.2.0
Some checks failed
Test / test_snapshot_ec (push) Successful in 25s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m18s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 18s
Test / test_snapshot_chain (push) Successful in 2m13s
Test / test_snapshot_chain_ec (push) Successful in 2m57s
Test / test_rebalance_verify_imm (push) Successful in 2m51s
Test / test_write (push) Successful in 38s
Test / test_rebalance_verify (push) Successful in 3m39s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec (push) Successful in 3m56s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m6s
Test / test_heal_pg_size_2 (push) Successful in 3m43s
Test / test_heal_csum_32k_dmj (push) Successful in 4m35s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 5m50s
Test / test_heal_csum_4k_dmj (push) Successful in 5m44s
Test / test_scrub_zero_osd_2 (push) Successful in 57s
Test / test_scrub (push) Successful in 1m0s
Test / test_scrub_xor (push) Successful in 1m5s
Test / test_heal_csum_4k_dj (push) Successful in 5m9s
Test / test_scrub_pg_size_3 (push) Successful in 1m38s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 54s
Test / test_scrub_ec (push) Successful in 52s
Test / test_heal_csum_4k (push) Successful in 5m8s
Test / test_heal_ec (push) Successful in 3m17s
Test / test_write_xor (push) Successful in 35s
Test / test_move_reappear (push) Failing after 48s
New features:

- Implement CSI volume expansion
- Implement CSI volume snapshots
- CSI driver now requires Kubernetes >= 1.20

Bug fixes:

- Important bug fix for EC: fix EC n+k, k>=2 read recovery in ISA-L version returning
  incorrect data when reading at least the second chunk out of multiple missing chunks
  without reading the first one. All users of EC n+k, k>=2 should upgrade as soon as
  possible, and upgrade should be conducted with downtime: first stop all clients
  (VMs/containers), then all OSDs, then upgrade and restart everything.
- Fix unstable statistics aggregation in monitor (affecting vitastor-cli status and df)
- Make udev not wait for OSDs to start during boot
- Do not report negative numbers of offline PGs in vitastor-cli status when changing PG count
- Report both old and new PG counts in vitastor-cli df when changing it
- Fix OSDs sometimes not starting with "The code only supports journal versions 1 and 2,
  but it is 2 on disk" error after upgrading from pre-1.0 versions and letting OSDs run
  for some time
- Fix monitors sometimes returning old PG count back after OSD configuration changes
- Make monitor PG changes more stable and timeout errors less probable
2023-11-05 01:48:57 +03:00
cd3dec06ac Remove spaces from old->new PG count in df
All checks were successful
Test / test_interrupted_rebalance_ec (push) Successful in 1m50s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 14s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_snapshot_down (push) Successful in 29s
Test / test_snapshot_down_ec (push) Successful in 32s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m34s
Test / test_rebalance_verify_imm (push) Successful in 3m9s
Test / test_rebalance_verify (push) Successful in 4m9s
Test / test_write (push) Successful in 40s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m15s
Test / test_rebalance_verify_ec (push) Successful in 4m29s
Test / test_heal_pg_size_2 (push) Successful in 3m21s
Test / test_heal_csum_32k_dmj (push) Successful in 5m38s
Test / test_heal_ec (push) Successful in 6m14s
Test / test_heal_csum_32k_dj (push) Successful in 6m22s
Test / test_heal_csum_32k (push) Successful in 6m40s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 1m12s
Test / test_scrub_xor (push) Successful in 1m16s
Test / test_heal_csum_4k_dj (push) Successful in 6m4s
Test / test_heal_csum_4k_dmj (push) Successful in 6m34s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m4s
Test / test_heal_csum_4k (push) Successful in 5m37s
Test / test_scrub_ec (push) Successful in 43s
Test / test_scrub_pg_size_3 (push) Successful in 1m14s
Test / test_write_xor (push) Successful in 1m11s
Test / test_snapshot_chain_ec (push) Successful in 2m43s
2023-11-05 01:45:45 +03:00
371d79e059 Document vitastor-csi features 2023-11-05 01:05:26 +03:00
0e888e6c60 Prevent spamming etcd with last_clean_pgs update requests
All checks were successful
Test / test_snapshot_ec (push) Successful in 34s
Test / test_interrupted_rebalance_ec (push) Successful in 1m49s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 14s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m23s
Test / test_snapshot_chain_ec (push) Successful in 3m4s
Test / test_rebalance_verify_imm (push) Successful in 2m54s
Test / test_rebalance_verify (push) Successful in 3m48s
Test / test_write (push) Successful in 35s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 55s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m15s
Test / test_rebalance_verify_ec (push) Successful in 5m3s
Test / test_heal_pg_size_2 (push) Successful in 3m59s
Test / test_heal_ec (push) Successful in 4m56s
Test / test_heal_csum_32k_dmj (push) Successful in 5m48s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 6m35s
Test / test_scrub (push) Successful in 1m14s
Test / test_heal_csum_4k_dmj (push) Successful in 6m54s
Test / test_scrub_zero_osd_2 (push) Successful in 1m2s
Test / test_scrub_xor (push) Successful in 49s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m9s
Test / test_scrub_pg_size_3 (push) Successful in 1m54s
Test / test_heal_csum_4k_dj (push) Successful in 6m17s
Test / test_heal_csum_4k (push) Successful in 6m18s
Test / test_scrub_ec (push) Successful in 37s
2023-11-05 00:12:00 +03:00
408c21d8f0 Scale last_clean_pgs PG count even if current PGs already contain the new number of PGs
Some checks failed
Test / test_interrupted_rebalance_ec (push) Successful in 1m49s
Test / test_snapshot_ec (push) Successful in 31s
Test / test_rm (push) Successful in 13s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 50s
Test / test_snapshot_down_ec (push) Successful in 22s
Test / test_splitbrain (push) Successful in 20s
Test / test_snapshot_chain (push) Successful in 2m15s
Test / test_snapshot_chain_ec (push) Successful in 2m56s
Test / test_rebalance_verify_imm (push) Successful in 2m59s
Test / test_write (push) Successful in 34s
Test / test_rebalance_verify (push) Successful in 3m44s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 52s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m5s
Test / test_rebalance_verify_ec (push) Successful in 5m1s
Test / test_heal_pg_size_2 (push) Successful in 4m1s
Test / test_heal_ec (push) Successful in 5m3s
Test / test_heal_csum_32k_dmj (push) Successful in 5m13s
Test / test_heal_csum_32k_dj (push) Successful in 5m37s
Test / test_heal_csum_32k (push) Successful in 6m19s
Test / test_scrub (push) Successful in 1m11s
Test / test_heal_csum_4k_dmj (push) Successful in 6m13s
Test / test_scrub_zero_osd_2 (push) Successful in 1m5s
Test / test_scrub_xor (push) Successful in 48s
Test / test_heal_csum_4k_dj (push) Successful in 6m11s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m10s
Test / test_scrub_pg_size_3 (push) Successful in 1m29s
Test / test_heal_csum_4k (push) Successful in 6m9s
Test / test_scrub_ec (push) Successful in 35s
2023-11-04 23:45:59 +03:00
43cb9ae212 Prevent multiple parallel recheck_pgs in case of timeouts
Some checks failed
Test / test_snapshot_ec (push) Successful in 37s
Test / test_minsize_1 (push) Successful in 13s
Test / test_rm (push) Successful in 12s
Test / test_move_reappear (push) Successful in 17s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m25s
Test / test_snapshot_chain_ec (push) Failing after 3m7s
Test / test_rebalance_verify_imm (push) Successful in 3m0s
Test / test_rebalance_verify (push) Successful in 3m54s
Test / test_write (push) Successful in 34s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 52s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m6s
Test / test_rebalance_verify_ec (push) Successful in 5m10s
Test / test_heal_pg_size_2 (push) Successful in 4m1s
Test / test_heal_ec (push) Successful in 4m21s
Test / test_heal_csum_32k_dmj (push) Successful in 5m10s
Test / test_heal_csum_32k_dj (push) Successful in 5m51s
Test / test_heal_csum_32k (push) Successful in 6m54s
Test / test_heal_csum_4k_dmj (push) Successful in 6m38s
Test / test_scrub (push) Successful in 1m9s
Test / test_scrub_zero_osd_2 (push) Successful in 1m2s
Test / test_scrub_xor (push) Successful in 43s
Test / test_heal_csum_4k_dj (push) Successful in 6m24s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m16s
Test / test_scrub_pg_size_3 (push) Successful in 1m38s
Test / test_scrub_ec (push) Successful in 37s
Test / test_heal_csum_4k (push) Successful in 6m2s
2023-11-04 20:59:56 +03:00
e15b6e7805 Fix "cannot be narrowed" in clang
Some checks failed
Test / test_snapshot_ec (push) Successful in 44s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m10s
Test / test_rm (push) Successful in 16s
Test / test_move_reappear (push) Failing after 51s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m32s
Test / test_snapshot_chain_ec (push) Successful in 3m2s
Test / test_rebalance_verify_imm (push) Successful in 3m0s
Test / test_write (push) Successful in 33s
Test / test_rebalance_verify (push) Successful in 3m53s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m23s
Test / test_rebalance_verify_ec (push) Successful in 4m11s
Test / test_write_xor (push) Failing after 3m12s
Test / test_heal_pg_size_2 (push) Successful in 3m47s
Test / test_heal_csum_32k_dmj (push) Successful in 5m17s
Test / test_heal_ec (push) Successful in 5m34s
Test / test_heal_csum_32k_dj (push) Successful in 6m43s
Test / test_heal_csum_32k (push) Successful in 6m30s
Test / test_scrub (push) Successful in 1m18s
Test / test_scrub_zero_osd_2 (push) Successful in 1m11s
Test / test_heal_csum_4k_dmj (push) Successful in 6m24s
Test / test_heal_csum_4k_dj (push) Successful in 6m23s
Test / test_scrub_xor (push) Successful in 54s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m1s
Test / test_scrub_ec (push) Successful in 54s
Test / test_scrub_pg_size_3 (push) Successful in 1m25s
Test / test_heal_csum_4k (push) Successful in 6m10s
2023-11-04 18:14:44 +03:00
31017d8412 Allow to start with V2 journal with header size from V1, as incorrectly updated by previous versions 2023-11-04 18:13:42 +03:00
4819854064 Fix OSDs incorrectly updating journal superblock after upgrade to 1.x from pre-1.x and refusing to start after it
Some checks failed
Test / test_interrupted_rebalance_imm (push) Successful in 3m38s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 47s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 21s
Test / test_snapshot_chain (push) Successful in 2m31s
Test / test_snapshot_chain_ec (push) Successful in 3m7s
Test / test_rebalance_verify_imm (push) Successful in 2m54s
Test / test_write (push) Successful in 32s
Test / test_rebalance_verify (push) Successful in 3m46s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 37s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m56s
Test / test_rebalance_verify_ec (push) Successful in 5m0s
Test / test_heal_pg_size_2 (push) Failing after 4m18s
Test / test_heal_ec (push) Successful in 5m3s
Test / test_heal_csum_32k_dmj (push) Successful in 5m19s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 6m37s
Test / test_heal_csum_4k_dmj (push) Successful in 6m46s
Test / test_scrub (push) Successful in 1m5s
Test / test_scrub_zero_osd_2 (push) Successful in 48s
Test / test_scrub_xor (push) Successful in 45s
Test / test_heal_csum_4k_dj (push) Successful in 6m37s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m17s
Test / test_scrub_pg_size_3 (push) Successful in 1m40s
Test / test_scrub_ec (push) Successful in 34s
Test / test_heal_csum_4k (push) Successful in 7m13s
2023-11-04 15:02:24 +03:00
1f509cca77 Fix unused capture warnings and void* arithmetic (clang)
Some checks failed
Test / test_minsize_1 (push) Successful in 14s
Test / test_snapshot_ec (push) Successful in 40s
Test / test_rm (push) Successful in 16s
Test / test_move_reappear (push) Successful in 18s
Test / test_snapshot_down (push) Successful in 31s
Test / test_snapshot_down_ec (push) Successful in 33s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m31s
Test / test_snapshot_chain_ec (push) Failing after 3m7s
Test / test_rebalance_verify_imm (push) Successful in 3m6s
Test / test_write (push) Successful in 39s
Test / test_rebalance_verify (push) Successful in 4m7s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m23s
Test / test_rebalance_verify_ec (push) Successful in 4m20s
Test / test_write_xor (push) Failing after 3m9s
Test / test_heal_pg_size_2 (push) Successful in 3m55s
Test / test_heal_csum_32k_dmj (push) Successful in 4m44s
Test / test_heal_csum_32k_dj (push) Successful in 6m8s
Test / test_heal_csum_32k (push) Successful in 5m58s
Test / test_heal_ec (push) Failing after 10m16s
Test / test_heal_csum_4k_dmj (push) Successful in 5m57s
Test / test_scrub (push) Successful in 1m8s
Test / test_scrub_zero_osd_2 (push) Successful in 59s
Test / test_scrub_xor (push) Successful in 47s
Test / test_heal_csum_4k_dj (push) Successful in 5m30s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m7s
Test / test_scrub_pg_size_3 (push) Successful in 1m34s
Test / test_heal_csum_4k (push) Successful in 5m21s
Test / test_scrub_ec (push) Successful in 43s
2023-11-04 14:55:12 +03:00
aa8e8e8271 Add version info to --help output
Some checks failed
Test / test_minsize_1 (push) Successful in 16s
Test / test_snapshot_ec (push) Successful in 39s
Test / test_move_reappear (push) Successful in 21s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 32s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m31s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_rebalance_verify_imm (push) Successful in 2m47s
Test / test_rebalance_verify (push) Successful in 3m38s
Test / test_write (push) Successful in 38s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m19s
Test / test_rebalance_verify_ec (push) Successful in 4m6s
Test / test_write_xor (push) Failing after 3m10s
Test / test_heal_pg_size_2 (push) Successful in 3m54s
Test / test_heal_csum_32k_dmj (push) Successful in 5m6s
Test / test_heal_ec (push) Successful in 5m48s
Test / test_heal_csum_32k_dj (push) Successful in 6m22s
Test / test_heal_csum_32k (push) Successful in 6m30s
Test / test_scrub (push) Successful in 1m18s
Test / test_scrub_zero_osd_2 (push) Successful in 1m12s
Test / test_heal_csum_4k_dmj (push) Successful in 6m38s
Test / test_heal_csum_4k_dj (push) Successful in 6m14s
Test / test_scrub_xor (push) Successful in 1m0s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 56s
Test / test_scrub_ec (push) Successful in 51s
Test / test_scrub_pg_size_3 (push) Successful in 1m20s
Test / test_heal_csum_4k (push) Successful in 5m58s
2023-11-04 13:32:12 +03:00
4d79e531c5 Do not print "-X offline" in status when changing pool PG count, print it in df instead
Some checks failed
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m20s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_ec (push) Successful in 36s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 48s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 21s
Test / test_snapshot_chain (push) Successful in 2m21s
Test / test_snapshot_chain_ec (push) Successful in 3m1s
Test / test_rebalance_verify_imm (push) Successful in 3m6s
Test / test_write (push) Successful in 35s
Test / test_rebalance_verify (push) Successful in 3m46s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec (push) Successful in 4m7s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m23s
Test / test_write_xor (push) Failing after 3m8s
Test / test_heal_pg_size_2 (push) Successful in 4m15s
Test / test_heal_csum_32k_dmj (push) Successful in 4m39s
Test / test_heal_ec (push) Successful in 6m35s
Test / test_heal_csum_32k_dj (push) Successful in 6m5s
Test / test_heal_csum_32k (push) Successful in 6m45s
Test / test_scrub (push) Successful in 1m8s
Test / test_heal_csum_4k_dmj (push) Successful in 6m24s
Test / test_scrub_zero_osd_2 (push) Successful in 1m6s
Test / test_scrub_xor (push) Successful in 41s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m18s
Test / test_heal_csum_4k_dj (push) Successful in 6m29s
Test / test_scrub_pg_size_3 (push) Successful in 1m34s
Test / test_heal_csum_4k (push) Successful in 6m7s
Test / test_scrub_ec (push) Successful in 30s
2023-11-04 13:12:13 +03:00
30dff8893f Fix ISA-L version EC recovery with first missing data chunk not being read
Some checks failed
Test / test_snapshot (push) Successful in 44s
Test / test_snapshot_ec (push) Successful in 28s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m20s
Test / test_snapshot_chain_ec (push) Successful in 3m1s
Test / test_rebalance_verify_imm (push) Successful in 2m49s
Test / test_rebalance_verify (push) Successful in 3m37s
Test / test_write (push) Successful in 42s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 54s
Test / test_rebalance_verify_ec (push) Successful in 4m55s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m13s
Test / test_heal_pg_size_2 (push) Successful in 4m4s
Test / test_heal_ec (push) Successful in 5m2s
Test / test_heal_csum_32k_dmj (push) Failing after 5m54s
Test / test_heal_csum_32k_dj (push) Successful in 6m6s
Test / test_heal_csum_32k (push) Successful in 6m59s
Test / test_scrub (push) Successful in 1m16s
Test / test_heal_csum_4k_dmj (push) Successful in 6m56s
Test / test_scrub_xor (push) Successful in 51s
Test / test_scrub_zero_osd_2 (push) Successful in 1m1s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m25s
Test / test_heal_csum_4k (push) Successful in 6m9s
Test / test_heal_csum_4k_dj (push) Successful in 6m33s
Test / test_scrub_pg_size_3 (push) Successful in 1m37s
Test / test_scrub_ec (push) Successful in 26s
(Yes, all EC n + k with k >= 2 users should upgrade as soon as possible)
2023-11-04 01:34:18 +03:00
becf14a705 Add a test for EC with multiple missing data chunks, but without recovery of first of them 2023-11-04 01:34:18 +03:00
64388788c1 Implement CSI volume expansion
Some checks failed
Test / test_snapshot_ec (push) Successful in 35s
Test / test_minsize_1 (push) Successful in 16s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 22s
Test / test_move_reappear (push) Failing after 50s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 19s
Test / test_snapshot_chain (push) Successful in 2m24s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_rebalance_verify_imm (push) Successful in 3m15s
Test / test_write (push) Successful in 41s
Test / test_rebalance_verify (push) Successful in 4m13s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 50s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m28s
Test / test_rebalance_verify_ec (push) Successful in 5m30s
Test / test_heal_pg_size_2 (push) Successful in 4m5s
Test / test_heal_ec (push) Successful in 4m57s
Test / test_heal_csum_32k_dmj (push) Successful in 6m13s
Test / test_heal_csum_32k_dj (push) Successful in 6m10s
Test / test_heal_csum_32k (push) Successful in 6m40s
Test / test_heal_csum_4k_dmj (push) Successful in 6m24s
Test / test_scrub (push) Successful in 1m7s
Test / test_scrub_xor (push) Successful in 47s
Test / test_scrub_zero_osd_2 (push) Successful in 53s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m25s
Test / test_scrub_pg_size_3 (push) Successful in 1m59s
Test / test_heal_csum_4k_dj (push) Successful in 6m10s
Test / test_heal_csum_4k (push) Successful in 6m2s
Test / test_scrub_ec (push) Successful in 40s
2023-11-01 12:46:20 +03:00
37653abe4b Implement CSI volume snapshots 2023-11-01 12:46:20 +03:00
7c054c6f10 Add "id" to df --json output 2023-11-01 12:46:16 +03:00
bb7709e824 Support listening on non-127.0.0.1 in tests 2023-11-01 12:45:27 +03:00
ebeace5a2d Add cmake and pkg-config to debian build depends 2023-11-01 12:45:27 +03:00
a378789f10 Rollback erroneous go.mod changes in 1.1.0 O:-)
All checks were successful
Test / test_snapshot_ec (push) Successful in 30s
Test / test_move_reappear (push) Successful in 19s
Test / test_interrupted_rebalance_ec (push) Successful in 1m52s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 32s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m23s
Test / test_snapshot_chain_ec (push) Successful in 3m2s
Test / test_rebalance_verify_imm (push) Successful in 3m1s
Test / test_rebalance_verify (push) Successful in 3m49s
Test / test_write (push) Successful in 41s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 43s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m15s
Test / test_rebalance_verify_ec (push) Successful in 5m9s
Test / test_heal_pg_size_2 (push) Successful in 4m13s
Test / test_heal_ec (push) Successful in 4m31s
Test / test_heal_csum_32k_dmj (push) Successful in 5m59s
Test / test_heal_csum_32k_dj (push) Successful in 6m14s
Test / test_heal_csum_32k (push) Successful in 6m47s
Test / test_heal_csum_4k_dmj (push) Successful in 6m47s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 1m0s
Test / test_scrub_xor (push) Successful in 52s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m19s
Test / test_scrub_pg_size_3 (push) Successful in 1m52s
Test / test_heal_csum_4k_dj (push) Successful in 6m2s
Test / test_heal_csum_4k (push) Successful in 5m46s
Test / test_scrub_ec (push) Successful in 25s
2023-10-30 18:47:48 +03:00
1fe678e57b Add --no-block to udev rule
Some checks failed
Test / test_minsize_1 (push) Successful in 13s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m0s
Test / test_move_reappear (push) Successful in 21s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 32s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m29s
Test / test_snapshot_chain_ec (push) Successful in 3m4s
Test / test_rebalance_verify_ec_imm (push) Failing after 18s
Test / test_write (push) Successful in 29s
Test / test_rebalance_verify_imm (push) Successful in 2m53s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify (push) Successful in 3m46s
Test / test_write_xor (push) Failing after 3m6s
Test / test_rebalance_verify_ec (push) Successful in 5m1s
Test / test_heal_pg_size_2 (push) Successful in 4m50s
Test / test_heal_ec (push) Successful in 4m34s
Test / test_heal_csum_32k_dmj (push) Successful in 5m5s
Test / test_heal_csum_32k_dj (push) Successful in 5m57s
Test / test_heal_csum_32k (push) Successful in 6m56s
Test / test_heal_csum_4k_dmj (push) Successful in 7m28s
Test / test_scrub (push) Successful in 1m10s
Test / test_scrub_zero_osd_2 (push) Successful in 57s
Test / test_scrub_xor (push) Successful in 53s
Test / test_heal_csum_4k_dj (push) Successful in 6m34s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m8s
Test / test_scrub_pg_size_3 (push) Successful in 1m37s
Test / test_scrub_ec (push) Successful in 41s
Test / test_heal_csum_4k (push) Successful in 6m6s
2023-10-30 12:18:29 +03:00
2e592a2f22 Fix undefined variable "timeout"
Some checks failed
Test / test_snapshot_ec (push) Successful in 44s
Test / test_rm (push) Successful in 17s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m9s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 51s
Test / test_snapshot_down_ec (push) Successful in 25s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m26s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_rebalance_verify_imm (push) Successful in 3m2s
Test / test_write (push) Successful in 35s
Test / test_rebalance_verify (push) Successful in 3m56s
Test / test_write_no_same (push) Successful in 12s
Test / test_write_xor (push) Successful in 38s
Test / test_rebalance_verify_ec (push) Successful in 5m2s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m13s
Test / test_heal_pg_size_2 (push) Successful in 4m17s
Test / test_heal_ec (push) Successful in 5m2s
Test / test_heal_csum_32k_dmj (push) Successful in 5m43s
Test / test_heal_csum_32k_dj (push) Successful in 5m36s
Test / test_heal_csum_32k (push) Successful in 7m4s
Test / test_heal_csum_4k_dmj (push) Successful in 6m47s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 59s
Test / test_scrub_pg_size_3 (push) Successful in 1m26s
Test / test_heal_csum_4k_dj (push) Successful in 6m32s
Test / test_heal_csum_4k (push) Successful in 6m31s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 27s
Test / test_scrub_ec (push) Successful in 25s
Test / test_scrub_xor (push) Failing after 3m7s
2023-10-29 01:30:55 +03:00
b92f644e3a Fix statistics aggregation, calculate inode stats by first deriving per-OSD stats, too 2023-10-29 01:30:55 +03:00
890ea3dbc0 Forgot to add new parameter page to README 2023-10-28 13:39:53 +03:00
06630369bf Plans++ 2023-10-28 13:38:04 +03:00
b4740acf62 Fix operations paused for 0.5-1 second when it happens that io_uring submit is not triggered
Some checks failed
Test / test_snapshot (push) Successful in 24s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_minsize_1 (push) Successful in 15s
Test / test_rm (push) Successful in 17s
Test / test_move_reappear (push) Failing after 48s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m30s
Test / test_snapshot_chain_ec (push) Successful in 3m4s
Test / test_rebalance_verify_imm (push) Successful in 2m41s
Test / test_write (push) Successful in 48s
Test / test_rebalance_verify (push) Successful in 3m42s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m17s
Test / test_rebalance_verify_ec (push) Successful in 4m11s
Test / test_write_xor (push) Failing after 3m8s
Test / test_heal_pg_size_2 (push) Successful in 3m40s
Test / test_heal_csum_32k_dmj (push) Successful in 5m9s
Test / test_heal_ec (push) Successful in 6m31s
Test / test_heal_csum_32k_dj (push) Successful in 6m30s
Test / test_heal_csum_32k (push) Successful in 6m22s
Test / test_scrub (push) Successful in 1m14s
Test / test_scrub_zero_osd_2 (push) Successful in 1m20s
Test / test_heal_csum_4k_dmj (push) Successful in 6m23s
Test / test_scrub_xor (push) Successful in 1m4s
Test / test_heal_csum_4k_dj (push) Successful in 6m2s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 59s
Test / test_scrub_ec (push) Successful in 50s
Test / test_scrub_pg_size_3 (push) Successful in 1m35s
Test / test_heal_csum_4k (push) Successful in 6m1s
2023-10-28 13:18:21 +03:00
eae81bbda6 Fix typo 2023-10-28 01:09:20 +03:00
8222e3c77d Release 1.1.0
Some checks failed
Test / test_interrupted_rebalance_ec (push) Successful in 1m49s
Test / test_snapshot_ec (push) Successful in 38s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 49s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m25s
Test / test_snapshot_chain_ec (push) Successful in 3m5s
Test / test_rebalance_verify_imm (push) Successful in 2m51s
Test / test_write (push) Successful in 34s
Test / test_rebalance_verify (push) Successful in 3m38s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 50s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m3s
Test / test_rebalance_verify_ec (push) Successful in 5m0s
Test / test_heal_pg_size_2 (push) Successful in 4m2s
Test / test_heal_ec (push) Successful in 4m49s
Test / test_heal_csum_32k_dmj (push) Successful in 5m27s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 6m57s
Test / test_heal_csum_4k_dmj (push) Successful in 6m50s
Test / test_scrub (push) Successful in 1m12s
Test / test_scrub_xor (push) Successful in 48s
Test / test_scrub_zero_osd_2 (push) Successful in 54s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m14s
Test / test_heal_csum_4k_dj (push) Successful in 6m32s
Test / test_scrub_pg_size_3 (push) Successful in 1m38s
Test / test_heal_csum_4k (push) Successful in 6m20s
Test / test_scrub_ec (push) Successful in 27s
New features:

- Implement [client writeback cache](docs/config/client.en.md#client_enable_writeback)
- Add the third I/O mode: [O_DIRECT|O_SYNC](docs/config/osd.en.md#data_io) (good for Optane)
- Reduce load on etcd by splitting OSD lease and statistics reporting intervals:
  [etcd_stats_interval](docs/config/osd.en.md#etcd_stats_interval) (default 30 sec)
- Make MON automatically filter OSDs by layout (block_size/immediate_commit/bitmap_granularity)
  to prevent "refusing to start PGs of this pool" errors on misconfiguration
- Support running fio benchmarks on systems without io_uring
- Make QEMU driver compatible with QEMU 8.1
- Document usage of [vhost-user-blk](docs/usage/qemu.en.md#vhost-user-blk)

Bug fixes:

- Fix resizing disks in QEMU driver (for example, in Proxmox)
- Fix "unexpected result" in Proxmox driver by making CLI flush output on exit
- Remove unneeded block_size mismatch warnings on pools without matching PGs
- Fix possible segfault in vitastor-cli ls -l (usually with deleted pools)
- Fix QEMU driver compatibility with systems without io_uring
- Fix monitor eating 100% CPU when etcd is down (caused by infinite retries)
- Fix potential incorrect write processing with snapshots (not caught in tests
  but could probably lead to client hangs)
- Fix buffer insertion in cluster_client (not caught in tests but could
  probably lead to incorrect writes in rare cases)
- Fix rare OSD crash during sync operation processing
- Fix a reenterability issue in cluster_client not reproducible in QEMU/fio,
  but reproducible with the currently developed K/V database implementation
- Fix deletion of the first modified object - OSDs could crash if you modified
  the same object a lot of times, then deleted it, and then modified it again
- Fix the fio_sec_osd test tool
2023-10-28 00:33:06 +03:00
29cbe70e74 Bump qemu version to vitastor4 2023-10-28 00:33:06 +03:00
a883e79507 Make docs to add etcd_stats_interval 2023-10-27 14:09:26 +03:00
be7e76f849 Split etcd_stats_interval out of etcd_report_interval
All checks were successful
Test / test_interrupted_rebalance_ec (push) Successful in 1m46s
Test / test_snapshot_ec (push) Successful in 36s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 29s
Test / test_snapshot_down_ec (push) Successful in 30s
Test / test_splitbrain (push) Successful in 26s
Test / test_snapshot_chain (push) Successful in 2m15s
Test / test_snapshot_chain_ec (push) Successful in 2m57s
Test / test_rebalance_verify_imm (push) Successful in 2m29s
Test / test_rebalance_verify (push) Successful in 3m40s
Test / test_write (push) Successful in 1m0s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 50s
Test / test_rebalance_verify_ec (push) Successful in 4m58s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m14s
Test / test_heal_pg_size_2 (push) Successful in 4m21s
Test / test_heal_ec (push) Successful in 4m5s
Test / test_heal_csum_32k_dmj (push) Successful in 5m36s
Test / test_heal_csum_32k_dj (push) Successful in 6m28s
Test / test_heal_csum_32k (push) Successful in 6m38s
Test / test_heal_csum_4k_dmj (push) Successful in 6m46s
Test / test_scrub_zero_osd_2 (push) Successful in 59s
Test / test_scrub (push) Successful in 1m16s
Test / test_scrub_xor (push) Successful in 53s
Test / test_scrub_pg_size_3 (push) Successful in 1m57s
Test / test_heal_csum_4k_dj (push) Successful in 6m18s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m7s
Test / test_heal_csum_4k (push) Successful in 5m43s
Test / test_scrub_ec (push) Successful in 32s
2023-10-27 01:26:26 +03:00
6fd2cf5df6 Add documentation for the write-back cache 2023-10-27 01:26:26 +03:00
294a754c9e Allow write-back by default in NBD & NFS 2023-10-27 01:26:26 +03:00
8bfea6e7de Support vitastor_c_create_epoll() in fio driver
All checks were successful
Test / test_snapshot_ec (push) Successful in 29s
Test / test_move_reappear (push) Successful in 21s
Test / test_rm (push) Successful in 21s
Test / test_snapshot_down (push) Successful in 27s
Test / test_interrupted_rebalance_ec (push) Successful in 2m29s
Test / test_splitbrain (push) Successful in 20s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m35s
Test / test_rebalance_verify (push) Successful in 3m22s
Test / test_rebalance_verify_imm (push) Successful in 3m22s
Test / test_write (push) Successful in 32s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec (push) Successful in 3m56s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m51s
Test / test_heal_pg_size_2 (push) Successful in 3m21s
Test / test_heal_ec (push) Successful in 5m45s
Test / test_heal_csum_32k_dmj (push) Successful in 5m43s
Test / test_heal_csum_32k_dj (push) Successful in 6m10s
Test / test_heal_csum_32k (push) Successful in 6m29s
Test / test_heal_csum_4k_dmj (push) Successful in 6m22s
Test / test_heal_csum_4k_dj (push) Successful in 5m51s
Test / test_scrub (push) Successful in 36s
Test / test_scrub_zero_osd_2 (push) Successful in 34s
Test / test_heal_csum_4k (push) Successful in 5m35s
Test / test_scrub_xor (push) Successful in 35s
Test / test_write_xor (push) Successful in 1m31s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 31s
Test / test_scrub_pg_size_3 (push) Successful in 42s
Test / test_scrub_ec (push) Successful in 20s
Test / test_snapshot_chain_ec (push) Successful in 1m23s
2023-10-26 22:57:36 +03:00
bac9e34836 Allow to create vitastor_c with plain epoll without uring :-) 2023-10-26 22:57:36 +03:00
8aa4d492c1 Allow to use epoll_manager without ringloop 2023-10-26 22:57:36 +03:00
9336ee5476 Correctly free manual "small vector" in cluster_client %-) 2023-10-26 22:57:36 +03:00
ad30b11519 Add the missing ringloop creation check to vitastor_c_create_uring_json()
Some checks failed
Test / test_minsize_1 (push) Failing after 1s
Test / test_move_reappear (push) Failing after 1s
Test / test_rm (push) Failing after 0s
Test / test_snapshot_chain (push) Failing after 0s
Test / test_snapshot_chain_ec (push) Failing after 1s
Test / test_snapshot_down (push) Failing after 1s
Test / test_snapshot_down_ec (push) Failing after 0s
Test / test_splitbrain (push) Failing after 1s
Test / test_rebalance_verify (push) Failing after 1s
Test / test_rebalance_verify_imm (push) Failing after 1s
Test / test_rebalance_verify_ec (push) Failing after 1s
Test / test_rebalance_verify_ec_imm (push) Failing after 1s
Test / test_write (push) Failing after 1s
Test / test_write_xor (push) Failing after 1s
Test / test_write_no_same (push) Failing after 1s
Test / test_heal_pg_size_2 (push) Failing after 1s
Test / test_heal_ec (push) Failing after 1s
Test / test_heal_csum_32k_dmj (push) Failing after 1s
Test / test_heal_csum_32k_dj (push) Failing after 1s
Test / test_heal_csum_32k (push) Failing after 1s
Test / test_heal_csum_4k_dmj (push) Failing after 1s
Test / test_heal_csum_4k_dj (push) Failing after 1s
Test / test_heal_csum_4k (push) Failing after 1s
Test / test_scrub (push) Failing after 1s
Test / test_scrub_zero_osd_2 (push) Failing after 1s
Test / test_scrub_xor (push) Failing after 1s
Test / test_scrub_pg_size_3 (push) Failing after 1s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Failing after 0s
Test / test_scrub_ec (push) Failing after 1s
Test / build (push) Successful in 2m35s
2023-10-26 18:07:23 +03:00
a061246997 Do not attempt to initialize QEMU driver via vitastor_c_create_qemu_uring()
Some checks failed
Test / test_snapshot_ec (push) Successful in 38s
Test / test_minsize_1 (push) Successful in 18s
Test / test_rm (push) Successful in 17s
Test / test_snapshot_down (push) Failing after 26s
Test / test_move_reappear (push) Failing after 52s
Test / test_snapshot_down_ec (push) Failing after 27s
Test / test_splitbrain (push) Successful in 25s
Test / test_snapshot_chain (push) Successful in 2m28s
Test / test_snapshot_chain_ec (push) Successful in 3m6s
Test / test_rebalance_verify (push) Successful in 3m30s
Test / test_rebalance_verify_imm (push) Successful in 3m29s
Test / test_rebalance_verify_ec (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_scrub_ec (push) Has been cancelled
Test / test_rebalance_verify_ec_imm (push) Has been cancelled
Test / test_write (push) Has been cancelled
It doesn't add any compatibility because vitastor_c_uring_register_eventfd()
is added in the same VITASTOR_C_API_VERSION 2.
2023-10-26 17:46:19 +03:00
5066e35a49 Fix write-over-delete failing for the very first entry in dirty_db
Some checks failed
Test / test_minsize_1 (push) Successful in 17s
Test / test_snapshot_ec (push) Successful in 38s
Test / test_rm (push) Successful in 17s
Test / test_snapshot_down (push) Failing after 27s
Test / test_move_reappear (push) Failing after 53s
Test / test_snapshot_down_ec (push) Failing after 27s
Test / test_splitbrain (push) Successful in 25s
Test / test_snapshot_chain (push) Successful in 2m28s
Test / test_snapshot_chain_ec (push) Successful in 2m59s
Test / test_rebalance_verify (push) Successful in 3m22s
Test / test_rebalance_verify_imm (push) Successful in 3m21s
Test / test_write (push) Successful in 42s
Test / test_write_xor (push) Successful in 48s
Test / test_write_no_same (push) Successful in 14s
Test / test_rebalance_verify_ec (push) Successful in 4m41s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m8s
Test / test_heal_pg_size_2 (push) Successful in 4m17s
Test / test_heal_ec (push) Successful in 4m33s
Test / test_heal_csum_32k_dmj (push) Successful in 5m23s
Test / test_heal_csum_32k_dj (push) Successful in 6m18s
Test / test_heal_csum_32k (push) Successful in 6m44s
Test / test_heal_csum_4k_dmj (push) Successful in 6m53s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 1m5s
Test / test_scrub_xor (push) Successful in 1m1s
Test / test_heal_csum_4k_dj (push) Successful in 6m58s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m2s
Test / test_scrub_pg_size_3 (push) Successful in 1m41s
Test / test_scrub_ec (push) Successful in 37s
Test / test_heal_csum_4k (push) Successful in 6m4s
2023-10-21 17:00:14 +03:00
93dc31f3fc Fix possible segfault in vitastor-cli ls -l
Some checks failed
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m21s
Test / test_snapshot_ec (push) Successful in 32s
Test / test_rm (push) Successful in 17s
Test / test_snapshot_down (push) Failing after 26s
Test / test_move_reappear (push) Failing after 52s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_down_ec (push) Failing after 26s
Test / test_snapshot_chain (push) Successful in 2m22s
Test / test_snapshot_chain_ec (push) Successful in 2m44s
Test / test_rebalance_verify (push) Successful in 3m20s
Test / test_rebalance_verify_imm (push) Successful in 3m20s
Test / test_write (push) Successful in 36s
Test / test_write_no_same (push) Successful in 14s
Test / test_rebalance_verify_ec (push) Successful in 3m59s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m49s
Test / test_write_xor (push) Failing after 3m11s
Test / test_heal_pg_size_2 (push) Failing after 4m40s
Test / test_heal_csum_32k_dmj (push) Successful in 5m13s
Test / test_heal_ec (push) Successful in 5m32s
Test / test_heal_csum_32k_dj (push) Successful in 6m1s
Test / test_heal_csum_32k (push) Successful in 6m54s
Test / test_scrub (push) Successful in 1m28s
Test / test_heal_csum_4k_dmj (push) Successful in 7m1s
Test / test_scrub_zero_osd_2 (push) Successful in 1m7s
Test / test_heal_csum_4k_dj (push) Successful in 7m25s
Test / test_scrub_xor (push) Successful in 59s
Test / test_heal_csum_4k (push) Successful in 6m39s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 59s
Test / test_scrub_pg_size_3 (push) Successful in 1m6s
Test / test_scrub_ec (push) Successful in 35s
2023-10-18 11:11:41 +03:00
f245b56176 Fix another possible reenterability issue in cluster_client
Some checks failed
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m9s
Test / test_snapshot_ec (push) Successful in 42s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Failing after 27s
Test / test_move_reappear (push) Failing after 50s
Test / test_snapshot_down_ec (push) Failing after 26s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m26s
Test / test_snapshot_chain_ec (push) Successful in 2m58s
Test / test_rebalance_verify (push) Successful in 3m26s
Test / test_rebalance_verify_imm (push) Successful in 3m27s
Test / test_write (push) Successful in 42s
Test / test_write_xor (push) Successful in 51s
Test / test_write_no_same (push) Successful in 16s
Test / test_rebalance_verify_ec (push) Successful in 4m56s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m11s
Test / test_heal_pg_size_2 (push) Successful in 4m18s
Test / test_heal_ec (push) Successful in 5m5s
Test / test_heal_csum_32k_dmj (push) Successful in 5m7s
Test / test_heal_csum_32k_dj (push) Successful in 6m14s
Test / test_heal_csum_32k (push) Successful in 6m54s
Test / test_heal_csum_4k_dmj (push) Successful in 6m55s
Test / test_scrub (push) Successful in 1m23s
Test / test_scrub_zero_osd_2 (push) Successful in 1m8s
Test / test_scrub_xor (push) Successful in 1m0s
Test / test_heal_csum_4k_dj (push) Successful in 7m15s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m7s
Test / test_scrub_pg_size_3 (push) Successful in 1m40s
Test / test_scrub_ec (push) Successful in 42s
Test / test_heal_csum_4k (push) Successful in 6m17s
Non-reproducible in QEMU/FIO, only caught during K/V DB debugging
2023-10-08 11:02:53 +03:00
befca06f18 Support any OSD count in test_heal 2023-10-08 11:02:53 +03:00
fbf0263625 Add qemu-storage-daemon to documentation 2023-09-16 18:40:52 +03:00
3bcf276d4d Run tests with writeback
All checks were successful
Test / test_snapshot_ec (push) Successful in 35s
Test / test_interrupted_rebalance_ec (push) Successful in 1m56s
Test / test_rm (push) Successful in 21s
Test / test_snapshot_down (push) Successful in 24s
Test / test_snapshot_down_ec (push) Successful in 25s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m24s
Test / test_snapshot_chain_ec (push) Successful in 2m54s
Test / test_rebalance_verify (push) Successful in 3m32s
Test / test_rebalance_verify_imm (push) Successful in 3m31s
Test / test_write (push) Successful in 46s
Test / test_write_xor (push) Successful in 52s
Test / test_write_no_same (push) Successful in 14s
Test / test_rebalance_verify_ec (push) Successful in 4m52s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m3s
Test / test_heal_pg_size_2 (push) Successful in 3m52s
Test / test_heal_ec (push) Successful in 5m7s
Test / test_heal_csum_32k_dmj (push) Successful in 5m29s
Test / test_heal_csum_32k_dj (push) Successful in 6m7s
Test / test_heal_csum_32k (push) Successful in 6m44s
Test / test_scrub (push) Successful in 1m20s
Test / test_heal_csum_4k_dmj (push) Successful in 6m57s
Test / test_scrub_xor (push) Successful in 55s
Test / test_scrub_zero_osd_2 (push) Successful in 1m11s
Test / test_heal_csum_4k_dj (push) Successful in 6m42s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m12s
Test / test_scrub_pg_size_3 (push) Successful in 1m33s
Test / test_scrub_ec (push) Successful in 42s
Test / test_heal_csum_4k (push) Successful in 6m14s
Test / test_move_reappear (push) Successful in 18s
2023-09-16 17:52:17 +03:00
38db53f5ee Implement client writeback cache
- Disabled by default, enable with client_enable_writeback=true
- Even then only enabled in FIO when -direct is disabled and in QEMU when
  block device cache is enabled in settings
- Can also be enabled in other clients like vitastor-cli using parameter
  client_writeback_allowed=true, but not recommended
2023-09-16 17:52:17 +03:00
cd543a90bc Prevent stack overflows in cli_merge with CAS and writeback cache 2023-09-16 17:52:17 +03:00
f600cc07b0 Autosync in blockstore every autosync_writes, too 2023-09-16 17:52:17 +03:00
6a8e530e6b Add FIXME to timerfd_manager 2023-09-16 17:52:17 +03:00
5cadb170b9 Fix possible OSD crash during sync due to missing min_flushed_journal_sector reset 2023-09-16 17:52:17 +03:00
e72d4ed1d4 Remove unused bs_sync fields 2023-09-16 17:52:17 +03:00
ff479a102d Make MON filter OSDs by block layout to prevent "refusing to start PGs of this pool" errors on misconfiguration 2023-09-16 17:52:17 +03:00
27d0d5b06a Reads do not have to wait for buffer flushes anymore 2023-09-16 17:52:17 +03:00
33950c1ec8 Fix fio_sec_osd attr_len
Some checks reported warnings
Test / test_failure_domain (push) Has been cancelled
Test / test_snapshot (push) Has been cancelled
Test / test_snapshot_ec (push) Has been cancelled
Test / test_minsize_1 (push) Has been cancelled
Test / test_move_reappear (push) Has been cancelled
Test / test_rm (push) Has been cancelled
Test / test_snapshot_chain (push) Has been cancelled
Test / test_snapshot_chain_ec (push) Has been cancelled
Test / test_snapshot_down (push) Has been cancelled
Test / test_snapshot_down_ec (push) Has been cancelled
Test / test_splitbrain (push) Has been cancelled
Test / test_rebalance_verify (push) Has been cancelled
Test / test_rebalance_verify_imm (push) Has been cancelled
Test / test_rebalance_verify_ec (push) Has been cancelled
Test / test_rebalance_verify_ec_imm (push) Has been cancelled
Test / test_write (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / build (push) Has been cancelled
2023-09-16 17:49:10 +03:00
eea7ef1f19 Remove debug osd_trace from test_write
All checks were successful
Test / test_snapshot_ec (push) Successful in 41s
Test / test_minsize_1 (push) Successful in 15s
Test / test_rm (push) Successful in 15s
Test / test_move_reappear (push) Successful in 19s
Test / test_snapshot_down (push) Successful in 32s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 20s
Test / test_snapshot_chain (push) Successful in 2m18s
Test / test_snapshot_chain_ec (push) Successful in 3m1s
Test / test_rebalance_verify (push) Successful in 3m24s
Test / test_rebalance_verify_imm (push) Successful in 3m26s
Test / test_write (push) Successful in 45s
Test / test_write_no_same (push) Successful in 17s
Test / test_write_xor (push) Successful in 49s
Test / test_rebalance_verify_ec (push) Successful in 4m59s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m56s
Test / test_heal_csum_32k_dmj (push) Successful in 4m39s
Test / test_heal_csum_32k_dj (push) Successful in 6m9s
Test / test_heal_csum_32k (push) Successful in 6m8s
Test / test_scrub (push) Successful in 1m26s
Test / test_heal_csum_4k_dmj (push) Successful in 6m53s
Test / test_scrub_zero_osd_2 (push) Successful in 1m15s
Test / test_scrub_xor (push) Successful in 57s
Test / test_heal_csum_4k_dj (push) Successful in 6m18s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 59s
Test / test_scrub_ec (push) Successful in 57s
Test / test_scrub_pg_size_3 (push) Successful in 1m37s
Test / test_heal_csum_4k (push) Successful in 5m21s
Test / test_heal_ec (push) Successful in 7m21s
Test / test_heal_pg_size_2 (push) Successful in 3m59s
2023-09-12 01:35:36 +03:00
cc0fdc6253 Remove erroneous block_size mismatch warnings on pools without matching PGs
Some checks failed
Test / test_snapshot_ec (push) Successful in 36s
Test / test_minsize_1 (push) Successful in 13s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 24s
Test / test_move_reappear (push) Failing after 52s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 21s
Test / test_snapshot_chain (push) Successful in 2m23s
Test / test_snapshot_chain_ec (push) Successful in 2m58s
Test / test_rebalance_verify (push) Successful in 3m32s
Test / test_rebalance_verify_imm (push) Successful in 3m29s
Test / test_write (push) Successful in 52s
Test / test_write_xor (push) Successful in 56s
Test / test_write_no_same (push) Successful in 15s
Test / test_rebalance_verify_ec (push) Successful in 5m0s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m30s
Test / test_heal_ec (push) Successful in 4m6s
Test / test_heal_pg_size_2 (push) Failing after 4m19s
Test / test_heal_csum_32k_dmj (push) Successful in 5m2s
Test / test_heal_csum_32k_dj (push) Successful in 6m12s
Test / test_heal_csum_32k (push) Successful in 6m24s
Test / test_heal_csum_4k_dmj (push) Successful in 6m19s
Test / test_scrub_zero_osd_2 (push) Successful in 1m8s
Test / test_scrub (push) Successful in 1m15s
Test / test_scrub_xor (push) Successful in 1m8s
Test / test_heal_csum_4k_dj (push) Successful in 6m45s
Test / test_scrub_pg_size_3 (push) Successful in 1m58s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m9s
Test / test_scrub_ec (push) Successful in 42s
Test / test_heal_csum_4k (push) Successful in 5m26s
2023-09-08 23:19:04 +03:00
79ecd59b10 Flush STDOUT and STDERR before exiting from cli to fix Proxmox "Unexpected result"
Some checks failed
Test / test_interrupted_rebalance_ec (push) Successful in 1m51s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 18s
Test / test_snapshot_down (push) Successful in 29s
Test / test_snapshot_down_ec (push) Successful in 30s
Test / test_splitbrain (push) Successful in 21s
Test / test_snapshot_chain (push) Successful in 2m18s
Test / test_snapshot_chain_ec (push) Successful in 2m55s
Test / test_rebalance_verify (push) Successful in 3m15s
Test / test_rebalance_verify_imm (push) Successful in 3m10s
Test / test_write (push) Successful in 47s
Test / test_write_xor (push) Successful in 57s
Test / test_write_no_same (push) Successful in 17s
Test / test_rebalance_verify_ec (push) Successful in 5m3s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m24s
Test / test_heal_pg_size_2 (push) Failing after 3m44s
Test / test_heal_ec (push) Successful in 4m55s
Test / test_heal_csum_32k_dmj (push) Successful in 4m48s
Test / test_heal_csum_32k_dj (push) Successful in 6m11s
Test / test_heal_csum_32k (push) Successful in 6m14s
Test / test_scrub (push) Successful in 1m32s
Test / test_heal_csum_4k_dmj (push) Successful in 6m33s
Test / test_scrub_xor (push) Successful in 59s
Test / test_scrub_zero_osd_2 (push) Successful in 1m7s
Test / test_heal_csum_4k_dj (push) Successful in 6m25s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m21s
Test / test_scrub_pg_size_3 (push) Successful in 1m46s
Test / test_scrub_ec (push) Successful in 53s
Test / test_heal_csum_4k (push) Successful in 5m20s
2023-09-07 17:30:26 +03:00
51081c9b45 Put etcd into tmpfs for tests
All checks were successful
Test / test_minsize_1 (push) Successful in 16s
Test / test_rm (push) Successful in 15s
Test / test_move_reappear (push) Successful in 21s
Test / test_snapshot_ec (push) Successful in 25s
Test / test_snapshot_down (push) Successful in 31s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 26s
Test / test_snapshot_chain (push) Successful in 2m17s
Test / test_snapshot_chain_ec (push) Successful in 2m58s
Test / test_rebalance_verify (push) Successful in 3m18s
Test / test_rebalance_verify_imm (push) Successful in 3m11s
Test / test_write (push) Successful in 40s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec (push) Successful in 4m0s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m16s
Test / test_heal_ec (push) Successful in 4m44s
Test / test_heal_csum_32k_dmj (push) Successful in 5m38s
Test / test_heal_csum_32k_dj (push) Successful in 5m36s
Test / test_heal_csum_32k (push) Successful in 6m35s
Test / test_scrub (push) Successful in 1m38s
Test / test_heal_csum_4k_dmj (push) Successful in 6m39s
Test / test_scrub_zero_osd_2 (push) Successful in 1m17s
Test / test_scrub_xor (push) Successful in 57s
Test / test_heal_csum_4k_dj (push) Successful in 6m4s
Test / test_heal_csum_4k (push) Successful in 6m8s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 48s
Test / test_scrub_pg_size_3 (push) Successful in 1m9s
Test / test_scrub_ec (push) Successful in 29s
Test / test_write_xor (push) Successful in 1m13s
Test / test_heal_pg_size_2 (push) Successful in 3m24s
2023-09-07 02:35:09 +03:00
b7d398be5b Fix sscanf validation usage (field count instead of null_byte == 0)
Some checks reported warnings
Test / test_minsize_1 (push) Has been cancelled
Test / test_move_reappear (push) Has been cancelled
Test / test_rm (push) Has been cancelled
Test / test_snapshot_chain (push) Has been cancelled
Test / test_snapshot_chain_ec (push) Has been cancelled
Test / test_snapshot_down (push) Has been cancelled
Test / test_snapshot_down_ec (push) Has been cancelled
Test / test_splitbrain (push) Has been cancelled
Test / test_rebalance_verify (push) Has been cancelled
Test / build (push) Has been cancelled
Test / test_rebalance_verify_imm (push) Has been cancelled
Test / test_rebalance_verify_ec (push) Has been cancelled
Test / test_rebalance_verify_ec_imm (push) Has been cancelled
Test / test_write (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled
Test / test_scrub_ec (push) Has been cancelled
2023-09-07 02:34:35 +03:00
85e9f67d9d Add supported_truncate_flags
All checks were successful
Test / test_minsize_1 (push) Successful in 15s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m19s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 23s
Test / test_snapshot_down_ec (push) Successful in 25s
Test / test_splitbrain (push) Successful in 19s
Test / test_snapshot_chain (push) Successful in 2m13s
Test / test_snapshot_chain_ec (push) Successful in 2m54s
Test / test_rebalance_verify (push) Successful in 3m25s
Test / test_rebalance_verify_imm (push) Successful in 3m22s
Test / test_write (push) Successful in 52s
Test / test_write_xor (push) Successful in 55s
Test / test_write_no_same (push) Successful in 15s
Test / test_rebalance_verify_ec (push) Successful in 4m50s
Test / test_rebalance_verify_ec_imm (push) Successful in 6m8s
Test / test_heal_ec (push) Successful in 5m1s
Test / test_heal_csum_32k_dmj (push) Successful in 4m32s
Test / test_heal_csum_32k (push) Successful in 6m14s
Test / test_heal_csum_32k_dj (push) Successful in 6m23s
Test / test_scrub (push) Successful in 1m27s
Test / test_heal_csum_4k_dmj (push) Successful in 6m24s
Test / test_heal_csum_4k_dj (push) Successful in 6m15s
Test / test_scrub_xor (push) Successful in 58s
Test / test_scrub_zero_osd_2 (push) Successful in 1m1s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 55s
Test / test_scrub_ec (push) Successful in 53s
Test / test_scrub_pg_size_3 (push) Successful in 1m24s
Test / test_heal_csum_4k (push) Successful in 4m44s
Test / test_move_reappear (push) Successful in 20s
Test / test_heal_pg_size_2 (push) Successful in 4m1s
2023-09-06 17:37:52 +03:00
79c6d6f323 Make QEMU driver compatible with QEMU 8.1
All checks were successful
Test / test_snapshot_ec (push) Successful in 31s
Test / test_minsize_1 (push) Successful in 15s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 12s
Test / test_snapshot_chain (push) Successful in 2m2s
Test / test_snapshot_chain_ec (push) Successful in 2m44s
Test / test_snapshot_down (push) Successful in 23s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 16s
Test / test_rebalance_verify (push) Successful in 3m10s
Test / test_rebalance_verify_imm (push) Successful in 3m10s
Test / test_rebalance_verify_ec (push) Successful in 3m39s
Test / test_rebalance_verify_ec_imm (push) Successful in 7m1s
Test / test_write (push) Successful in 36s
Test / test_write_xor (push) Successful in 1m11s
Test / test_write_no_same (push) Successful in 14s
Test / test_heal_pg_size_2 (push) Successful in 4m1s
Test / test_heal_ec (push) Successful in 6m7s
Test / test_heal_csum_32k_dmj (push) Successful in 5m19s
Test / test_heal_csum_32k_dj (push) Successful in 5m7s
Test / test_heal_csum_32k (push) Successful in 6m21s
Test / test_heal_csum_4k_dmj (push) Successful in 6m22s
Test / test_heal_csum_4k_dj (push) Successful in 6m40s
Test / test_heal_csum_4k (push) Successful in 5m34s
Test / test_scrub (push) Successful in 1m2s
Test / test_scrub_zero_osd_2 (push) Successful in 46s
Test / test_scrub_xor (push) Successful in 38s
Test / test_scrub_pg_size_3 (push) Successful in 1m7s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 50s
Test / test_scrub_ec (push) Successful in 48s
2023-08-24 02:23:55 +03:00
ae760dbc1d Fix co_truncate size division by BDRV_SECTOR_SIZE 2023-08-24 01:55:35 +03:00
65487da4b1 Do not include msgr_rdma.h into messenger.h 2023-08-24 01:55:35 +03:00
7862282938 Extract validation to check_rw(), remove duplicate code with OP_SYNC
All checks were successful
Test / test_snapshot_ec (push) Successful in 19s
Test / test_minsize_1 (push) Successful in 12s
Test / test_move_reappear (push) Successful in 18s
Test / test_rm (push) Successful in 11s
Test / test_snapshot_chain (push) Successful in 59s
Test / test_snapshot_chain_ec (push) Successful in 1m41s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 22s
Test / test_splitbrain (push) Successful in 14s
Test / test_rebalance_verify (push) Successful in 3m23s
Test / test_rebalance_verify_imm (push) Successful in 3m19s
Test / test_rebalance_verify_ec (push) Successful in 5m33s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m28s
Test / test_write (push) Successful in 51s
Test / test_write_xor (push) Successful in 1m6s
Test / test_write_no_same (push) Successful in 18s
Test / test_heal_pg_size_2 (push) Successful in 4m26s
Test / test_heal_ec (push) Successful in 5m21s
Test / test_heal_csum_32k_dmj (push) Successful in 5m56s
Test / test_heal_csum_32k_dj (push) Successful in 6m2s
Test / test_heal_csum_32k (push) Successful in 6m41s
Test / test_heal_csum_4k_dmj (push) Successful in 6m53s
Test / test_heal_csum_4k_dj (push) Successful in 3m58s
Test / test_heal_csum_4k (push) Successful in 4m55s
Test / test_scrub (push) Successful in 48s
Test / test_scrub_zero_osd_2 (push) Successful in 54s
Test / test_scrub_xor (push) Successful in 50s
Test / test_scrub_pg_size_3 (push) Successful in 1m4s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 46s
Test / test_scrub_ec (push) Successful in 47s
2023-08-13 23:49:52 +03:00
30ce2bd951 Fix buffer insert in cluster_client 2023-08-12 11:08:50 +03:00
b1a0afd10a Aggregate buffer flushes
All checks were successful
Test / test_snapshot_ec (push) Successful in 30s
Test / test_minsize_1 (push) Successful in 13s
Test / test_move_reappear (push) Successful in 17s
Test / test_rm (push) Successful in 14s
Test / test_snapshot_chain (push) Successful in 2m21s
Test / test_snapshot_chain_ec (push) Successful in 2m56s
Test / test_snapshot_down (push) Successful in 25s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 19s
Test / test_rebalance_verify (push) Successful in 3m39s
Test / test_rebalance_verify_imm (push) Successful in 3m35s
Test / test_rebalance_verify_ec (push) Successful in 5m6s
Test / test_rebalance_verify_ec_imm (push) Successful in 6m8s
Test / test_write (push) Successful in 50s
Test / test_write_xor (push) Successful in 55s
Test / test_write_no_same (push) Successful in 14s
Test / test_heal_pg_size_2 (push) Successful in 4m14s
Test / test_heal_ec (push) Successful in 5m2s
Test / test_heal_csum_32k_dmj (push) Successful in 5m1s
Test / test_heal_csum_32k_dj (push) Successful in 6m12s
Test / test_heal_csum_32k (push) Successful in 6m35s
Test / test_heal_csum_4k_dmj (push) Successful in 7m18s
Test / test_heal_csum_4k_dj (push) Successful in 6m24s
Test / test_heal_csum_4k (push) Successful in 5m12s
Test / test_scrub (push) Successful in 1m39s
Test / test_scrub_zero_osd_2 (push) Successful in 49s
Test / test_scrub_xor (push) Successful in 45s
Test / test_scrub_pg_size_3 (push) Successful in 1m19s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 57s
Test / test_scrub_ec (push) Successful in 59s
2023-08-11 11:26:13 +03:00
85b6134910 Return dirty buffers on read in client
All checks were successful
Test / test_snapshot_ec (push) Successful in 22s
Test / test_minsize_1 (push) Successful in 12s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 13s
Test / test_snapshot_chain (push) Successful in 1m57s
Test / test_snapshot_chain_ec (push) Successful in 2m39s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 25s
Test / test_splitbrain (push) Successful in 15s
Test / test_rebalance_verify (push) Successful in 3m4s
Test / test_rebalance_verify_imm (push) Successful in 3m7s
Test / test_rebalance_verify_ec (push) Successful in 3m26s
Test / test_rebalance_verify_ec_imm (push) Successful in 7m35s
Test / test_write (push) Successful in 35s
Test / test_write_xor (push) Successful in 38s
Test / test_write_no_same (push) Successful in 21s
Test / test_heal_pg_size_2 (push) Successful in 4m58s
Test / test_heal_ec (push) Successful in 3m15s
Test / test_heal_csum_32k_dmj (push) Successful in 4m52s
Test / test_heal_csum_32k_dj (push) Successful in 5m3s
Test / test_heal_csum_32k (push) Successful in 5m26s
Test / test_heal_csum_4k_dmj (push) Successful in 6m12s
Test / test_heal_csum_4k_dj (push) Successful in 5m9s
Test / test_heal_csum_4k (push) Successful in 5m13s
Test / test_scrub (push) Successful in 1m24s
Test / test_scrub_zero_osd_2 (push) Successful in 1m15s
Test / test_scrub_xor (push) Successful in 1m5s
Test / test_scrub_pg_size_3 (push) Successful in 1m45s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m11s
Test / test_scrub_ec (push) Successful in 56s
Required at least to return buffers when they need to be replayed, but until
they are actually replayed
2023-08-09 00:57:08 +03:00
b1b07a393d Fix incorrect marking op parts as done with snapshots (could probably lead to client hangs) 2023-08-09 00:57:08 +03:00
7333022adf Add a third I/O mode: O_DIRECT|O_SYNC, change parameters to data_io/meta_io/journal_io 2023-08-09 00:57:08 +03:00
ab8627c9fa Fix monitor retrying failed etcd connection in an infinite loop without pauses 2023-08-09 00:57:08 +03:00
6acf562e01 Release 1.0.0
New features:

- Data and metadata checksums!
  - Metadata checksums are always used with new disk format
  - Data checksums can be turned on with --data_csum_type crc32c for new OSDs
  - Checksum block size can be configured
  - inmemory_metadata now also affects keeping checksums in memory
- Linux page cache I/O caching support which can be enabled separately for
  data, metadata (including checksums) and journal (O_SYNC instead of O_DIRECT)
- Details [here](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/docs/config/layout-osd.en.md#data_csum_type)
- Backwards compatibility is preserved, you can use new OSDs with old disks

Release also includes bug fixes from [0.9.6](https://git.yourcmc.ru/vitalif/vitastor/releases/tag/v0.9.6).

0.9.6 is moved to "-oldstable" repositories and will be available for some additional time.
2023-07-29 18:57:19 +03:00
6f797f429e Add a note about -oldstable 2023-07-29 18:11:13 +03:00
b8a1734465 Reword checksum docs one more time 2023-07-29 14:42:56 +03:00
c752b68167 Remove "without checksums" from docs :) 2023-07-29 12:19:03 +03:00
564df2eb5d Support using buffered I/O with O_SYNC instead of direct I/O 2023-07-29 12:17:18 +03:00
9a427dd70a Allow to override OSD devices in tests 2023-07-29 12:17:18 +03:00
1a4ceb420d Track used blocks, not object versions 2023-07-29 12:17:18 +03:00
21b5124a4b Document data_csum_type and csum_block_size parameters 2023-07-29 12:17:18 +03:00
4181add1f4 Remove creepy "metadata copying" during overwrite
Instead of it, just do not verify checksums of currently mutated objects.
When clean data modification during flush runs in parallel to a read request,
that request may read a mix of old and new data. It may even read a mix of
multiple flushed versions if it lasts too long... And attempts to verify it
using temporary copies of metadata make the algorithm too complex and creepy.
2023-07-29 12:17:18 +03:00
a8464c19af Support keeping checksums on disk (not in memory)
Definitely beneficial for SSD+HDD setups
2023-07-29 12:17:18 +03:00
819cb70cdd Check for "Checksum mismatch" and "BUG" messages during test_heal 2023-07-29 12:17:18 +03:00
3c8e4c6b72 Use clean_dyn_size for space check 2023-07-29 12:17:18 +03:00
8ef4cf89dc Log more details about checksum mismatch in big_writes 2023-07-29 12:17:18 +03:00
7bfb1639ea Use find_holes() in flusher for unification 2023-07-29 12:17:18 +03:00
628e481c32 Fill journal header to know checksum type & size when dumping journal with --all 2023-07-29 12:17:18 +03:00
af6f2046fc Fix journal read checksum verification with inmemory_journal=false 2023-07-29 12:17:18 +03:00
9357e5293e Call fill_partial_checksum_blocks() correctly in regard to COPY_BUF_CSUM_FILL 2023-07-29 12:17:18 +03:00
12851dc07d Wait for journal reads before checking them in clear_incomplete_csum_block_bits 2023-07-29 12:17:18 +03:00
a5753e35a3 Check for checksum mismatch absence in test_heal 2023-07-29 12:17:18 +03:00
d6ee1ca17c Use zero checksum size for zero-length writes 2023-07-29 12:17:18 +03:00
71674d00cf Fix journal data checksum mangling on corrupted block overwrite 2023-07-29 12:17:18 +03:00
ddb078d5a7 Check journal entry size when checking block checksums 2023-07-29 12:17:18 +03:00
d22d56f90a Fix journal data checksum verification on start 2023-07-29 12:17:18 +03:00
eb1331a079 Add more details to "journal entry data is corrupt" messages 2023-07-29 12:17:18 +03:00
c5274f655b ...and partially remove the perversion with bitmap inlining 2023-07-29 12:17:18 +03:00
45e07d6294 Sadly we have to refcount dyn_data... 2023-07-29 12:17:18 +03:00
a8ee391e05 Fix clean block checksum read 2023-07-29 12:17:18 +03:00
de48fa3fd2 Allow to forcibly set meta_format 2023-07-29 12:17:18 +03:00
874a766b62 Rename meta_version to meta_format 2023-07-29 12:17:18 +03:00
384bd8e28f Support old metadata format in vitastor-disk dump-meta 2023-07-29 12:17:18 +03:00
430994f48a Fix journal big_write simple reads after checksum changes 2023-07-29 12:17:18 +03:00
3d7f838c59 Verify checksums in test_heal in different combinations 2023-07-29 12:17:18 +03:00
b909d81f41 Fix bitmap-granular checksums 2023-07-29 12:17:18 +03:00
e42975ffd1 Fix wait_journal_count not being zeroed 2023-07-29 12:17:18 +03:00
93778324e5 Rewrite and fix find_holes into a more obvious version 2023-07-29 12:17:18 +03:00
eeb6727170 Fix missing checksum read offset 2023-07-29 12:17:18 +03:00
7fe82c692e Add a test for checksums 2023-07-29 12:17:18 +03:00
92c6e16eba Fix checksum verification in big_write journal reads 2023-07-29 12:17:18 +03:00
213a9ccb4d Verify checksums during journal reads 2023-07-29 12:17:18 +03:00
a166147110 Add backwards compatibility with non-checksum metadata and journal formats 2023-07-29 12:17:18 +03:00
7d532880c3 Implement large csum_block_size support (more than 4k) + refactor blockstore_flush 2023-07-29 12:17:18 +03:00
0b0405d115 Implement bitmap-granular (4k) metadata & data checksums 2023-07-29 12:17:18 +03:00
e651c93a90 Release 0.9.6
All checks were successful
Test / test_interrupted_rebalance (push) Successful in 1m59s
Test / test_interrupted_rebalance_imm (push) Successful in 3m41s
Test / test_interrupted_rebalance_ec (push) Successful in 1m53s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m29s
Test / test_failure_domain (push) Successful in 50s
Test / test_snapshot (push) Successful in 45s
Test / test_snapshot_ec (push) Successful in 23s
Test / test_minsize_1 (push) Successful in 15s
Test / test_move_reappear (push) Successful in 18s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_chain (push) Successful in 2m23s
Test / test_snapshot_chain_ec (push) Successful in 3m1s
Test / test_snapshot_down (push) Successful in 31s
Test / test_snapshot_down_ec (push) Successful in 32s
Test / test_splitbrain (push) Successful in 19s
Test / test_rebalance_verify (push) Successful in 3m34s
Test / test_rebalance_verify_imm (push) Successful in 3m31s
Test / test_rebalance_verify_ec (push) Successful in 5m14s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m14s
Test / test_write (push) Successful in 44s
Test / test_write_xor (push) Successful in 54s
Test / test_write_no_same (push) Successful in 15s
Test / test_heal_pg_size_2 (push) Successful in 4m38s
Test / test_heal_ec (push) Successful in 3m56s
Test / test_scrub (push) Successful in 36s
Test / test_scrub_zero_osd_2 (push) Successful in 35s
Test / test_scrub_xor (push) Successful in 31s
Test / test_scrub_pg_size_3 (push) Successful in 46s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 25s
Test / test_scrub_ec (push) Successful in 24s
- Fix vitastor-disk partition zeroing (sometimes it was writing garbage instead of zeroes)
- Fix incorrect EC space statistics in `vitastor-cli status`
- Several bug fixes for NFS:
  - Add . and .. in NFS directory listings
  - Return FILE_SYNC from NFS writes if immediate_commit is enabled
  - Return the same "verifier" in NFS COMMIT as in NFS WRITE
  - Make parallel NFS extending writes work correctly, without conflicts
  - Handle parallel NFS extending writes without imposing extra load on etcd
- Support UTF-8 in vitastor-cli table output
- Also allow "0" and "no" as false for inmemory_metadata and inmemory_journal
- Use HDD defaults for HDD-only in automatic `vitastor-disk prepare` mode
2023-07-29 10:54:00 +03:00
988e90be69 Fix vitastor-disk partition zeroing (it was writing random garbage instead of zeroes :D)
All checks were successful
Test / test_interrupted_rebalance (push) Successful in 1m59s
Test / test_interrupted_rebalance_imm (push) Successful in 1m38s
Test / test_interrupted_rebalance_ec (push) Successful in 1m12s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m18s
Test / test_failure_domain (push) Successful in 8s
Test / test_snapshot (push) Successful in 19s
Test / test_snapshot_ec (push) Successful in 20s
Test / test_minsize_1 (push) Successful in 12s
Test / test_move_reappear (push) Successful in 18s
Test / test_rm (push) Successful in 12s
Test / test_snapshot_chain (push) Successful in 1m3s
Test / test_snapshot_chain_ec (push) Successful in 1m25s
Test / test_snapshot_down (push) Successful in 20s
Test / test_snapshot_down_ec (push) Successful in 20s
Test / test_splitbrain (push) Successful in 12s
Test / test_rebalance_verify (push) Successful in 3m16s
Test / test_rebalance_verify_imm (push) Successful in 3m40s
Test / test_rebalance_verify_ec (push) Successful in 4m17s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m20s
Test / test_write (push) Successful in 40s
Test / test_write_xor (push) Successful in 33s
Test / test_write_no_same (push) Successful in 13s
Test / test_heal_pg_size_2 (push) Successful in 3m54s
Test / test_heal_ec (push) Successful in 4m7s
Test / test_scrub (push) Successful in 58s
Test / test_scrub_zero_osd_2 (push) Successful in 53s
Test / test_scrub_xor (push) Successful in 40s
Test / test_scrub_pg_size_3 (push) Successful in 1m6s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 37s
Test / test_scrub_ec (push) Successful in 39s
2023-07-28 12:29:07 +03:00
272a45ad63 Fix modprobe command in docs 2023-07-27 23:57:02 +03:00
25a15d24cf Fix incorrect EC space statistics in vitastor-cli status
All checks were successful
Test / test_etcd_fail (push) Successful in 1m21s
Test / test_interrupted_rebalance_imm (push) Successful in 2m9s
Test / test_interrupted_rebalance_ec (push) Successful in 1m52s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m25s
Test / test_failure_domain (push) Successful in 10s
Test / test_snapshot (push) Successful in 28s
Test / test_snapshot_ec (push) Successful in 30s
Test / test_minsize_1 (push) Successful in 15s
Test / test_move_reappear (push) Successful in 17s
Test / test_rm (push) Successful in 11s
Test / test_snapshot_chain (push) Successful in 2m1s
Test / test_snapshot_chain_ec (push) Successful in 2m41s
Test / test_snapshot_down (push) Successful in 23s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 17s
Test / test_rebalance_verify (push) Successful in 3m9s
Test / test_rebalance_verify_imm (push) Successful in 3m9s
Test / test_rebalance_verify_ec (push) Successful in 3m23s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m38s
Test / test_write (push) Successful in 33s
Test / test_write_xor (push) Successful in 43s
Test / test_write_no_same (push) Successful in 14s
Test / test_heal_pg_size_2 (push) Successful in 4m16s
Test / test_heal_ec (push) Successful in 5m0s
Test / test_scrub (push) Successful in 56s
Test / test_scrub_zero_osd_2 (push) Successful in 41s
Test / test_scrub_xor (push) Successful in 32s
Test / test_scrub_pg_size_3 (push) Successful in 53s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 45s
Test / test_scrub_ec (push) Successful in 40s
2023-07-27 02:26:17 +00:00
700e0e9bff Handle parallel NFS extending writes without imposing extra load on etcd 2023-07-27 02:26:17 +00:00
ab0ca7c00f Return FILE_SYNC from NFS writes if immediate_commit is enabled 2023-07-26 02:09:47 +03:00
f153bc950b Return the same "verifier" in NFS COMMIT as in NFS WRITE
This fixes buffered (not O_DIRECT) NFS writes in Linux - previously they were
hanging in an infinite loop because COMMIT didn't return the same verifier as
previous WRITEs, and NFS kernel client was infinitely retrying the same writes.

Also this probably allows for correct NFS failover, at least for the same
buffered writes, because NFS clients repeat all write requests until a COMMIT
confirms them.
2023-07-26 02:09:47 +03:00
425ff8818d Add . and .. in NFS directory listings
MC, for example, hangs with infinite listing retries without them
2023-07-26 02:09:47 +03:00
9e287a7778 Handle extending writes correctly in NFS proxy
Previously, multiple parallel writes extending file size through NFS were
racing with each other and triggering deletions of part of the written data

I.e. if you mounted vitastor-nfs and just copied a file into it in MC then
you could end up with only a part of the file actually written
2023-07-26 02:09:43 +03:00
f52f58b9e9 Support UTF-8 in vitastor-cli table output 2023-07-25 01:48:57 +00:00
1fe6b0c0e2 Also allow "0" and "no" as false for inmemory_metadata and inmemory_journal 2023-07-25 01:48:57 +00:00
e4237e9ed8 Enable HDD defaults for HDD-only in automatic vitastor-disk prepare mode
Some checks failed
Test / test_interrupted_rebalance (push) Successful in 2m20s
Test / test_interrupted_rebalance_imm (push) Failing after 10m5s
Test / test_interrupted_rebalance_ec (push) Successful in 2m4s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m19s
Test / test_failure_domain (push) Successful in 39s
Test / test_snapshot (push) Successful in 34s
Test / test_snapshot_ec (push) Successful in 23s
Test / test_minsize_1 (push) Successful in 12s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 12s
Test / test_snapshot_chain (push) Successful in 1m59s
Test / test_snapshot_chain_ec (push) Successful in 2m40s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 15s
Test / test_rebalance_verify (push) Successful in 3m10s
Test / test_rebalance_verify_imm (push) Successful in 3m11s
Test / test_rebalance_verify_ec (push) Successful in 3m29s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m54s
Test / test_write (push) Successful in 32s
Test / test_write_xor (push) Successful in 35s
Test / test_write_no_same (push) Successful in 18s
Test / test_heal_pg_size_2 (push) Successful in 3m59s
Test / test_heal_ec (push) Successful in 5m12s
Test / test_scrub (push) Successful in 59s
Test / test_scrub_zero_osd_2 (push) Successful in 47s
Test / test_scrub_xor (push) Successful in 30s
Test / test_scrub_pg_size_3 (push) Successful in 53s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 38s
Test / test_scrub_ec (push) Successful in 22s
2023-07-23 02:33:22 +03:00
10a5fd6abb Release 0.9.5
All checks were successful
Test / test_etcd_fail (push) Successful in 50s
Test / test_interrupted_rebalance (push) Successful in 2m27s
Test / test_interrupted_rebalance_imm (push) Successful in 1m39s
Test / test_interrupted_rebalance_ec (push) Successful in 1m49s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m29s
Test / test_failure_domain (push) Successful in 19s
Test / test_snapshot (push) Successful in 31s
Test / test_snapshot_ec (push) Successful in 26s
Test / test_minsize_1 (push) Successful in 15s
Test / test_rm (push) Successful in 18s
Test / test_snapshot_chain (push) Successful in 1m49s
Test / test_snapshot_chain_ec (push) Successful in 2m51s
Test / test_snapshot_down (push) Successful in 26s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 18s
Test / test_rebalance_verify (push) Successful in 3m8s
Test / test_rebalance_verify_imm (push) Successful in 3m13s
Test / test_rebalance_verify_ec (push) Successful in 3m36s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m59s
Test / test_write (push) Successful in 48s
Test / test_write_xor (push) Successful in 37s
Test / test_write_no_same (push) Successful in 14s
Test / test_heal_pg_size_2 (push) Successful in 3m43s
Test / test_heal_ec (push) Successful in 4m6s
Test / test_scrub (push) Successful in 35s
Test / test_scrub_zero_osd_2 (push) Successful in 34s
Test / test_scrub_xor (push) Successful in 42s
Test / test_scrub_pg_size_3 (push) Successful in 52s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 33s
Test / test_scrub_ec (push) Successful in 28s
A hotfix to 0.9.4 containing only one bugfix: 100% CPU usage in the new QEMU
driver caused by the lack of eventfd reset on io_uring event handling :)
2023-07-21 00:04:41 +03:00
1c316ef350 Reset eventfd on every ringloop::loop() 2023-07-21 00:04:41 +03:00
0b2d12eef1 Remove has_work, it was unnecessary 2023-07-21 00:04:37 +03:00
1c10430ae1 Release 0.9.4
All checks were successful
Test / test_interrupted_rebalance (push) Successful in 1m54s
Test / test_interrupted_rebalance_imm (push) Successful in 2m4s
Test / test_interrupted_rebalance_ec (push) Successful in 1m40s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m25s
Test / test_failure_domain (push) Successful in 15s
Test / test_snapshot (push) Successful in 25s
Test / test_snapshot_ec (push) Successful in 20s
Test / test_minsize_1 (push) Successful in 13s
Test / test_move_reappear (push) Successful in 16s
Test / test_rm (push) Successful in 13s
Test / test_snapshot_chain (push) Successful in 1m56s
Test / test_snapshot_chain_ec (push) Successful in 2m33s
Test / test_snapshot_down (push) Successful in 23s
Test / test_snapshot_down_ec (push) Successful in 22s
Test / test_splitbrain (push) Successful in 16s
Test / test_rebalance_verify (push) Successful in 3m3s
Test / test_rebalance_verify_imm (push) Successful in 3m2s
Test / test_rebalance_verify_ec (push) Successful in 3m13s
Test / test_rebalance_verify_ec_imm (push) Successful in 8m35s
Test / test_write (push) Successful in 33s
Test / test_write_xor (push) Successful in 40s
Test / test_write_no_same (push) Successful in 15s
Test / test_heal_pg_size_2 (push) Successful in 4m25s
Test / test_heal_ec (push) Successful in 3m9s
Test / test_scrub (push) Successful in 1m0s
Test / test_scrub_zero_osd_2 (push) Successful in 46s
Test / test_scrub_xor (push) Successful in 1m1s
Test / test_scrub_pg_size_3 (push) Successful in 1m55s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m25s
Test / test_scrub_ec (push) Successful in 52s
- Improve QEMU driver performance by integrating io_uring in it (up to 1.5x total iops improvement)
- Fix QEMU driver deadlocks which started to reproduce in qemu-img after iothread fixes
- Fix `vitastor-cli status` reporting more etcds than actually exists (fix etcd address duplication in config on reload)
- Fix `vitastor-cli ls` crashing on inodes in non-existing pools
- Delete old garbage /pool/stats/ keys for non-existing (deleted) pools
- Reduce memory usage of etcds initialized by make-etcd script
- Fix OSDs almost always crashing on etcd restart due to "revisions were compacted" (support reloading state from etcd)
- Fix a crash and a stall possible mostly in HDD setups with small journal and big (512k, 900k) random writes
- Add notes about HDDs to documentation. You are officially allowed to use HDD-only Vitastor with HGST/Toshiba/EXOS :)
2023-07-19 02:50:30 +03:00
dfce91d168 Change git url in docs, correct block/vitastor.c path 2023-07-19 01:02:12 +03:00
332a13ba30 Build patched QEMU against local packages 2023-07-19 00:05:02 +03:00
d0e257ee81 Fix non-existing pool handling in vitastor-cli ls
All checks were successful
Test / test_interrupted_rebalance (push) Successful in 2m6s
Test / test_interrupted_rebalance_imm (push) Successful in 3m11s
Test / test_interrupted_rebalance_ec (push) Successful in 2m6s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m20s
Test / test_failure_domain (push) Successful in 22s
Test / test_snapshot (push) Successful in 50s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_minsize_1 (push) Successful in 13s
Test / test_move_reappear (push) Successful in 1m23s
Test / test_rm (push) Successful in 13s
Test / test_snapshot_chain (push) Successful in 2m22s
Test / test_snapshot_chain_ec (push) Successful in 3m6s
Test / test_snapshot_down (push) Successful in 24s
Test / test_snapshot_down_ec (push) Successful in 22s
Test / test_splitbrain (push) Successful in 19s
Test / test_rebalance_verify (push) Successful in 3m28s
Test / test_rebalance_verify_imm (push) Successful in 3m27s
Test / test_rebalance_verify_ec (push) Successful in 9m10s
Test / test_rebalance_verify_ec_imm (push) Successful in 9m29s
Test / test_write (push) Successful in 1m36s
Test / test_write_xor (push) Successful in 2m17s
Test / test_write_no_same (push) Successful in 36s
Test / test_heal_pg_size_2 (push) Successful in 6m27s
Test / test_heal_ec (push) Successful in 5m53s
Test / test_scrub (push) Successful in 44s
Test / test_scrub_zero_osd_2 (push) Successful in 35s
Test / test_scrub_xor (push) Successful in 36s
Test / test_scrub_pg_size_3 (push) Successful in 1m1s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 46s
Test / test_scrub_ec (push) Successful in 36s
2023-07-18 23:52:02 +03:00
004912aac0 Add RPM spec patches for 6.2-el8 and 7.2-el9
Some checks failed
Test / test_interrupted_rebalance (push) Successful in 1m57s
Test / test_interrupted_rebalance_imm (push) Successful in 3m16s
Test / test_interrupted_rebalance_ec (push) Successful in 1m52s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m25s
Test / test_failure_domain (push) Failing after 47s
Test / test_snapshot (push) Successful in 40s
Test / test_snapshot_ec (push) Successful in 24s
Test / test_minsize_1 (push) Successful in 16s
Test / test_move_reappear (push) Failing after 52s
Test / test_rm (push) Successful in 19s
Test / test_snapshot_chain (push) Successful in 2m27s
Test / test_snapshot_chain_ec (push) Failing after 3m9s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 21s
Test / test_splitbrain (push) Successful in 21s
Test / test_rebalance_verify (push) Successful in 3m34s
Test / test_rebalance_verify_imm (push) Successful in 3m32s
Test / test_rebalance_verify_ec (push) Successful in 5m14s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m18s
Test / test_write (push) Successful in 49s
Test / test_write_xor (push) Successful in 58s
Test / test_write_no_same (push) Successful in 13s
Test / test_heal_pg_size_2 (push) Successful in 3m55s
Test / test_heal_ec (push) Failing after 10m39s
Test / test_scrub (push) Successful in 33s
Test / test_scrub_zero_osd_2 (push) Successful in 29s
Test / test_scrub_xor (push) Successful in 30s
Test / test_scrub_pg_size_3 (push) Successful in 1m1s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 44s
Test / test_scrub_ec (push) Successful in 25s
2023-07-18 23:38:14 +03:00
c18e92273e Copy qemu 5.1 -> 5.2 patch for convenience 2023-07-18 23:37:53 +03:00
9815d70ffc It is impossible to use io_uring with older vitastor-client because it does not have vitastor_c_uring_has_work() 2023-07-18 23:37:53 +03:00
4a4627dcab Do not use bool in C library 2023-07-18 23:37:53 +03:00
b963f2fd93 Add QEMU 2.12 patch (basically the same as 3.1) 2023-07-18 23:37:06 +03:00
ba7427020e Fix deadlocks possible in qemu-img after fixing iothread
Deadlock was caused by switching QEMU coroutines directly inside
vitastor_co_read_bitmap_cb() callback. The correct way is to schedule a BH
/BH is a QEMU term for setImmediate() :)/, same as in read and write callbacks.
2023-07-18 23:32:16 +03:00
a0aac7eb2a Update drives
Some checks failed
Test / test_interrupted_rebalance (push) Failing after 1m42s
Test / test_interrupted_rebalance_imm (push) Failing after 1m37s
Test / test_interrupted_rebalance_ec (push) Successful in 1m48s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m25s
Test / test_failure_domain (push) Successful in 19s
Test / test_snapshot (push) Successful in 25s
Test / test_snapshot_ec (push) Successful in 27s
Test / test_minsize_1 (push) Successful in 15s
Test / test_move_reappear (push) Failing after 49s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_chain (push) Successful in 2m24s
Test / test_snapshot_chain_ec (push) Successful in 3m0s
Test / test_snapshot_down (push) Successful in 23s
Test / test_snapshot_down_ec (push) Successful in 25s
Test / test_splitbrain (push) Successful in 22s
Test / test_rebalance_verify (push) Successful in 3m39s
Test / test_rebalance_verify_imm (push) Successful in 3m38s
Test / test_rebalance_verify_ec (push) Successful in 5m29s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m32s
Test / test_write (push) Successful in 54s
Test / test_write_xor (push) Successful in 1m5s
Test / test_write_no_same (push) Successful in 15s
Test / test_heal_pg_size_2 (push) Successful in 3m51s
Test / test_heal_ec (push) Successful in 4m16s
Test / test_scrub (push) Successful in 35s
Test / test_scrub_zero_osd_2 (push) Successful in 32s
Test / test_scrub_xor (push) Successful in 32s
Test / test_scrub_pg_size_3 (push) Successful in 55s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 48s
Test / test_scrub_ec (push) Successful in 27s
2023-07-13 01:49:45 +03:00
ac7b834af3 Disable journal_no_same_sector_overwrites by default for HDD-only
All checks were successful
Test / test_interrupted_rebalance (push) Successful in 1m17s
Test / test_interrupted_rebalance_imm (push) Successful in 1m29s
Test / test_interrupted_rebalance_ec (push) Successful in 1m53s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m31s
Test / test_failure_domain (push) Successful in 9s
Test / test_snapshot (push) Successful in 20s
Test / test_snapshot_ec (push) Successful in 21s
Test / test_minsize_1 (push) Successful in 12s
Test / test_move_reappear (push) Successful in 39s
Test / test_rm (push) Successful in 12s
Test / test_snapshot_chain (push) Successful in 1m10s
Test / test_snapshot_chain_ec (push) Successful in 2m9s
Test / test_snapshot_down (push) Successful in 21s
Test / test_snapshot_down_ec (push) Successful in 22s
Test / test_splitbrain (push) Successful in 14s
Test / test_rebalance_verify (push) Successful in 2m56s
Test / test_rebalance_verify_imm (push) Successful in 2m53s
Test / test_rebalance_verify_ec (push) Successful in 5m39s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m44s
Test / test_write (push) Successful in 49s
Test / test_write_xor (push) Successful in 59s
Test / test_write_no_same (push) Successful in 13s
Test / test_heal_pg_size_2 (push) Successful in 3m41s
Test / test_heal_ec (push) Successful in 3m49s
Test / test_scrub (push) Successful in 33s
Test / test_scrub_zero_osd_2 (push) Successful in 30s
Test / test_scrub_xor (push) Successful in 25s
Test / test_scrub_pg_size_3 (push) Successful in 40s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 35s
Test / test_scrub_ec (push) Successful in 33s
2023-07-10 00:34:35 +03:00
ee0c78fd74 Fix default HDD block size in docs (actual size is 1 MB) 2023-07-09 13:12:30 +03:00
e6646a5b2f Bump QEMU version to vitastor3 2023-07-09 13:01:04 +03:00
ae69662b17 Add "Recommended drives" 2023-07-09 12:59:18 +03:00
57ad4c3636 Add a note about HDD, enable throttling only for hybrid OSDs
Some checks failed
Test / test_interrupted_rebalance (push) Successful in 2m4s
Test / test_interrupted_rebalance_imm (push) Successful in 1m30s
Test / test_interrupted_rebalance_ec (push) Successful in 1m45s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m23s
Test / test_failure_domain (push) Successful in 9s
Test / test_snapshot (push) Successful in 19s
Test / test_snapshot_ec (push) Successful in 19s
Test / test_minsize_1 (push) Successful in 12s
Test / test_move_reappear (push) Failing after 1m32s
Test / test_rm (push) Successful in 12s
Test / test_snapshot_chain (push) Successful in 1m8s
Test / test_snapshot_chain_ec (push) Successful in 2m2s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 21s
Test / test_splitbrain (push) Successful in 13s
Test / test_rebalance_verify (push) Successful in 2m52s
Test / test_rebalance_verify_imm (push) Successful in 2m46s
Test / test_rebalance_verify_ec (push) Successful in 5m9s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m5s
Test / test_write (push) Successful in 40s
Test / test_write_xor (push) Successful in 49s
Test / test_write_no_same (push) Successful in 15s
Test / test_heal_pg_size_2 (push) Successful in 3m43s
Test / test_heal_ec (push) Successful in 4m35s
Test / test_scrub (push) Successful in 37s
Test / test_scrub_zero_osd_2 (push) Successful in 40s
Test / test_scrub_xor (push) Successful in 31s
Test / test_scrub_pg_size_3 (push) Successful in 40s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 26s
Test / test_scrub_ec (push) Successful in 22s
2023-07-09 12:45:11 +03:00
b7e4d0c9bf Fix journal dirty_start position tracking and some debug prints
Fixes two bugs found during HDD testing :-)
1) OSD crashed with "BUG: Attempt to overwrite used offset of the journal" during
   `fio -bs=900k -iodepth=128` test with 16 MB journal
2) OSD stalled during `fio -bs=512k -iodepth=128` test with 64 MB journal
2023-07-09 01:17:55 +03:00
161a23c966 Support reloading state when etcd says "revisions were compacted"
All checks were successful
Test / test_interrupted_rebalance (push) Successful in 3m9s
Test / test_interrupted_rebalance_imm (push) Successful in 1m38s
Test / test_interrupted_rebalance_ec (push) Successful in 1m54s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m36s
Test / test_failure_domain (push) Successful in 9s
Test / test_snapshot (push) Successful in 23s
Test / test_snapshot_ec (push) Successful in 22s
Test / test_minsize_1 (push) Successful in 14s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 12s
Test / test_snapshot_chain (push) Successful in 2m2s
Test / test_snapshot_chain_ec (push) Successful in 2m38s
Test / test_snapshot_down (push) Successful in 21s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 15s
Test / test_rebalance_verify (push) Successful in 3m10s
Test / test_rebalance_verify_imm (push) Successful in 3m10s
Test / test_rebalance_verify_ec (push) Successful in 3m27s
Test / test_rebalance_verify_ec_imm (push) Successful in 6m2s
Test / test_write (push) Successful in 35s
Test / test_write_xor (push) Successful in 45s
Test / test_write_no_same (push) Successful in 22s
Test / test_heal_pg_size_2 (push) Successful in 4m0s
Test / test_heal_ec (push) Successful in 3m52s
Test / test_scrub (push) Successful in 1m1s
Test / test_scrub_zero_osd_2 (push) Successful in 42s
Test / test_scrub_xor (push) Successful in 34s
Test / test_scrub_pg_size_3 (push) Successful in 53s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 45s
Test / test_scrub_ec (push) Successful in 26s
Before this change, OSDs almost always died when one of the etcds was restarted,
even though the rest of them was still in quorum and the lease was still active
2023-07-07 01:33:48 +03:00
2f999d8607 Reduce etcd memory usage
All checks were successful
Test / test_etcd_fail (push) Successful in 1m34s
Test / test_interrupted_rebalance (push) Successful in 1m52s
Test / test_interrupted_rebalance_imm (push) Successful in 2m4s
Test / test_interrupted_rebalance_ec (push) Successful in 2m13s
Test / test_failure_domain (push) Successful in 12s
Test / test_snapshot (push) Successful in 19s
Test / test_snapshot_ec (push) Successful in 20s
Test / test_minsize_1 (push) Successful in 13s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 11s
Test / test_snapshot_chain (push) Successful in 1m56s
Test / test_snapshot_chain_ec (push) Successful in 2m36s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 15s
Test / test_rebalance_verify (push) Successful in 3m6s
Test / test_rebalance_verify_imm (push) Successful in 3m9s
Test / test_rebalance_verify_ec (push) Successful in 3m27s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m30s
Test / test_write (push) Successful in 35s
Test / test_write_xor (push) Successful in 41s
Test / test_write_no_same (push) Successful in 14s
Test / test_heal_pg_size_2 (push) Successful in 3m48s
Test / test_heal_ec (push) Successful in 4m15s
Test / test_scrub (push) Successful in 33s
Test / test_scrub_zero_osd_2 (push) Successful in 31s
Test / test_scrub_xor (push) Successful in 28s
Test / test_scrub_pg_size_3 (push) Successful in 53s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 35s
Test / test_scrub_ec (push) Successful in 28s
With default --snapshot-count 100000 and GOGC=100 it easily reaches 6.6 GB
even when we only store 1-2 MB of data in it
2023-07-06 00:46:26 +03:00
d007a374f2 Delete extra /pool/stats/ keys for non-existing pools
Some checks failed
Test / test_interrupted_rebalance (push) Failing after 10m5s
Test / test_interrupted_rebalance_imm (push) Successful in 1m29s
Test / test_interrupted_rebalance_ec (push) Failing after 10m7s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m32s
Test / test_failure_domain (push) Successful in 8s
Test / test_snapshot (push) Successful in 19s
Test / test_snapshot_ec (push) Successful in 19s
Test / test_minsize_1 (push) Successful in 12s
Test / test_move_reappear (push) Successful in 17s
Test / test_rm (push) Successful in 11s
Test / test_snapshot_chain (push) Successful in 1m1s
Test / test_snapshot_chain_ec (push) Successful in 1m25s
Test / test_snapshot_down (push) Successful in 20s
Test / test_snapshot_down_ec (push) Successful in 19s
Test / test_splitbrain (push) Successful in 12s
Test / test_rebalance_verify (push) Successful in 3m1s
Test / test_rebalance_verify_imm (push) Successful in 4m11s
Test / test_rebalance_verify_ec (push) Successful in 4m19s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m51s
Test / test_write (push) Successful in 31s
Test / test_write_xor (push) Successful in 41s
Test / test_write_no_same (push) Successful in 12s
Test / test_heal_pg_size_2 (push) Successful in 4m10s
Test / test_heal_ec (push) Failing after 10m11s
Test / test_scrub (push) Successful in 43s
Test / test_scrub_zero_osd_2 (push) Successful in 36s
Test / test_scrub_xor (push) Successful in 37s
Test / test_scrub_pg_size_3 (push) Successful in 48s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 23s
Test / test_scrub_ec (push) Successful in 22s
2023-07-06 00:40:13 +03:00
45c0694853 Clear etcd_local addresses on reload and also skip duplicates 2023-07-06 00:39:39 +03:00
57bcba2406 Add notes about VDUSE 2023-07-04 16:51:46 +03:00
30ac899074 Make QEMU driver compatible with older vitastor_client and with systems without io_uring
All checks were successful
Test / test_interrupted_rebalance (push) Successful in 1m45s
Test / test_interrupted_rebalance_imm (push) Successful in 1m25s
Test / test_interrupted_rebalance_ec (push) Successful in 1m23s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m37s
Test / test_failure_domain (push) Successful in 8s
Test / test_snapshot (push) Successful in 20s
Test / test_snapshot_ec (push) Successful in 18s
Test / test_minsize_1 (push) Successful in 12s
Test / test_move_reappear (push) Successful in 16s
Test / test_rm (push) Successful in 11s
Test / test_snapshot_chain (push) Successful in 1m26s
Test / test_snapshot_chain_ec (push) Successful in 2m8s
Test / test_snapshot_down (push) Successful in 23s
Test / test_snapshot_down_ec (push) Successful in 21s
Test / test_splitbrain (push) Successful in 12s
Test / test_rebalance_verify (push) Successful in 2m48s
Test / test_rebalance_verify_imm (push) Successful in 2m48s
Test / test_rebalance_verify_ec (push) Successful in 4m10s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m13s
Test / test_write (push) Successful in 1m16s
Test / test_write_xor (push) Successful in 38s
Test / test_write_no_same (push) Successful in 15s
Test / test_heal_pg_size_2 (push) Successful in 4m10s
Test / test_heal_ec (push) Successful in 3m28s
Test / test_scrub (push) Successful in 43s
Test / test_scrub_zero_osd_2 (push) Successful in 41s
Test / test_scrub_xor (push) Successful in 34s
Test / test_scrub_pg_size_3 (push) Successful in 44s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 38s
Test / test_scrub_ec (push) Successful in 36s
2023-07-04 15:51:43 +03:00
2348d39cf4 Avoid repeated qemu_uring_handlers, add 2.0-2.7 compatibility 2023-07-04 00:28:23 +03:00
3de7929fe5 Integrate v2 - direct epoll 2023-07-04 00:28:23 +03:00
07b2196bc2 Integrate QEMU driver with io_uring 2023-07-04 00:28:23 +03:00
b8e30608d6 Bump QEMU version to vitastor2
All checks were successful
Test / test_etcd_fail (push) Successful in 1m40s
Test / test_interrupted_rebalance (push) Successful in 2m45s
Test / test_interrupted_rebalance_imm (push) Successful in 7m42s
Test / test_interrupted_rebalance_ec (push) Successful in 1m59s
Test / test_failure_domain (push) Successful in 17s
Test / test_snapshot (push) Successful in 56s
Test / test_snapshot_ec (push) Successful in 51s
Test / test_minsize_1 (push) Successful in 11s
Test / test_move_reappear (push) Successful in 17s
Test / test_rm (push) Successful in 11s
Test / test_snapshot_chain (push) Successful in 1m14s
Test / test_snapshot_chain_ec (push) Successful in 1m50s
Test / test_snapshot_down (push) Successful in 23s
Test / test_snapshot_down_ec (push) Successful in 20s
Test / test_splitbrain (push) Successful in 13s
Test / test_rebalance_verify (push) Successful in 3m18s
Test / test_rebalance_verify_imm (push) Successful in 3m15s
Test / test_rebalance_verify_ec (push) Successful in 3m38s
Test / test_rebalance_verify_ec_imm (push) Successful in 5m45s
Test / test_write (push) Successful in 50s
Test / test_write_xor (push) Successful in 46s
Test / test_write_no_same (push) Successful in 15s
Test / test_heal_pg_size_2 (push) Successful in 3m36s
Test / test_heal_ec (push) Successful in 4m37s
Test / test_scrub (push) Successful in 37s
Test / test_scrub_zero_osd_2 (push) Successful in 35s
Test / test_scrub_xor (push) Successful in 29s
Test / test_scrub_pg_size_3 (push) Successful in 33s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 31s
Test / test_scrub_ec (push) Successful in 27s
2023-07-01 00:55:32 +03:00
a612cdca47 Release 0.9.3
- Add patch for libvirt 9.0
- Add support for Proxmox VE 8.0
- Fix compatibility of the QEMU driver with iothread (QEMU rebuilds are coming)
- Fix vitastor-cli rm-data/rm/merge hanging when some OSDs are down.
  Allow deletions in unclean cluster at the cost of some data possibly
  "reappearing" when those OSDs start back. In that case you can just repeat
  the deletion request using rm-data.
- A bunch of bug fixes for snapshots:
  - Fix snapshot reads often not working at all with snapshot chain size > 2
  - Fix optimized snapshot data merge (children to parent)
  - Fix updating of image name index key during optimized merge
  - Fix auto-selection preventing the use of optimized merge with only 1 snapshot
  - Fix incorrect CAS retries during snapshot merge
  - Fix snapshot merge progress reporting
- Fix primary_read bitmap buffers use-after-free which could lead to
  incorrect allocation map reads
- Remove /usr/local/bin path from make-etcd
- Some documentation fixes
2023-07-01 00:25:58 +03:00
164 changed files with 7052 additions and 1666 deletions

View File

@@ -622,6 +622,114 @@ jobs:
echo ""
done
test_heal_csum_32k_dmj:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: TEST_NAME=csum_32k_dmj OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_metadata false --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_csum_32k_dj:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: TEST_NAME=csum_32k_dj OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_csum_32k:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: TEST_NAME=csum_32k OSD_ARGS="--data_csum_type crc32c --csum_block_size 32k" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_csum_4k_dmj:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: TEST_NAME=csum_4k_dmj OSD_ARGS="--data_csum_type crc32c --inmemory_metadata false --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_csum_4k_dj:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_csum_4k:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 10
run: TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS /root/vitastor/tests/test_heal.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_scrub:
runs-on: ubuntu-latest
needs: build

View File

@@ -7,7 +7,8 @@ for my $line (<>)
if ($line =~ /\.\/(test_[^\.]+)/s)
{
chomp $line;
my $test_name = $1;
my $base_name = $1;
my $test_name = $base_name;
my $timeout = 3;
if ($test_name eq 'test_etcd_fail' || $test_name eq 'test_heal' || $test_name eq 'test_add_osd' ||
$test_name eq 'test_interrupted_rebalance' || $test_name eq 'test_rebalance_verify')
@@ -16,7 +17,12 @@ for my $line (<>)
}
while ($line =~ /([^\s=]+)=(\S+)/gs)
{
if ($1 eq 'SCHEME' && $2 eq 'ec')
if ($1 eq 'TEST_NAME')
{
$test_name = $base_name.'_'.$2;
last;
}
elsif ($1 eq 'SCHEME' && $2 eq 'ec')
{
$test_name .= '_ec';
}

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "0.9.2")
set(VERSION "1.2.0")
add_subdirectory(src)

View File

@@ -15,7 +15,7 @@ Vitastor архитектурно похож на Ceph, что означает
и автоматическое распределение данных по любому числу дисков любого размера с настраиваемыми схемами
избыточности - репликацией или с произвольными кодами коррекции ошибок.
Vitastor нацелен на SSD и SSD+HDD кластеры с как минимум 10 Гбит/с сетью, поддерживает
Vitastor нацелен в первую очередь на SSD и SSD+HDD кластеры с как минимум 10 Гбит/с сетью, поддерживает
TCP и RDMA и на хорошем железе может достигать задержки 4 КБ чтения и записи на уровне ~0.1 мс,
что примерно в 10 раз быстрее, чем Ceph и другие популярные программные СХД.
@@ -50,6 +50,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
- Параметры
- [Общие](docs/config/common.ru.md)
- [Сетевые](docs/config/network.ru.md)
- [Клиентский код](docs/config/client.en.md)
- [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
- [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
- [Прочие параметры OSD](docs/config/osd.ru.md)

View File

@@ -14,8 +14,8 @@ Vitastor is architecturally similar to Ceph which means strong consistency,
primary-replication, symmetric clustering and automatic data distribution over any
number of drives of any size with configurable redundancy (replication or erasure codes/XOR).
Vitastor targets SSD and SSD+HDD clusters with at least 10 Gbit/s network, supports
TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1 ms
Vitastor targets primarily SSD and SSD+HDD clusters with at least 10 Gbit/s network,
supports TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1 ms
with proper hardware which is ~10 times faster than other popular SDS's like Ceph
or internal systems of public clouds.
@@ -50,6 +50,7 @@ Read more details below in the documentation.
- Parameter Reference
- [Common](docs/config/common.en.md)
- [Network](docs/config/network.en.md)
- [Client](docs/config/client.en.md)
- [Global Disk Layout](docs/config/layout-cluster.en.md)
- [OSD Disk Layout](docs/config/layout-osd.en.md)
- [OSD Runtime Parameters](docs/config/osd.en.md)

View File

@@ -1,4 +1,4 @@
VERSION ?= v0.9.2
VERSION ?= v1.2.0
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v0.9.2
image: vitalif/vitastor-csi:v1.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -35,10 +35,13 @@ rules:
verbs: ["get", "list", "watch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshots"]
verbs: ["get", "list"]
verbs: ["get", "list", "patch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshots/status"]
verbs: ["get", "list", "patch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotcontents"]
verbs: ["create", "get", "list", "watch", "update", "delete"]
verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotclasses"]
verbs: ["get", "list", "watch"]
@@ -53,7 +56,7 @@ rules:
verbs: ["get", "list", "watch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotcontents/status"]
verbs: ["update"]
verbs: ["update", "patch"]
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get"]

View File

@@ -23,6 +23,11 @@ metadata:
name: csi-vitastor-provisioner
spec:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
maxSurge: 0
selector:
matchLabels:
app: csi-vitastor-provisioner
@@ -46,7 +51,7 @@ spec:
priorityClassName: system-cluster-critical
containers:
- name: csi-provisioner
image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
args:
- "--csi-address=$(ADDRESS)"
- "--v=5"
@@ -116,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v0.9.2
image: vitalif/vitastor-csi:v1.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -17,3 +17,4 @@ parameters:
# multiple etcdUrls may be specified, delimited by comma
#etcdUrl: "http://192.168.7.2:2379"
#etcdPrefix: "/vitastor"
allowVolumeExpansion: true

View File

@@ -0,0 +1,7 @@
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshotClass
metadata:
name: vitastor-snapclass
driver: csi.vitastor.io
deletionPolicy: Delete
parameters:

View File

@@ -0,0 +1,16 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: test-vitastor-clone
spec:
storageClassName: vitastor
dataSource:
name: snap1
kind: VolumeSnapshot
apiGroup: snapshot.storage.k8s.io
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi

View File

@@ -0,0 +1,8 @@
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
name: snap1
spec:
volumeSnapshotClassName: vitastor-snapclass
source:
persistentVolumeClaimName: test-vitastor-pvc

View File

@@ -9,6 +9,7 @@ require (
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/grpc v1.33.1
google.golang.org/protobuf v1.24.0
k8s.io/klog v1.0.0
k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
)

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "0.9.2"
vitastorCSIDriverVersion = "1.2.0"
)
// Config struct fills the parameters of request or user input

View File

@@ -20,6 +20,7 @@ import (
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/timestamppb"
"github.com/container-storage-interface/spec/lib/go/csi"
)
@@ -45,6 +46,7 @@ type InodeConfig struct
ParentPool uint64 `json:"parent_pool,omitempty"`
ParentId uint64 `json:"parent_id,omitempty"`
Readonly bool `json:"readonly,omitempty"`
CreateTs uint64 `json:"create_ts,omitempty"`
}
type ControllerServer struct
@@ -178,27 +180,43 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
// Support creation from snapshot
var src *csi.VolumeContentSource
if (req.VolumeContentSource.GetSnapshot() != nil)
{
snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
if (snapId != "")
{
snapVars := make(map[string]string)
err := json.Unmarshal([]byte(snapId), &snapVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"])
src = &csi.VolumeContentSource{
Type: &csi.VolumeContentSource_Snapshot{
Snapshot: &csi.VolumeContentSource_SnapshotSource{
SnapshotId: snapId,
},
},
}
}
}
// Create image using vitastor-cli
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
_, err := invokeCLI(ctxVars, args)
if (err != nil)
{
if (strings.Index(err.Error(), "already exists") > 0)
{
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
inodeCfg, err := invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
var inodeCfg []InodeConfig
err = json.Unmarshal(stat, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
}
if (len(inodeCfg) == 0)
{
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
}
if (inodeCfg[0].Size < uint64(volSize))
{
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
@@ -217,6 +235,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
// Ugly, but VolumeContext isn't passed to DeleteVolume :-(
VolumeId: string(volumeIdJson),
CapacityBytes: volSize,
ContentSource: src,
},
}, nil
}
@@ -230,15 +249,15 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := ctxVars["name"]
volName := volVars["name"]
ctxVars, _, _ = GetConnectionParams(ctxVars)
ctxVars, _, _ := GetConnectionParams(volVars)
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
if (err != nil)
@@ -344,6 +363,8 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
// TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
} {
controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
}
@@ -353,28 +374,214 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
}, nil
}
func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
{
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
if (err != nil)
{
return nil, err
}
var inodeCfg []InodeConfig
err = json.Unmarshal(stat, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
}
if (expectExist && len(inodeCfg) == 0)
{
return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
}
return inodeCfg, nil
}
// CreateSnapshot create snapshot of an existing PV
func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
{
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
}
if (req.SourceVolumeId == "" || req.Name == "")
{
return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
}
// snapshot name
snapName := req.Name
// req.VolumeId is an ugly json string in our case :)
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := ctxVars["name"]
// Create image using vitastor-cli
_, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
if (err != nil && strings.Index(err.Error(), "already exists") <= 0)
{
return nil, err
}
// Check created snapshot
inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
if (err != nil)
{
return nil, err
}
// Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
ctxVars["snapshot"] = snapName
snapIdJson, _ := json.Marshal(ctxVars)
return &csi.CreateSnapshotResponse{
Snapshot: &csi.Snapshot{
SizeBytes: int64(inodeCfg[0].Size),
SnapshotId: string(snapIdJson),
SourceVolumeId: req.SourceVolumeId,
CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
ReadyToUse: true,
},
}, nil
}
// DeleteSnapshot delete provided snapshot of a PV
func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
{
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
}
if (req.SnapshotId == "")
{
return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
}
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
}
volName := volVars["name"]
snapName := volVars["snapshot"]
ctxVars, _, _ := GetConnectionParams(volVars)
_, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
if (err != nil)
{
return nil, err
}
return &csi.DeleteSnapshotResponse{}, nil
}
// ListSnapshots list the snapshots of a PV
func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
{
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
ctxVars, _, _ := GetConnectionParams(volVars)
inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
if (err != nil)
{
return nil, err
}
resp := &csi.ListSnapshotsResponse{}
for _, ino := range inodeCfg
{
snapName := ino.Name[len(volName)+1:]
if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
{
}
else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
{
volVars["snapshot"] = snapName
snapIdJson, _ := json.Marshal(volVars)
resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
Snapshot: &csi.Snapshot{
SizeBytes: int64(ino.Size),
SnapshotId: string(snapIdJson),
SourceVolumeId: req.SourceVolumeId,
CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
ReadyToUse: true,
},
})
}
else
{
resp.NextToken = snapName
break
}
}
return resp, nil
}
// ControllerExpandVolume resizes a volume
// ControllerExpandVolume increases the size of a volume
func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
{
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
{
return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
}
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
ctxVars, _, _ := GetConnectionParams(volVars)
inodeCfg, err := invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
{
sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096
_, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
if (err != nil)
{
return nil, err
}
inodeCfg, err = invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
}
return &csi.ControllerExpandVolumeResponse{
CapacityBytes: int64(inodeCfg[0].Size),
NodeExpansionRequired: false,
}, nil
}
// ControllerGetVolume get volume info

View File

@@ -49,6 +49,13 @@ func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.Ge
},
},
},
{
Type: &csi.PluginCapability_VolumeExpansion_{
VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
},
},
},
},
}, nil
}

View File

@@ -70,10 +70,10 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that it's not already mounted
_, error := mount.IsNotMountPoint(ns.mounter, targetPath)
if (error != nil)
_, err := mount.IsNotMountPoint(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(error))
if (os.IsNotExist(err))
{
if (isBlock)
{
@@ -102,12 +102,12 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
}
else
{
return nil, status.Error(codes.Internal, error.Error())
return nil, status.Error(codes.Internal, err.Error())
}
}
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
@@ -147,70 +147,74 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
}
devicePath := strings.TrimSpace(stdoutStr)
// Check existing format
diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
existingFormat, err := diskMounter.GetDiskFormat(devicePath)
if (err != nil)
{
klog.Errorf("failed to get disk format for path %s, error: %v", err)
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, err
}
// Format the device (ext4 or xfs)
fsType := req.GetVolumeCapability().GetMount().GetFsType()
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
opt = append(opt, "_netdev")
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
!Contains(opt, "ro"))
{
opt = append(opt, "ro")
}
if (fsType == "xfs")
{
opt = append(opt, "nouuid")
}
readOnly := Contains(opt, "ro")
if (existingFormat == "" && !readOnly)
{
args := []string{}
switch fsType
{
case "ext4":
args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
case "xfs":
args = []string{"-K", devicePath}
}
if (len(args) > 0)
{
cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
if (cmdErr != nil)
{
klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, cmdErr.Error())
}
}
}
if (isBlock)
{
opt = append(opt, "bind")
err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
}
else
{
// Check existing format
existingFormat, err := diskMounter.GetDiskFormat(devicePath)
if (err != nil)
{
klog.Errorf("failed to get disk format for path %s, error: %v", err)
goto unmap
}
// Format the device (ext4 or xfs)
fsType := req.GetVolumeCapability().GetMount().GetFsType()
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
opt = append(opt, "_netdev")
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
!Contains(opt, "ro"))
{
opt = append(opt, "ro")
}
if (fsType == "xfs")
{
opt = append(opt, "nouuid")
}
readOnly := Contains(opt, "ro")
if (existingFormat == "" && !readOnly)
{
var cmdOut []byte
switch fsType
{
case "ext4":
args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
case "xfs":
cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
}
if (err != nil)
{
klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
goto unmap
}
}
err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
// Try to run online resize on mount.
// FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
if (err == nil && existingFormat != "" && !readOnly)
{
var cmdOut []byte
switch (fsType)
{
case "ext4":
cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
case "xfs":
cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
}
if (err != nil)
{
klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
goto unmap
}
}
}
if (err != nil)
{
@@ -218,15 +222,18 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
"failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
devicePath, targetPath, volName, err,
)
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, err.Error())
goto unmap
}
return &csi.NodePublishVolumeResponse{}, nil
unmap:
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, err.Error())
}
// NodeUnpublishVolume unmounts the volume from the target path

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (0.9.2-1) unstable; urgency=medium
vitastor (1.2.0-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (0.9.2-1) unstable; urgency=medium
vitastor (1.2.0-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

2
debian/control vendored
View File

@@ -2,7 +2,7 @@ Source: vitastor
Section: admin
Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
Standards-Version: 4.5.0
Homepage: https://vitastor.io/
Rules-Requires-Root: no

View File

@@ -28,13 +28,19 @@ RUN apt-get --download-only source qemu
ADD patches /root/vitastor/patches
ADD src/qemu_driver.c /root/vitastor/src/qemu_driver.c
#RUN set -e; \
# apt-get install -y wget; \
# wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
# (echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
# (echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
# apt-get update; \
# apt-get install -y vitastor-client vitastor-client-dev quilt
RUN set -e; \
apt-get install -y wget; \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg; \
(echo deb http://vitastor.io/debian $REL main > /etc/apt/sources.list.d/vitastor.list); \
(echo "APT::Install-Recommends false;" > /etc/apt/apt.conf) && \
dpkg -i /root/packages/vitastor-$REL/vitastor-client_*.deb /root/packages/vitastor-$REL/vitastor-client-dev_*.deb; \
apt-get update; \
apt-get install -y vitastor-client vitastor-client-dev quilt; \
apt-get install -y quilt; \
mkdir -p /root/packages/qemu-$REL; \
rm -rf /root/packages/qemu-$REL/*; \
cd /root/packages/qemu-$REL; \
@@ -48,7 +54,8 @@ RUN set -e; \
quilt add block/vitastor.c; \
cp /root/vitastor/src/qemu_driver.c block/vitastor.c; \
quilt refresh; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor1; \
V=$(head -n1 debian/changelog | perl -pe 's/5\.2\+dfsg-9/5.2+dfsg-11/; s/^.*\((.*?)(~bpo[\d\+]*)?\).*$/$1/')+vitastor4; \
if [ "$REL" = bullseye ]; then V=${V}bullseye; fi; \
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v $V 'Plug Vitastor block driver'; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/qemu-$REL/qemu-*/

View File

@@ -35,8 +35,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-0.9.2; \
cd vitastor-0.9.2; \
cp -r /root/vitastor vitastor-1.2.0; \
cd vitastor-1.2.0; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_0.9.2.orig.tar.xz vitastor-0.9.2; \
cd vitastor-0.9.2; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.2.0.orig.tar.xz vitastor-1.2.0; \
cd vitastor-1.2.0; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -33,6 +33,7 @@ In the future, additional configuration methods may be added:
- [Common](config/common.en.md)
- [Network](config/network.en.md)
- [Client](config/client.en.md)
- [Global Disk Layout](config/layout-cluster.en.md)
- [OSD Disk Layout](config/layout-osd.en.md)
- [OSD Runtime Parameters](config/osd.en.md)

View File

@@ -36,6 +36,7 @@
- [Общие](config/common.ru.md)
- [Сеть](config/network.ru.md)
- [Клиентский код](config/client.ru.md)
- [Глобальные дисковые параметры](config/layout-cluster.ru.md)
- [Дисковые параметры OSD](config/layout-osd.ru.md)
- [Прочие параметры OSD](config/osd.ru.md)

103
docs/config/client.en.md Normal file
View File

@@ -0,0 +1,103 @@
[Documentation](../../README.md#documentation) → [Configuration](../config.en.md) → Client Parameters
-----
[Читать на русском](client.ru.md)
# Client Parameters
These parameters apply only to clients and affect their interaction with
the cluster.
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
## client_max_dirty_bytes
- Type: integer
- Default: 33554432
- Can be changed online: yes
Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
## client_max_dirty_ops
- Type: integer
- Default: 1024
- Can be changed online: yes
Same as client_max_dirty_bytes, but instead of total size, limits the number
of uncommitted write operations.
## client_enable_writeback
- Type: boolean
- Default: false
- Can be changed online: yes
This parameter enables client-side write buffering. This means that write
requests are accumulated in memory for a short time before being sent to
a Vitastor cluster which allows to send them in parallel and increase
performance of some applications. Writes are buffered until client forces
a flush with fsync() or until the amount of buffered writes exceeds the
limit.
Write buffering significantly increases performance of some applications,
for example, CrystalDiskMark under Windows (LOL :-D), but also any other
applications if they do writes in one of two non-optimal ways: either if
they do a lot of small (4 kb or so) sequential writes, or if they do a lot
of small random writes, but without any parallelism or asynchrony, and also
without calling fsync().
With write buffering enabled, you can expect around 22000 T1Q1 random write
iops in QEMU more or less regardless of the quality of your SSDs, and this
number is in fact bound by QEMU itself rather than Vitastor (check it
yourself by adding a "driver=null-co" disk in QEMU). Without write
buffering, the current record is 9900 iops, but the number is usually
even lower with non-ideal hardware, for example, it may be 5000 iops.
Even when this parameter is enabled, write buffering isn't enabled until
the client explicitly allows it, because enabling it without the client
being aware of the fact that his writes may be buffered may lead to data
loss. Because of this, older versions of clients don't support write
buffering at all, newer versions of the QEMU driver allow write buffering
only if it's enabled in disk settings with `-blockdev cache.direct=false`,
and newer versions of FIO only allow write buffering if you don't specify
`-direct=1`. NBD and NFS drivers allow write buffering by default.
You can overcome this restriction too with the `client_writeback_allowed`
parameter, but you shouldn't do that unless you **really** know what you
are doing.
## client_max_buffered_bytes
- Type: integer
- Default: 33554432
- Can be changed online: yes
Maximum total size of buffered writes which triggers write-back when reached.
## client_max_buffered_ops
- Type: integer
- Default: 1024
- Can be changed online: yes
Maximum number of buffered writes which triggers write-back when reached.
Multiple consecutive modified data regions are counted as 1 write here.
## client_max_writeback_iodepth
- Type: integer
- Default: 256
- Can be changed online: yes
Maximum number of parallel writes when flushing buffered data to the server.

103
docs/config/client.ru.md Normal file
View File

@@ -0,0 +1,103 @@
[Документация](../../README-ru.md#документация) → [Конфигурация](../config.ru.md) → Параметры клиентского кода
-----
[Read in English](client.en.md)
# Параметры клиентского кода
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
затрагивают логику их работы с кластером.
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
- [client_max_buffered_bytes](#client_max_buffered_bytes)
- [client_max_buffered_ops](#client_max_buffered_ops)
- [client_max_writeback_iodepth](#client_max_writeback_iodepth)
## client_max_dirty_bytes
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
## client_max_dirty_ops
- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да
Аналогично client_max_dirty_bytes, но ограничивает количество
незафиксированных операций записи вместо их общего объёма.
## client_enable_writeback
- Тип: булево (да/нет)
- Значение по умолчанию: false
- Можно менять на лету: да
Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
означает, что операции записи отправляются на кластер Vitastor не сразу, а
могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
пока клиент не вызовет fsync.
Буферизация значительно повышает производительность некоторых приложений,
например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
есть, например, отправляя 128 операций записи в разные места диска, но не
все сразу с помощью асинхронного I/O, а по одной.
В QEMU с буферизацией записи можно ожидать показателя примерно 22000
операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
в секунду.
При этом, даже если данный параметр включён, буферизация не включается, если
явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
буферизуются, это может приводить к потере данных. Поэтому в старых версиях
клиентских драйверов буферизация записи не включается вообще, в новых
версиях QEMU-драйвера включается, только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
но делать так не надо, если только вы не уверены в том, что делаете, на все
100%. :-)
## client_max_buffered_bytes
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
Максимальный общий размер буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер.
## client_max_buffered_ops
- Тип: целое число
- Значение по умолчанию: 1024
- Можно менять на лету: да
Максимальное количество буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер. При этом несколько
последовательных изменённых областей здесь считаются 1 записью.
## client_max_writeback_iodepth
- Тип: целое число
- Значение по умолчанию: 256
- Можно менять на лету: да
Максимальное число параллельных операций записи при сбросе буферов на сервер.

View File

@@ -33,12 +33,13 @@ Size of objects (data blocks) into which all physical and virtual drives
in Vitastor, affects memory usage, write amplification and I/O load
distribution effectiveness.
Recommended default block size is 128 KB for SSD and 4 MB for HDD. In fact,
it's possible to use 4 MB for SSD too - it will lower memory usage, but
Recommended default block size is 128 KB for SSD and 1 MB for HDD. In fact,
it's possible to use 1 MB for SSD too - it will lower memory usage, but
may increase average WA and reduce linear performance.
OSD memory usage is roughly (SIZE / BLOCK * 68 bytes) which is roughly
544 MB per 1 TB of used disk space with the default 128 KB block size.
With 1 MB it's 8 times lower.
## bitmap_granularity
@@ -95,8 +96,9 @@ SSD cache or "media-cache" - for example, a lot of Seagate EXOS drives have
it (they have internal SSD cache even though it's not stated in datasheets).
Setting this parameter to "all" or "small" in OSD parameters requires enabling
disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
enabling disable_data_fsync.
[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
TLDR: For optimal performance, set immediate_commit to "all" if you only use
SSDs with supercapacitor-based power loss protection (nonvolatile

View File

@@ -33,14 +33,14 @@ OSD) могут сосуществовать в одном кластере Vita
настроек, влияет на потребление памяти, объём избыточной записи (write
amplification) и эффективность распределения нагрузки по OSD.
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 4
мегабайта для HDD. В принципе, для SSD можно тоже использовать 4 мегабайта,
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 1 мегабайт
для HDD. В принципе, для SSD можно тоже использовать блок размером 1 мегабайт,
это понизит использование памяти, но ухудшит распределение нагрузки и в
среднем увеличит WA.
Потребление памяти OSD составляет примерно (РАЗМЕР / БЛОК * 68 байт),
т.е. примерно 544 МБ памяти на 1 ТБ занятого места на диске при
стандартном 128 КБ блоке.
стандартном 128 КБ блоке. При 1 МБ блоке памяти нужно в 8 раз меньше.
## bitmap_granularity
@@ -103,8 +103,9 @@ HDD-дисках с внутренним SSD или "медиа" кэшем - н
указано в спецификациях).
Указание "all" или "small" в настройках / командной строке OSD требует
включения disable_journal_fsync и disable_meta_fsync, значение "all" также
требует включения disable_data_fsync.
включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
Итого, вкратце: для оптимальной производительности установите
immediate_commit в значение "all", если вы используете в кластере только SSD

View File

@@ -24,6 +24,8 @@ initialization and can't be changed after it without losing data.
- [disable_journal_fsync](#disable_journal_fsync)
- [disable_device_lock](#disable_device_lock)
- [disk_alignment](#disk_alignment)
- [data_csum_type](#data_csum_type)
- [csum_block_size](#csum_block_size)
## data_device
@@ -174,3 +176,43 @@ Intel Optane (probably, not tested yet).
Clients don't need to be aware of disk_alignment, so it's not required to
put a modified value into etcd key /vitastor/config/global.
## data_csum_type
- Type: string
- Default: none
Data checksum type to use. May be "crc32c" or "none". Set to "crc32c" to
enable data checksums.
## csum_block_size
- Type: integer
- Default: 4096
Checksum calculation block size.
Must be equal or a multiple of [bitmap_granularity](layout-cluster.en.md#bitmap_granularity)
(which is usually 4 KB).
Checksums increase metadata size by 4 bytes per each csum_block_size of data.
Checksums are always a tradeoff:
1. You either sacrifice +1 GB RAM per 1 TB of data
2. Or you raise csum_block_size, for example, to 32k and sacrifice
50% random write iops due to checksum read-modify-write
3. Or you turn off [inmemory_metadata](osd.en.md#inmemory_metadata) and
sacrifice 50% random read iops due to checksum reads
All-flash clusters usually have enough RAM to use default csum_block_size,
which uses 1 GB RAM per 1 TB of data. HDD clusters usually don't.
Thus, recommended setups are:
1. All-flash, 1 GB RAM per 1 TB data: default (csum_block_size=4k)
2. All-flash, less RAM: csum_block_size=4k + inmemory_metadata=false
3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
4. HDD-only, faster random read: csum_block_size=32k
5. HDD-only, faster random write: csum_block_size=4k +
inmemory_metadata=false + meta_io=cached
See also [meta_io](osd.en.md#meta_io).

View File

@@ -25,6 +25,8 @@
- [disable_journal_fsync](#disable_journal_fsync)
- [disable_device_lock](#disable_device_lock)
- [disk_alignment](#disk_alignment)
- [data_csum_type](#data_csum_type)
- [csum_block_size](#csum_block_size)
## data_device
@@ -183,3 +185,47 @@ journal_block_size и meta_block_size. Однако единственные SSD
Клиентам не обязательно знать про disk_alignment, так что помещать значение
этого параметра в etcd в /vitastor/config/global не нужно.
## data_csum_type
- Тип: строка
- Значение по умолчанию: none
Тип используемых OSD контрольных сумм данных. Может быть "crc32c" или "none".
Установите в "crc32c", чтобы включить расчёт и проверку контрольных сумм данных.
Следует понимать, что контрольные суммы в зависимости от размера блока их
расчёта либо увеличивают потребление памяти, либо снижают производительность.
Подробнее смотрите в описании параметра [csum_block_size](#csum_block_size).
## csum_block_size
- Тип: целое число
- Значение по умолчанию: 4096
Размер блока расчёта контрольных сумм.
Должен быть равен или кратен [bitmap_granularity](layout-cluster.ru.md#bitmap_granularity)
(который обычно равен 4 КБ).
Контрольные суммы увеличивают размер метаданных на 4 байта на каждые
csum_block_size данных.
Контрольные суммы - это всегда компромисс:
1. Вы либо жертвуете потреблением +1 ГБ памяти на 1 ТБ дискового пространства
2. Либо вы повышаете csum_block_size до, скажем, 32k и жертвуете 50%
скорости случайной записи из-за цикла чтения-изменения-записи для расчёта
новых контрольных сумм
3. Либо вы отключаете [inmemory_metadata](osd.ru.md#inmemory_metadata) и
жертвуете 50% скорости случайного чтения из-за чтения контрольных сумм
с диска
Таким образом, рекомендуются следующие варианты настроек:
1. All-flash, 1 ГБ памяти на 1 ТБ данных: по умолчанию (csum_block_size=4k)
2. All-flash, меньше памяти: csum_block_size=4k + inmemory_metadata=false
3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
4. Только HDD, быстрее случайное чтение: csum_block_size=32k
5. Только HDD, быстрее случайная запись: csum_block_size=4k +
inmemory_metadata=false + meta_io=cached
Смотрите также [meta_io](osd.ru.md#meta_io).

View File

@@ -30,7 +30,6 @@ between clients, OSDs and etcd.
- [etcd_slow_timeout](#etcd_slow_timeout)
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [client_dirty_limit](#client_dirty_limit)
## tcp_header_buffer_size
@@ -240,17 +239,3 @@ etcd_report_interval to guarantee that keepalive actually works.
etcd websocket ping interval required to keep the connection alive and
detect disconnections quickly.
## client_dirty_limit
- Type: integer
- Default: 33554432
- Can be changed online: yes
Without immediate_commit=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
This parameter doesn't affect OSDs themselves.

View File

@@ -30,7 +30,6 @@
- [etcd_slow_timeout](#etcd_slow_timeout)
- [etcd_keepalive_timeout](#etcd_keepalive_timeout)
- [etcd_ws_keepalive_timeout](#etcd_ws_keepalive_timeout)
- [client_dirty_limit](#client_dirty_limit)
## tcp_header_buffer_size
@@ -251,17 +250,3 @@ etcd_report_interval, чтобы keepalive гарантированно рабо
- Можно менять на лету: да
Интервал проверки живости вебсокет-подключений к etcd.
## client_dirty_limit
- Тип: целое число
- Значение по умолчанию: 33554432
- Можно менять на лету: да
При работе без immediate_commit=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
Параметр не влияет на сами OSD.

View File

@@ -11,6 +11,7 @@ initialization and can be changed - either with an OSD restart or, for some of
them, even without restarting by updating configuration in etcd.
- [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
- [run_primary](#run_primary)
- [osd_network](#osd_network)
- [bind_address](#bind_address)
@@ -31,6 +32,9 @@ them, even without restarting by updating configuration in etcd.
- [max_flusher_count](#max_flusher_count)
- [inmemory_metadata](#inmemory_metadata)
- [inmemory_journal](#inmemory_journal)
- [data_io](#data_io)
- [meta_io](#meta_io)
- [journal_io](#journal_io)
- [journal_sector_buffer_count](#journal_sector_buffer_count)
- [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
- [throttle_small_writes](#throttle_small_writes)
@@ -53,11 +57,21 @@ them, even without restarting by updating configuration in etcd.
- Type: seconds
- Default: 5
Interval at which OSDs report their state to etcd. Affects OSD lease time
Interval at which OSDs report their liveness to etcd. Affects OSD lease time
and thus the failover speed. Lease time is equal to this parameter value
plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
that every OSD always refreshes its lease in time.
## etcd_stats_interval
- Type: seconds
- Default: 30
Interval at which OSDs report their statistics to etcd. Highly affects the
imposed load on etcd, because statistics include a key for every OSD and
for every PG. At the same time, low statistic intervals make `vitastor-cli`
statistics more responsive.
## run_primary
- Type: boolean
@@ -255,6 +269,60 @@ is typically very small because it's sufficient to have 16-32 MB journal
for SSD OSDs. However, in theory it's possible that you'll want to turn it
off for hybrid (HDD+SSD) OSDs with large journals on quick devices.
## data_io
- Type: string
- Default: direct
I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.
Choose "cached" to use Linux page cache. This may improve read performance
for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
decrease write performance for fast disks because page cache is an overhead
itself.
Choose "directsync" to use [immediate_commit](layout-cluster.ru.md#immediate_commit)
(which requires disable_data_fsync) with drives having write-back cache
which can't be turned off, for example, Intel Optane. Also note that *some*
desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
disable_data_fsync unsafe even with "directsync".
## meta_io
- Type: string
- Default: direct
I/O mode for *metadata*. One of "direct", "cached" or "directsync".
"cached" may improve read performance, but only under the following conditions:
1. your drives are relatively slow (HDD, SATA SSD), and
2. checksums are enabled, and
3. [inmemory_metadata](#inmemory_metadata) is disabled.
Under all these conditions, metadata blocks are read from disk on every
read request to verify checksums and caching them may reduce this extra
read load. Without (3) metadata is never read from the disk after starting,
and without (2) metadata blocks are read from disk only during journal
flushing.
"directsync" is the same as above.
If the same device is used for data and metadata, meta_io by default is set
to the same value as [data_io](#data_io).
## journal_io
- Type: string
- Default: direct
I/O mode for *journal*. One of "direct", "cached" or "directsync".
Here, "cached" may only improve read performance for recent writes and
only if [inmemory_journal](#inmemory_journal) is turned off.
If the same device is used for metadata and journal, journal_io by default
is set to the same value as [meta_io](#meta_io).
## journal_sector_buffer_count
- Type: integer

View File

@@ -12,6 +12,7 @@
изменения конфигурации в etcd.
- [etcd_report_interval](#etcd_report_interval)
- [etcd_stats_interval](#etcd_stats_interval)
- [run_primary](#run_primary)
- [osd_network](#osd_network)
- [bind_address](#bind_address)
@@ -32,6 +33,9 @@
- [max_flusher_count](#max_flusher_count)
- [inmemory_metadata](#inmemory_metadata)
- [inmemory_journal](#inmemory_journal)
- [data_io](#data_io)
- [meta_io](#meta_io)
- [journal_io](#journal_io)
- [journal_sector_buffer_count](#journal_sector_buffer_count)
- [journal_no_same_sector_overwrites](#journal_no_same_sector_overwrites)
- [throttle_small_writes](#throttle_small_writes)
@@ -54,11 +58,21 @@
- Тип: секунды
- Значение по умолчанию: 5
Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому на скорость переключения
Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому - на скорость переключения
при падении OSD. Время lease равняется значению этого параметра плюс
max_etcd_attempts * etcd_quick_timeout.
## etcd_stats_interval
- Тип: секунды
- Значение по умолчанию: 30
Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
каждый OSD и на каждую PG. В то же время низкий интервал делает
статистику, печатаемую `vitastor-cli`, отзывчивей.
## run_primary
- Тип: булево (да/нет)
@@ -263,6 +277,63 @@ Flusher - это микро-поток (корутина), которая коп
параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
журналами, расположенными на быстром по сравнению с HDD устройстве.
## data_io
- Тип: строка
- Значение по умолчанию: direct
Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
"directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
чтении и записи. Это может улучшить скорость чтения горячих данных с
относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
снижает производительность записи для быстрых дисков, так как кэш сам по
себе тоже добавляет накладные расходы.
Выберите "directsync", если хотите задействовать
[immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
включенияd disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
fsync небезопасным даже с режимом "directsync".
## meta_io
- Тип: строка
- Значение по умолчанию: direct
Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
"directsync".
"cached" может улучшить скорость чтения, если:
1. у вас медленные диски (HDD, SATA SSD)
2. контрольные суммы включены
3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
для проверки контрольных сумм и их кэширование может снизить дополнительную
нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.
Если одно и то же устройство используется для данных и метаданных, режим
ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
## journal_io
- Тип: строка
- Значение по умолчанию: direct
Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
"directsync".
Здесь "cached" может улучшить скорость чтения только недавно записанных
данных и только если параметр [inmemory_journal](#inmemory_journal)
отключён.
Если одно и то же устройство используется для метаданных и журнала,
режим ввода-вывода журнала по умолчанию устанавливается равным
[meta_io](#meta_io).
## journal_sector_buffer_count
- Тип: целое число

View File

@@ -205,9 +205,8 @@ This parameter usually doesn't require to be changed.
- Default: 131072
Block size for this pool. The value from /vitastor/config/global is used when
unspecified. If your cluster has OSDs with different block sizes then pool must
be restricted by [osd_tags](#osd_tags) to only include OSDs with matching block
size.
unspecified. Only OSDs with matching block_size are used for each pool. If you
want to further restrict OSDs for the pool, use [osd_tags](#osd_tags).
Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#block_size).
@@ -216,10 +215,9 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
- Type: integer
- Default: 4096
"Sector" size of virtual disks in this pool. The value from
/vitastor/config/global is used when unspecified. Similar to block_size, the
pool must be restricted by [osd_tags](#osd_tags) to only include OSDs with
matching bitmap_granularity.
"Sector" size of virtual disks in this pool. The value from /vitastor/config/global
is used when unspecified. Similarly to block_size, only OSDs with matching
bitmap_granularity are used for each pool.
Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#bitmap_granularity).
@@ -229,10 +227,11 @@ Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-c
- Default: none
Immediate commit setting for this pool. The value from /vitastor/config/global
is used when unspecified. Similar to block_size, the pool must be restricted by
[osd_tags](#osd_tags) to only include OSDs with compatible immediate_commit.
Compatible means that a pool with non-immediate commit will work with OSDs with
immediate commit enabled, but not vice versa.
is used when unspecified. Similarly to block_size, only OSDs with compatible
bitmap_granularity are used for each pool. "Compatible" means that a pool with
non-immediate commit will use OSDs with immediate commit enabled, but not vice
versa. I.e., pools with "none" use all OSDs, pools with "small" only use OSDs
with "all" or "small", and pools with "all" only use OSDs with "all".
Read more about this parameter in [Cluster-Wide Disk Layout Parameters](layout-cluster.en.md#immediate_commit).

View File

@@ -208,8 +208,9 @@ PG в Vitastor эферемерны, то есть вы можете менят
Размер блока для данного пула. Если не задан, используется значение из
/vitastor/config/global. Если в вашем кластере есть OSD с разными размерами
блока, пул должен быть ограничен только OSD, блок которых равен блоку пула,
с помощью [osd_tags](#osd_tags).
блока, пул будет использовать только OSD с размером блока, равным размеру блока
пула. Если вы хотите сильнее ограничить набор используемых для пула OSD -
используйте [osd_tags](#osd_tags).
О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#block_size).
@@ -219,9 +220,8 @@ PG в Vitastor эферемерны, то есть вы можете менят
- По умолчанию: 4096
Размер "сектора" виртуальных дисков в данном пуле. Если не задан, используется
значение из /vitastor/config/global. Аналогично block_size, пул должен быть
ограничен OSD со значением bitmap_granularity, равным значению пула, с помощью
[osd_tags](#osd_tags).
значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
использовать только OSD с совпадающей с пулом настройкой bitmap_granularity.
О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#bitmap_granularity).
@@ -231,11 +231,13 @@ PG в Vitastor эферемерны, то есть вы можете менят
- По умолчанию: none
Настройка мгновенного коммита для данного пула. Если не задана, используется
значение из /vitastor/config/global. Аналогично block_size, пул должен быть
ограничен OSD со значением bitmap_granularity, совместимым со значением пула, с
помощью [osd_tags](#osd_tags). Совместимость означает, что пул с отключенным
мгновенным коммитом может работать на OSD с включённым мгновенным коммитом, но
не наоборот.
значение из /vitastor/config/global. Аналогично block_size, каждый пул будет
использовать только OSD с *совместимыми* настройками immediate_commit.
"Совместимыми" означает, что пул с отключенным мгновенным коммитом будет
использовать OSD с включённым мгновенным коммитом, но не наоборот. То есть,
пул со значением "none" будет использовать все OSD, пул со "small" будет
использовать OSD с "all" или "small", а пул с "all" будет использовать только
OSD с "all".
О самом параметре читайте в разделе [Дисковые параметры уровня кластера](layout-cluster.ru.md#immediate_commit).

View File

@@ -0,0 +1,4 @@
# Client Parameters
These parameters apply only to clients and affect their interaction with
the cluster.

View File

@@ -0,0 +1,4 @@
# Параметры клиентского кода
Данные параметры применяются только к клиентам Vitastor (QEMU, fio, NBD) и
затрагивают логику их работы с кластером.

124
docs/config/src/client.yml Normal file
View File

@@ -0,0 +1,124 @@
- name: client_max_dirty_bytes
type: int
default: 33554432
online: true
info: |
Without [immediate_commit](layout-cluster.en.md#immediate_commit)=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
info_ru: |
При работе без [immediate_commit](layout-cluster.ru.md#immediate_commit)=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
- name: client_max_dirty_ops
type: int
default: 1024
online: true
info: |
Same as client_max_dirty_bytes, but instead of total size, limits the number
of uncommitted write operations.
info_ru: |
Аналогично client_max_dirty_bytes, но ограничивает количество
незафиксированных операций записи вместо их общего объёма.
- name: client_enable_writeback
type: bool
default: false
online: true
info: |
This parameter enables client-side write buffering. This means that write
requests are accumulated in memory for a short time before being sent to
a Vitastor cluster which allows to send them in parallel and increase
performance of some applications. Writes are buffered until client forces
a flush with fsync() or until the amount of buffered writes exceeds the
limit.
Write buffering significantly increases performance of some applications,
for example, CrystalDiskMark under Windows (LOL :-D), but also any other
applications if they do writes in one of two non-optimal ways: either if
they do a lot of small (4 kb or so) sequential writes, or if they do a lot
of small random writes, but without any parallelism or asynchrony, and also
without calling fsync().
With write buffering enabled, you can expect around 22000 T1Q1 random write
iops in QEMU more or less regardless of the quality of your SSDs, and this
number is in fact bound by QEMU itself rather than Vitastor (check it
yourself by adding a "driver=null-co" disk in QEMU). Without write
buffering, the current record is 9900 iops, but the number is usually
even lower with non-ideal hardware, for example, it may be 5000 iops.
Even when this parameter is enabled, write buffering isn't enabled until
the client explicitly allows it, because enabling it without the client
being aware of the fact that his writes may be buffered may lead to data
loss. Because of this, older versions of clients don't support write
buffering at all, newer versions of the QEMU driver allow write buffering
only if it's enabled in disk settings with `-blockdev cache.direct=false`,
and newer versions of FIO only allow write buffering if you don't specify
`-direct=1`. NBD and NFS drivers allow write buffering by default.
You can overcome this restriction too with the `client_writeback_allowed`
parameter, but you shouldn't do that unless you **really** know what you
are doing.
info_ru: |
Данный параметр разрешает включать буферизацию записи в памяти. Буферизация
означает, что операции записи отправляются на кластер Vitastor не сразу, а
могут небольшое время накапливаться в памяти и сбрасываться сразу пакетами,
до тех пор, пока либо не будет превышен лимит неотправленных записей, либо
пока клиент не вызовет fsync.
Буферизация значительно повышает производительность некоторых приложений,
например, CrystalDiskMark в Windows (ха-ха :-D), но также и любых других,
которые пишут на диск неоптимально: либо последовательно, но мелкими блоками
(например, по 4 кб), либо случайно, но без параллелизма и без fsync - то
есть, например, отправляя 128 операций записи в разные места диска, но не
все сразу с помощью асинхронного I/O, а по одной.
В QEMU с буферизацией записи можно ожидать показателя примерно 22000
операций случайной записи в секунду в 1 поток и с глубиной очереди 1 (T1Q1)
без fsync, почти вне зависимости от того, насколько хороши ваши диски - эта
цифра упирается в сам QEMU. Без буферизации рекорд пока что - 9900 операций
в секунду, но на железе похуже может быть и поменьше, например, 5000 операций
в секунду.
При этом, даже если данный параметр включён, буферизация не включается, если
явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
буферизуются, это может приводить к потере данных. Поэтому в старых версиях
клиентских драйверов буферизация записи не включается вообще, в новых
версиях QEMU-драйвера включается, только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,
но делать так не надо, если только вы не уверены в том, что делаете, на все
100%. :-)
- name: client_max_buffered_bytes
type: int
default: 33554432
online: true
info: |
Maximum total size of buffered writes which triggers write-back when reached.
info_ru: |
Максимальный общий размер буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер.
- name: client_max_buffered_ops
type: int
default: 1024
online: true
info: |
Maximum number of buffered writes which triggers write-back when reached.
Multiple consecutive modified data regions are counted as 1 write here.
info_ru: |
Максимальное количество буферизованных записей, при достижении которого
начинается процесс сброса данных на сервер. При этом несколько
последовательных изменённых областей здесь считаются 1 записью.
- name: client_max_writeback_iodepth
type: int
default: 256
online: true
info: |
Maximum number of parallel writes when flushing buffered data to the server.
info_ru: |
Максимальное число параллельных операций записи при сбросе буферов на сервер.

View File

@@ -28,6 +28,8 @@
{{../../config/network.en.md|indent=2}}
{{../../config/client.en.md|indent=2}}
{{../../config/layout-cluster.en.md|indent=2}}
{{../../config/layout-osd.en.md|indent=2}}

View File

@@ -28,6 +28,8 @@
{{../../config/network.ru.md|indent=2}}
{{../../config/client.ru.md|indent=2}}
{{../../config/layout-cluster.ru.md|indent=2}}
{{../../config/layout-osd.ru.md|indent=2}}

View File

@@ -7,26 +7,27 @@
in Vitastor, affects memory usage, write amplification and I/O load
distribution effectiveness.
Recommended default block size is 128 KB for SSD and 4 MB for HDD. In fact,
it's possible to use 4 MB for SSD too - it will lower memory usage, but
Recommended default block size is 128 KB for SSD and 1 MB for HDD. In fact,
it's possible to use 1 MB for SSD too - it will lower memory usage, but
may increase average WA and reduce linear performance.
OSD memory usage is roughly (SIZE / BLOCK * 68 bytes) which is roughly
544 MB per 1 TB of used disk space with the default 128 KB block size.
With 1 MB it's 8 times lower.
info_ru: |
Размер объектов (блоков данных), на которые делятся физические и виртуальные
диски в Vitastor (в рамках каждого пула). Одна из ключевых на данный момент
настроек, влияет на потребление памяти, объём избыточной записи (write
amplification) и эффективность распределения нагрузки по OSD.
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 4
мегабайта для HDD. В принципе, для SSD можно тоже использовать 4 мегабайта,
Рекомендуемые по умолчанию размеры блока - 128 килобайт для SSD и 1 мегабайт
для HDD. В принципе, для SSD можно тоже использовать блок размером 1 мегабайт,
это понизит использование памяти, но ухудшит распределение нагрузки и в
среднем увеличит WA.
Потребление памяти OSD составляет примерно (РАЗМЕР / БЛОК * 68 байт),
т.е. примерно 544 МБ памяти на 1 ТБ занятого места на диске при
стандартном 128 КБ блоке.
стандартном 128 КБ блоке. При 1 МБ блоке памяти нужно в 8 раз меньше.
- name: bitmap_granularity
type: int
default: 4096
@@ -86,8 +87,9 @@
it (they have internal SSD cache even though it's not stated in datasheets).
Setting this parameter to "all" or "small" in OSD parameters requires enabling
disable_journal_fsync and disable_meta_fsync, setting it to "all" also requires
enabling disable_data_fsync.
[disable_journal_fsync](layout-osd.en.yml#disable_journal_fsync) and
[disable_meta_fsync](layout-osd.en.yml#disable_meta_fsync), setting it to
"all" also requires enabling [disable_data_fsync](layout-osd.en.yml#disable_data_fsync).
TLDR: For optimal performance, set immediate_commit to "all" if you only use
SSDs with supercapacitor-based power loss protection (nonvolatile
@@ -139,8 +141,9 @@
указано в спецификациях).
Указание "all" или "small" в настройках / командной строке OSD требует
включения disable_journal_fsync и disable_meta_fsync, значение "all" также
требует включения disable_data_fsync.
включения [disable_journal_fsync](layout-osd.ru.yml#disable_journal_fsync) и
[disable_meta_fsync](layout-osd.ru.yml#disable_meta_fsync), значение "all"
также требует включения [disable_data_fsync](layout-osd.ru.yml#disable_data_fsync).
Итого, вкратце: для оптимальной производительности установите
immediate_commit в значение "all", если вы используете в кластере только SSD

View File

@@ -204,3 +204,73 @@
Клиентам не обязательно знать про disk_alignment, так что помещать значение
этого параметра в etcd в /vitastor/config/global не нужно.
- name: data_csum_type
type: string
default: none
info: |
Data checksum type to use. May be "crc32c" or "none". Set to "crc32c" to
enable data checksums.
info_ru: |
Тип используемых OSD контрольных сумм данных. Может быть "crc32c" или "none".
Установите в "crc32c", чтобы включить расчёт и проверку контрольных сумм данных.
Следует понимать, что контрольные суммы в зависимости от размера блока их
расчёта либо увеличивают потребление памяти, либо снижают производительность.
Подробнее смотрите в описании параметра [csum_block_size](#csum_block_size).
- name: csum_block_size
type: int
default: 4096
info: |
Checksum calculation block size.
Must be equal or a multiple of [bitmap_granularity](layout-cluster.en.md#bitmap_granularity)
(which is usually 4 KB).
Checksums increase metadata size by 4 bytes per each csum_block_size of data.
Checksums are always a tradeoff:
1. You either sacrifice +1 GB RAM per 1 TB of data
2. Or you raise csum_block_size, for example, to 32k and sacrifice
50% random write iops due to checksum read-modify-write
3. Or you turn off [inmemory_metadata](osd.en.md#inmemory_metadata) and
sacrifice 50% random read iops due to checksum reads
All-flash clusters usually have enough RAM to use default csum_block_size,
which uses 1 GB RAM per 1 TB of data. HDD clusters usually don't.
Thus, recommended setups are:
1. All-flash, 1 GB RAM per 1 TB data: default (csum_block_size=4k)
2. All-flash, less RAM: csum_block_size=4k + inmemory_metadata=false
3. Hybrid HDD+SSD: csum_block_size=4k + inmemory_metadata=false
4. HDD-only, faster random read: csum_block_size=32k
5. HDD-only, faster random write: csum_block_size=4k +
inmemory_metadata=false + meta_io=cached
See also [meta_io](osd.en.md#meta_io).
info_ru: |
Размер блока расчёта контрольных сумм.
Должен быть равен или кратен [bitmap_granularity](layout-cluster.ru.md#bitmap_granularity)
(который обычно равен 4 КБ).
Контрольные суммы увеличивают размер метаданных на 4 байта на каждые
csum_block_size данных.
Контрольные суммы - это всегда компромисс:
1. Вы либо жертвуете потреблением +1 ГБ памяти на 1 ТБ дискового пространства
2. Либо вы повышаете csum_block_size до, скажем, 32k и жертвуете 50%
скорости случайной записи из-за цикла чтения-изменения-записи для расчёта
новых контрольных сумм
3. Либо вы отключаете [inmemory_metadata](osd.ru.md#inmemory_metadata) и
жертвуете 50% скорости случайного чтения из-за чтения контрольных сумм
с диска
Таким образом, рекомендуются следующие варианты настроек:
1. All-flash, 1 ГБ памяти на 1 ТБ данных: по умолчанию (csum_block_size=4k)
2. All-flash, меньше памяти: csum_block_size=4k + inmemory_metadata=false
3. Гибридные HDD+SSD: csum_block_size=4k + inmemory_metadata=false
4. Только HDD, быстрее случайное чтение: csum_block_size=32k
5. Только HDD, быстрее случайная запись: csum_block_size=4k +
inmemory_metadata=false + meta_io=cached
Смотрите также [meta_io](osd.ru.md#meta_io).

View File

@@ -259,23 +259,3 @@
detect disconnections quickly.
info_ru: |
Интервал проверки живости вебсокет-подключений к etcd.
- name: client_dirty_limit
type: int
default: 33554432
online: true
info: |
Without immediate_commit=all this parameter sets the limit of "dirty"
(not committed by fsync) data allowed by the client before forcing an
additional fsync and committing the data. Also note that the client always
holds a copy of uncommitted data in memory so this setting also affects
RAM usage of clients.
This parameter doesn't affect OSDs themselves.
info_ru: |
При работе без immediate_commit=all - это лимит объёма "грязных" (не
зафиксированных fsync-ом) данных, при достижении которого клиент будет
принудительно вызывать fsync и фиксировать данные. Также стоит иметь в виду,
что в этом случае до момента fsync клиент хранит копию незафиксированных
данных в памяти, то есть, настройка влияет на потребление памяти клиентами.
Параметр не влияет на сами OSD.

View File

@@ -2,15 +2,28 @@
type: sec
default: 5
info: |
Interval at which OSDs report their state to etcd. Affects OSD lease time
Interval at which OSDs report their liveness to etcd. Affects OSD lease time
and thus the failover speed. Lease time is equal to this parameter value
plus max_etcd_attempts * etcd_quick_timeout because it should be guaranteed
that every OSD always refreshes its lease in time.
info_ru: |
Интервал, с которым OSD обновляет своё состояние в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому на скорость переключения
Интервал, с которым OSD сообщает о том, что жив, в etcd. Значение параметра
влияет на время резервации (lease) OSD и поэтому - на скорость переключения
при падении OSD. Время lease равняется значению этого параметра плюс
max_etcd_attempts * etcd_quick_timeout.
- name: etcd_stats_interval
type: sec
default: 30
info: |
Interval at which OSDs report their statistics to etcd. Highly affects the
imposed load on etcd, because statistics include a key for every OSD and
for every PG. At the same time, low statistic intervals make `vitastor-cli`
statistics more responsive.
info_ru: |
Интервал, с которым OSD обновляет свою статистику в etcd. Сильно влияет на
создаваемую нагрузку на etcd, потому что статистика содержит по ключу на
каждый OSD и на каждую PG. В то же время низкий интервал делает
статистику, печатаемую `vitastor-cli`, отзывчивей.
- name: run_primary
type: bool
default: true
@@ -260,6 +273,96 @@
достаточно 16- или 32-мегабайтного журнала. Однако в теории отключение
параметра может оказаться полезным для гибридных OSD (HDD+SSD) с большими
журналами, расположенными на быстром по сравнению с HDD устройстве.
- name: data_io
type: string
default: direct
info: |
I/O mode for *data*. One of "direct", "cached" or "directsync". Corresponds
to O_DIRECT, O_SYNC and O_DIRECT|O_SYNC, respectively.
Choose "cached" to use Linux page cache. This may improve read performance
for hot data and slower disks - HDDs and maybe SATA SSDs - but will slightly
decrease write performance for fast disks because page cache is an overhead
itself.
Choose "directsync" to use [immediate_commit](layout-cluster.ru.md#immediate_commit)
(which requires disable_data_fsync) with drives having write-back cache
which can't be turned off, for example, Intel Optane. Also note that *some*
desktop SSDs (for example, HP EX950) may ignore O_SYNC thus making
disable_data_fsync unsafe even with "directsync".
info_ru: |
Режим ввода-вывода для *данных*. Одно из значений "direct", "cached" или
"directsync", означающих O_DIRECT, O_SYNC и O_DIRECT|O_SYNC, соответственно.
Выберите "cached", чтобы использовать системный кэш Linux (page cache) при
чтении и записи. Это может улучшить скорость чтения горячих данных с
относительно медленных дисков - HDD и, возможно, SATA SSD - но немного
снижает производительность записи для быстрых дисков, так как кэш сам по
себе тоже добавляет накладные расходы.
Выберите "directsync", если хотите задействовать
[immediate_commit](layout-cluster.ru.md#immediate_commit) (требующий
включенияd disable_data_fsync) на дисках с неотключаемым кэшем. Пример таких
дисков - Intel Optane. При этом также стоит иметь в виду, что *некоторые*
настольные SSD (например, HP EX950) игнорируют флаг O_SYNC, делая отключение
fsync небезопасным даже с режимом "directsync".
- name: meta_io
type: string
default: direct
info: |
I/O mode for *metadata*. One of "direct", "cached" or "directsync".
"cached" may improve read performance, but only under the following conditions:
1. your drives are relatively slow (HDD, SATA SSD), and
2. checksums are enabled, and
3. [inmemory_metadata](#inmemory_metadata) is disabled.
Under all these conditions, metadata blocks are read from disk on every
read request to verify checksums and caching them may reduce this extra
read load. Without (3) metadata is never read from the disk after starting,
and without (2) metadata blocks are read from disk only during journal
flushing.
"directsync" is the same as above.
If the same device is used for data and metadata, meta_io by default is set
to the same value as [data_io](#data_io).
info_ru: |
Режим ввода-вывода для *метаданных*. Одно из значений "direct", "cached" или
"directsync".
"cached" может улучшить скорость чтения, если:
1. у вас медленные диски (HDD, SATA SSD)
2. контрольные суммы включены
3. параметр [inmemory_metadata](#inmemory_metadata) отключён.
При этих условиях блоки метаданных читаются с диска при каждом запросе чтения
для проверки контрольных сумм и их кэширование может снизить дополнительную
нагрузку на диск. Без (3) метаданные никогда не читаются с диска после
запуска OSD, а без (2) блоки метаданных читаются только при сбросе журнала.
Если одно и то же устройство используется для данных и метаданных, режим
ввода-вывода метаданных по умолчанию устанавливается равным [data_io](#data_io).
- name: journal_io
type: string
default: direct
info: |
I/O mode for *journal*. One of "direct", "cached" or "directsync".
Here, "cached" may only improve read performance for recent writes and
only if [inmemory_journal](#inmemory_journal) is turned off.
If the same device is used for metadata and journal, journal_io by default
is set to the same value as [meta_io](#meta_io).
info_ru: |
Режим ввода-вывода для *журнала*. Одно из значений "direct", "cached" или
"directsync".
Здесь "cached" может улучшить скорость чтения только недавно записанных
данных и только если параметр [inmemory_journal](#inmemory_journal)
отключён.
Если одно и то же устройство используется для метаданных и журнала,
режим ввода-вывода журнала по умолчанию устанавливается равным
[meta_io](#meta_io).
- name: journal_sector_buffer_count
type: int
default: 32

View File

@@ -17,4 +17,15 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
for i in ./???-*.yaml; do kubectl apply -f $i; done
```
After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
After that you'll be able to create PersistentVolumes.
## Features
Vitastor CSI supports:
- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
- Volume expansion
- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).

View File

@@ -17,4 +17,15 @@
for i in ./???-*.yaml; do kubectl apply -f $i; done
```
После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
После этого вы сможете создавать PersistentVolume.
## Возможности
CSI-плагин Vitastor поддерживает:
- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
- Расширение размера томов
- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).

View File

@@ -14,6 +14,8 @@
- Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
- Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
stable version from 0.9.x branch instead of 1.x
- For Debian 10 (Buster) also enable backports repository:
`deb http://deb.debian.org/debian buster-backports main`
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`

View File

@@ -14,6 +14,8 @@
- Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
- Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
- Для Debian 10 (Buster) также включите репозиторий backports:
`deb http://deb.debian.org/debian buster-backports main`
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu`

View File

@@ -1,4 +1,4 @@
[Документация](../../README-ru.md#документация) → Установка → Proxmox
[Документация](../../README-ru.md#документация) → Установка → Proxmox VE
-----

View File

@@ -21,7 +21,7 @@
## Basic instructions
Download source, for example using git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`
Download source, for example using git: `git clone --recurse-submodules https://git.yourcmc.ru/vitalif/vitastor/`
Get `fio` source and symlink it into `<vitastor>/fio`. If you don't want to build fio engine,
you can disable it by passing `-DWITH_FIO=no` to cmake.
@@ -41,7 +41,7 @@ It's recommended to build the QEMU driver (qemu_driver.c) in-tree, as a part of
QEMU build process. To do that:
- Install vitastor client library headers (from source or from vitastor-client-dev package)
- Take a corresponding patch from `patches/qemu-*-vitastor.patch` and apply it to QEMU source
- Copy `src/qemu_driver.c` to QEMU source directory as `block/block-vitastor.c`
- Copy `src/qemu_driver.c` to QEMU source directory as `block/vitastor.c`
- Build QEMU as usual
But it is also possible to build it out-of-tree. To do that:

View File

@@ -21,7 +21,7 @@
## Базовая инструкция
Скачайте исходные коды, например, из git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/`
Скачайте исходные коды, например, из git: `git clone --recurse-submodules https://git.yourcmc.ru/vitalif/vitastor/`
Скачайте исходные коды пакета `fio`, распакуйте их и создайте символическую ссылку на них
в директории исходников Vitastor: `<vitastor>/fio`. Либо, если вы не хотите собирать плагин fio,
@@ -41,7 +41,7 @@ cmake .. && make -j8 install
Драйвер QEMU (qemu_driver.c) рекомендуется собирать вместе с самим QEMU. Для этого:
- Установите заголовки клиентской библиотеки Vitastor (из исходников или из пакета vitastor-client-dev)
- Возьмите соответствующий патч из `patches/qemu-*-vitastor.patch` и примените его к исходникам QEMU
- Скопируйте [src/qemu_driver.c](../../src/qemu_driver.c) в директорию исходников QEMU как `block/block-vitastor.c`
- Скопируйте [src/qemu_driver.c](../../src/qemu_driver.c) в директорию исходников QEMU как `block/vitastor.c`
- Соберите QEMU как обычно
Однако в целях отладки драйвер также можно собирать отдельно от QEMU. Для этого:
@@ -60,7 +60,7 @@ cmake .. && make -j8 install
* Для QEMU 2.0+: `<qemu>/qapi-types.h` &rarr; `<vitastor>/qemu/b/qemu/qapi-types.h`
- `config-host.h` и `qapi` нужны, т.к. в них содержатся автогенерируемые заголовки
- Сконфигурируйте cmake Vitastor с `WITH_QEMU=yes` (`cmake .. -DWITH_QEMU=yes`) и, если вы
используете RHEL-подобый дистрибутив, также с `QEMU_PLUGINDIR=qemu-kvm`.
используете RHEL-подобный дистрибутив, также с `QEMU_PLUGINDIR=qemu-kvm`.
- После этого в процессе сборки Vitastor также будет собираться подходящий для вашей
версии QEMU `block-vitastor.so`.
- Таким образом можно использовать драйвер даже с немодифицированным QEMU, но в этом случае

View File

@@ -29,7 +29,9 @@
- Snapshots and copy-on-write image clones
- [Write throttling to smooth random write workloads in SSD+HDD configurations](../config/osd.en.md#throttle_small_writes)
- [RDMA/RoCEv2 support via libibverbs](../config/network.en.md#rdma_device)
- [Scrubbing without checksums](../config/osd.en.md#auto_scrub) (verification of copies)
- [Scrubbing](../config/osd.en.md#auto_scrub) (verification of copies)
- [Checksums](../config/layout-osd.en.md#data_csum_type)
- [Client write-back cache](../config/client.en.md#client_enable_writeback)
## Plugins and tools
@@ -49,14 +51,15 @@
The following features are planned for the future:
- File system
- Control plane optimisation
- Other administrative tools
- Web GUI
- OpenNebula plugin
- iSCSI proxy
- iSCSI and NVMeoF gateways
- Multi-threaded client
- Faster failover
- Checksums
- S3
- Tiered storage (SSD caching)
- NVDIMM support
- Compression (possibly)
- Read caching using system page cache (possibly)

View File

@@ -31,7 +31,9 @@
- Снапшоты и copy-on-write клоны
- [Сглаживание производительности случайной записи в SSD+HDD конфигурациях](../config/osd.ru.md#throttle_small_writes)
- [Поддержка RDMA/RoCEv2 через libibverbs](../config/network.ru.md#rdma_device)
- [Фоновая проверка целостности без контрольных сумм](../config/osd.ru.md#auto_scrub) (сверка копий)
- [Фоновая проверка целостности](../config/osd.ru.md#auto_scrub) (сверка копий)
- [Контрольные суммы](../config/layout-osd.ru.md#data_csum_type)
- [Буферизация записи на стороне клиента](../config/client.ru.md#client_enable_writeback)
## Драйверы и инструменты
@@ -49,13 +51,15 @@
## Планы развития
- Файловая система
- Оптимизация слоя управления
- Другие инструменты администрирования
- Web-интерфейс
- Плагин для OpenNebula
- iSCSI-прокси
- iSCSI и NVMeoF прокси
- Многопоточный клиент
- Более быстрое переключение при отказах
- Контрольные суммы
- S3
- Поддержка SSD-кэширования (tiered storage)
- Поддержка NVDIMM
- Возможно, сжатие

View File

@@ -7,6 +7,7 @@
# Quick Start
- [Preparation](#preparation)
- [Recommended drives](#recommended-drives)
- [Configure monitors](#configure-monitors)
- [Configure OSDs](#configure-osds)
- [Create a pool](#create-a-pool)
@@ -19,10 +20,20 @@
- Get some SATA or NVMe SSDs with capacitors (server-grade drives). You can use desktop SSDs
with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors
[here](../config/layout-cluster.en.md#immediate_commit).
- If you want to use HDDs, get modern HDDs with Media Cache or SSD Cache: HGST Ultrastar,
Toshiba MG08, Seagate EXOS or something similar. If your drives don't have such cache then
you also need small SSDs for journal and metadata (even 2 GB per 1 TB of HDD space is enough).
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal.
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- [Install Vitastor packages](../installation/packages.en.md).
## Recommended drives
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
## Configure monitors
On the monitor hosts:
@@ -45,9 +56,10 @@ On the monitor hosts:
}
```
- Initialize OSDs:
- SSD-only: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`. You can add
`--disable_data_fsync off` to leave disk cache enabled if you use desktop
SSDs without capacitors.
- SSD-only or HDD-only: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`.
Add `--disable_data_fsync off` to leave disk write cache enabled if you use
desktop SSDs without capacitors. Do NOT add `--disable_data_fsync off` if you
use HDDs or SSD+HDD.
- Hybrid, SSD+HDD: `vitastor-disk prepare --hybrid /dev/sdXXX [/dev/sdYYY ...]`.
Pass all your devices (HDD and SSD) to this script &mdash; it will partition disks and initialize journals on its own.
This script skips HDDs which are already partitioned so if you want to use non-empty disks for

View File

@@ -7,6 +7,7 @@
# Быстрый старт
- [Подготовка](#подготовка)
- [Рекомендуемые диски](#рекомендуемые-диски)
- [Настройте мониторы](#настройте-мониторы)
- [Настройте OSD](#настройте-osd)
- [Создайте пул](#создайте-пул)
@@ -19,10 +20,20 @@
- Возьмите серверы с SSD (SATA или NVMe), желательно с конденсаторами (серверные SSD). Можно
использовать и десктопные SSD, включив режим отложенного fsync, но производительность будет хуже.
О конденсаторах читайте [здесь](../config/layout-cluster.ru.md#immediate_commit).
- Если хотите использовать HDD, берите современные модели с Media или SSD кэшем - HGST Ultrastar,
Toshiba MG08, Seagate EXOS или что-то похожее. Если такого кэша у ваших дисков нет,
обязательно возьмите SSD под метаданные и журнал (маленькие, буквально 2 ГБ на 1 ТБ HDD-места).
- Возьмите быструю сеть, минимум 10 гбит/с. Идеал - что-то вроде Mellanox ConnectX-4 с RoCEv2.
- Для лучшей производительности отключите энергосбережение CPU: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`.
- [Установите пакеты Vitastor](../installation/packages.ru.md).
## Рекомендуемые диски
- SATA SSD: Micron 5100/5200/5300/5400, Samsung PM863/PM883/PM893, Intel D3-S4510/4520/4610/4620, Kingston DC500M
- NVMe: Micron 9100/9200/9300/9400, Micron 7300/7450, Samsung PM983/PM9A3, Samsung PM1723/1735/1743,
Intel DC-P3700/P4500/P4600, Intel D7-P5500/P5600, Intel Optane, Kingston DC1000B/DC1500M
- HDD: HGST Ultrastar, Toshiba MG06/MG07/MG08, Seagate EXOS
## Настройте мониторы
На хостах, выделенных под мониторы:
@@ -45,9 +56,10 @@
}
```
- Инициализуйте OSD:
- SSD: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`. Если вы используете
десктопные SSD без конденсаторов, можете оставить кэш включённым, добавив
опцию `--disable_data_fsync off`.
- Только SSD или только HDD: `vitastor-disk prepare /dev/sdXXX [/dev/sdYYY ...]`.
Если вы используете десктопные SSD без конденсаторов, добавьте опцию `--disable_data_fsync off`,
чтобы оставить кэш записи диска включённым. НЕ добавляйте эту опцию, если используете
жёсткие диски (HDD).
- Гибридные, SSD+HDD: `vitastor-disk prepare --hybrid /dev/sdXXX [/dev/sdYYY ...]`.
Передайте все ваши SSD и HDD скрипту в командной строке подряд, скрипт автоматически выделит
разделы под журналы на SSD и данные на HDD. Скрипт пропускает HDD, на которых уже есть разделы

View File

@@ -86,6 +86,8 @@ Options (both modes):
--journal_size 1G/32M Set journal size (area or partition size)
--block_size 1M/128k Set blockstore object size
--bitmap_granularity 4k Set bitmap granularity
--data_csum_type none Set data checksum type (crc32c or none)
--csum_block_size 4k Set data checksum block size
--data_device_block 4k Override data device block size
--meta_device_block 4k Override metadata device block size
--journal_device_block 4k Override journal device block size
@@ -100,8 +102,9 @@ checks the device cache status on start and tries to disable cache for SATA/SAS
If it doesn't succeed it issues a warning in the system log.
You can also pass other OSD options here as arguments and they'll be persisted
to the superblock: max_write_iodepth, max_write_iodepth, min_flusher_count,
max_flusher_count, inmemory_metadata, inmemory_journal, journal_sector_buffer_count,
in the superblock: cached_io_data, cached_io_meta, cached_io_journal,
inmemory_metadata, inmemory_journal, max_write_iodepth,
min_flusher_count, max_flusher_count, journal_sector_buffer_count,
journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,
throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
See [Runtime OSD Parameters](../config/osd.en.md) for details.
@@ -249,7 +252,9 @@ Options (see also [Cluster-Wide Disk Layout Parameters](../config/layout-cluster
```
--object_size 128k Set blockstore block size
--bitmap_granularity 4k Set bitmap granularity
--journal_size 32M Set journal size
--journal_size 16M Set journal size
--data_csum_type none Set data checksum type (crc32c or none)
--csum_block_size 4k Set data checksum block size
--device_block_size 4k Set device block size
--journal_offset 0 Set journal offset
--device_size 0 Set device size

View File

@@ -87,6 +87,8 @@ vitastor-disk - инструмент командной строки для уп
--journal_size 1G/32M Задать размер журнала (области или раздела журнала)
--block_size 1M/128k Задать размер объекта хранилища
--bitmap_granularity 4k Задать гранулярность битовых карт
--data_csum_type none Задать тип контрольных сумм (crc32c или none)
--csum_block_size 4k Задать размер блока расчёта контрольных сумм
--data_device_block 4k Задать размер блока устройства данных
--meta_device_block 4k Задать размер блока метаданных
--journal_device_block 4k Задать размер блока журнала
@@ -101,8 +103,9 @@ vitastor-disk - инструмент командной строки для уп
это не удаётся, в системный журнал выводится предупреждение.
Вы можете передать данной команде и некоторые другие опции OSD в качестве аргументов
и они тоже будут сохранены в суперблок: max_write_iodepth, max_write_iodepth, min_flusher_count,
max_flusher_count, inmemory_metadata, inmemory_journal, journal_sector_buffer_count,
и они тоже будут сохранены в суперблок: cached_io_data, cached_io_meta,
cached_io_journal, inmemory_metadata, inmemory_journal, max_write_iodepth,
min_flusher_count, max_flusher_count, journal_sector_buffer_count,
journal_no_same_sector_overwrites, throttle_small_writes, throttle_target_iops,
throttle_target_mbs, throttle_target_parallelism, throttle_threshold_us.
Читайте об этих параметрах подробнее в разделе [Изменяемые параметры OSD](../config/osd.ru.md).
@@ -254,7 +257,9 @@ OSD отключены fsync-и.
```
--object_size 128k Размер блока хранилища
--bitmap_granularity 4k Гранулярность битовых карт
--journal_size 32M Размер журнала
--journal_size 16M Размер журнала
--data_csum_type none Задать тип контрольных сумм (crc32c или none)
--csum_block_size 4k Задать размер блока расчёта контрольных сумм
--device_block_size 4k Размер блока устройства
--journal_offset 0 Смещение журнала
--device_size 0 Размер устройства

View File

@@ -13,6 +13,8 @@ remains decent (see an example [here](../performance/comparison1.en.md#vitastor-
Vitastor Kubernetes CSI driver is based on NBD.
See also [VDUSE](qemu.en.md#vduse).
## Map image
To create a local block device for a Vitastor image run:

View File

@@ -16,6 +16,8 @@ NBD немного снижает производительность из-за
CSI-драйвер Kubernetes Vitastor основан на NBD.
Смотрите также [VDUSE](qemu.ru.md#vduse).
## Подключить устройство
Чтобы создать локальное блочное устройство для образа, выполните команду:

View File

@@ -34,6 +34,20 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
-vnc 0.0.0.0:0
```
With a separate I/O thread:
```
qemu-system-x86_64 -enable-kvm -m 1024 \
-object iothread,id=vitastor1 \
-blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
"cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
id=virtio-disk0,bootindex=1,write-cache=off' \
-vnc 0.0.0.0:0
```
You can also specify inode ID, pool and size manually instead of `:image=<IMAGE>` option: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
## qemu-img
For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as filename.
@@ -83,3 +97,67 @@ qemu-img rebase -u -b '' testimg.qcow2
This can be used for backups. Just note that exporting an image that is currently being written to
is of course unsafe and doesn't produce a consistent result, so only export snapshots if you do this
on a live VM.
## vhost-user-blk
QEMU, starting with 6.0, includes support for attaching disks via a separate
userspace worker process, called `vhost-user-blk`. It usually has slightly (20-30 us)
lower latency.
Example commands to use it with Vitastor:
```
qemu-storage-daemon \
--daemonize \
--blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
--export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
-object memory-backend-memfd,id=mem,size=2G,share=on \
-chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
-device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
-vnc 0.0.0.0:0
```
memfd memory-backend is crucial, vhost-user-blk does not work without it.
## VDUSE
Linux kernel, starting with version 5.15, supports a new interface for attaching virtual disks
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
exporting QEMU block devices over this protocol using qemu-storage-daemon.
VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
In this case reboot will be the only way to remove VDUSE devices from system.
On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
performance is important for you. Approximate performance numbers:
direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
To try VDUSE you need at least Linux 5.15, built with VDUSE support
(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
Commands to attach Vitastor image as a VDUSE device:
```
modprobe vduse
modprobe virtio-vdpa
qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitastor",\
"etcd-host":"192.168.7.2:2379/v3","image":"testosd1","cache":{"direct":true,"no-flush":false},"discard":"unmap"}' \
--export vduse-blk,id=test1,node-name=test1,name=test1,num-queues=16,queue-size=128,writable=true
vdpa dev add name test1 mgmtdev vduse
```
After running these commands /dev/vda device will appear in the system and you'll be able to
use it as a normal disk.
To remove the device:
```
vdpa dev del test1
kill <qemu-storage-daemon_process_PID>
```

View File

@@ -36,6 +36,18 @@ qemu-system-x86_64 -enable-kvm -m 1024 \
-vnc 0.0.0.0:0
```
С отдельным потоком ввода-вывода:
```
qemu-system-x86_64 -enable-kvm -m 1024 \
-object iothread,id=vitastor1 \
-blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9",
"cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
-device 'virtio-blk-pci,iothread=vitastor1,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,
id=virtio-disk0,bootindex=1,write-cache=off' \
-vnc 0.0.0.0:0
```
Вместо `:image=<IMAGE>` также можно указывать номер инода, пул и размер: `:pool=<POOL>:inode=<INODE>:size=<SIZE>`.
## qemu-img
@@ -87,3 +99,67 @@ qemu-img rebase -u -b '' testimg.qcow2
Это можно использовать для резервного копирования. Только помните, что экспортировать образ, в который
в то же время идёт запись, небезопасно - результат чтения не будет целостным. Так что если вы работаете
с активными виртуальными машинами, экспортируйте только их снимки, но не сам образ.
## vhost-user-blk
QEMU, начиная с 6.0, позволяет подключать диски через отдельный рабочий процесс.
Этот метод подключения называется `vhost-user-blk` и обычно имеет чуть меньшую
задержку (ниже на 20-30 микросекунд, чем при обычном методе).
Пример команд для использования vhost-user-blk с Vitastor:
```
qemu-storage-daemon \
--daemonize \
--blockdev '{"node-name":"drive-virtio-disk1","driver":"vitastor","image":"testosd1","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \
--export type=vhost-user-blk,id=vitastor1,node-name=drive-virtio-disk1,addr.type=unix,addr.path=/run/vitastor1-user-blk.sock,writable=on,num-queues=1
qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
-object memory-backend-memfd,id=mem,size=2G,share=on \
-chardev socket,id=vitastor1,reconnect=1,path=/run/vitastor1-user-blk.sock \
-device vhost-user-blk-pci,chardev=vitastor1,num-queues=1,config-wce=off \
-vnc 0.0.0.0:0
```
Здесь критична опция memory-backend-memfd, vhost-user-blk без неё не работает.
## VDUSE
В Linux, начиная с версии ядра 5.15, доступен новый интерфейс для подключения виртуальных дисков
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
VDUSE страдает общей проблемой FUSE-подобных интерфейсов в Linux: если пользовательский процесс
подвиснет, например, если будет потеряна связь с кластером Vitastor - читающие/пишущие в кластер
процессы могут "залипнуть" в состоянии D (непрерываемый сон) и их будет невозможно убить даже
через kill -9. В этом случае удалить из системы устройство можно только перезагрузившись.
С другой стороны, VDUSE быстрее по сравнению с [NBD](nbd.ru.md), поэтому его может
быть предпочтительно использовать там, где производительность важнее. Порядок показателей:
прямое тестирование через fio - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
Чтобы использовать VDUSE, вам нужно ядро Linux версии хотя бы 5.15, собранное с поддержкой
VDUSE (CONFIG_VIRTIO_VDPA=m и CONFIG_VDPA_USER=m). В ядрах в Debian Linux поддержка пока
отключена - если хотите попробовать эту функцию на Debian, поставьте ядро из Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) или из Proxmox.
Команды для подключения виртуального диска через VDUSE:
```
modprobe vduse
modprobe virtio-vdpa
qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitastor",\
"etcd-host":"192.168.7.2:2379/v3","image":"testosd1","cache":{"direct":true,"no-flush":false},"discard":"unmap"}' \
--export vduse-blk,id=test1,node-name=test1,name=test1,num-queues=16,queue-size=128,writable=true
vdpa dev add name test1 mgmtdev vduse
```
После этого в системе появится устройство /dev/vda, которое можно будет использовать как
обычный диск.
Для удаления устройства из системы:
```
vdpa dev del test1
kill <PID_процесса_qemu-storage-daemon>
```

2
json11

Submodule json11 updated: fd37016cf8...52a3af664f

View File

@@ -3,5 +3,5 @@ SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de130599
IMPORT{program}="/usr/bin/vitastor-disk udev $devnode", \
SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"

View File

@@ -63,8 +63,9 @@ Wants=network-online.target local-fs.target time-sync.target
[Service]
Restart=always
Environment=GOGC=50
ExecStart=etcd -name etcd${num} --data-dir /var/lib/etcd${num}.etcd \\
--advertise-client-urls http://${etcds[num]}:2379 --listen-client-urls http://${etcds[num]}:2379 \\
--snapshot-count 10000 --advertise-client-urls http://${etcds[num]}:2379 --listen-client-urls http://${etcds[num]}:2379 \\
--initial-advertise-peer-urls http://${etcds[num]}:2380 --listen-peer-urls http://${etcds[num]}:2380 \\
--initial-cluster-token vitastor-etcd-1 --initial-cluster ${etcd_cluster} \\
--initial-cluster-state new --max-txn-ops=100000 --max-request-bytes=104857600 \\

View File

@@ -78,9 +78,15 @@ const etcd_tree = {
disk_alignment: 4096,
bitmap_granularity: 4096,
immediate_commit: false, // 'all' or 'small'
// client - configurable online
client_max_dirty_bytes: 33554432,
client_max_dirty_ops: 1024,
client_enable_writeback: false,
client_max_buffered_bytes: 33554432,
client_max_buffered_ops: 1024,
client_max_writeback_iodepth: 256,
// client and osd - configurable online
log_level: 0,
client_dirty_limit: 33554432,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
osd_idle_timeout: 5, // seconds. min: 1
@@ -93,6 +99,7 @@ const etcd_tree = {
etcd_ws_keepalive_interval: 30, // seconds
// osd
etcd_report_interval: 5, // seconds
etcd_stats_interval: 30, // seconds
run_primary: true,
osd_network: null, // "192.168.7.0/24" or an array of masks
bind_address: "0.0.0.0",
@@ -390,12 +397,13 @@ class Mon
this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
this.prev_stats = { osd_stats: {}, osd_diff: {} };
this.signals_set = false;
this.stat_time = Date.now();
this.ws = null;
this.ws_alive = false;
this.ws_keepalive_timer = null;
this.on_stop_cb = () => this.on_stop(0).catch(console.error);
this.recheck_pgs_active = false;
}
parse_etcd_addresses(addrs)
@@ -539,10 +547,18 @@ class Mon
{
retries = 1;
}
const tried = {};
while (retries < 0 || retry < retries)
{
const cur_addr = this.pick_next_etcd();
const base = 'ws'+cur_addr.substr(4);
let now = Date.now();
if (tried[base] && now-tried[base] < this.etcd_start_timeout)
{
await new Promise(ok => setTimeout(ok, this.etcd_start_timeout-(now-tried[base])));
now = Date.now();
}
tried[base] = now;
const ok = await new Promise((ok, no) =>
{
const timer_id = setTimeout(() =>
@@ -677,8 +693,27 @@ class Mon
});
}
// Schedule save_last_clean() to to run after a small timeout (1s) (to not spam etcd)
schedule_save_last_clean()
{
if (!this.save_last_clean_timer)
{
this.save_last_clean_timer = setTimeout(() =>
{
this.save_last_clean_timer = null;
this.save_last_clean().catch(this.die);
}, this.config.mon_change_timeout || 1000);
}
}
async save_last_clean()
{
if (this.save_last_clean_running)
{
this.schedule_save_last_clean();
return;
}
this.save_last_clean_running = true;
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
const new_clean_pgs = { items: {} };
next_pool:
@@ -715,6 +750,7 @@ class Mon
value: b64(JSON.stringify(this.state.history.last_clean_pgs))
} } ],
}, this.etcd_start_timeout, 0);
this.save_last_clean_running = false;
}
get_mon_state()
@@ -1148,6 +1184,33 @@ class Mon
}
}
filter_osds_by_block_layout(flat_tree, block_size, bitmap_granularity, immediate_commit)
{
for (const host in flat_tree)
{
let found = 0;
for (const osd in flat_tree[host])
{
const osd_stat = this.state.osd.stats[osd];
if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size ||
osd_stat.bitmap_granularity && osd_stat.bitmap_granularity != bitmap_granularity ||
osd_stat.immediate_commit == 'small' && immediate_commit == 'all' ||
osd_stat.immediate_commit == 'none' && immediate_commit != 'none'))
{
delete flat_tree[host][osd];
}
else
{
found++;
}
}
if (!found)
{
delete flat_tree[host];
}
}
}
get_affinity_osds(pool_cfg, up_osds, osd_tree)
{
let aff_osds = up_osds;
@@ -1161,6 +1224,12 @@ class Mon
async recheck_pgs()
{
if (this.recheck_pgs_active)
{
this.schedule_recheck();
return;
}
this.recheck_pgs_active = true;
// Take configuration and state, check it against the stored configuration hash
// Recalculate PGs and save them to etcd if the configuration is changed
// FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
@@ -1182,6 +1251,7 @@ class Mon
// Pool deleted. Delete all PGs, but first stop them.
if (!await this.stop_all_pgs(pool_id))
{
this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
@@ -1208,6 +1278,12 @@ class Mon
pool_tree = pool_tree ? pool_tree.children : [];
pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
this.filter_osds_by_block_layout(
pool_tree,
pool_cfg.block_size || this.config.block_size || 131072,
pool_cfg.bitmap_granularity || this.config.bitmap_granularity || 4096,
pool_cfg.immediate_commit || this.config.immediate_commit || 'none'
);
// These are for the purpose of building history.osd_sets
const real_prev_pgs = [];
let pg_history = [];
@@ -1244,9 +1320,16 @@ class Mon
// PG count changed. Need to bring all PGs down.
if (!await this.stop_all_pgs(pool_id))
{
this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
}
if (prev_pgs.length != pool_cfg.pg_count)
{
// Scale PG count
// Do it even if old_pg_count is already equal to pool_cfg.pg_count,
// because last_clean_pgs may still contain the old number of PGs
const new_pg_history = [];
PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
pg_history = new_pg_history;
@@ -1348,6 +1431,7 @@ class Mon
await this.save_pg_config(new_config_pgs);
}
}
this.recheck_pgs_active = false;
}
async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
@@ -1397,7 +1481,6 @@ class Mon
}
// Schedule a recheck to run after a small timeout (1s)
// If already scheduled, cancel previous timer and schedule it again
// This is required for multiple change events to trigger at most 1 recheck in 1s
schedule_recheck()
{
@@ -1411,15 +1494,15 @@ class Mon
}
}
derive_osd_stats(st, prev)
derive_osd_stats(st, prev, prev_diff)
{
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
if (!st || !st.time || prev && (prev.time || this.stat_time/1000) >= st.time)
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
if (!st || !st.time || !prev || prev.time >= st.time)
{
return diff;
return prev_diff || diff;
}
const timediff = BigInt(st.time*1000 - (prev && prev.time*1000 || this.stat_time));
const timediff = BigInt(st.time*1000 - prev.time*1000);
for (const op in st.op_stats||{})
{
const pr = prev && prev.op_stats && prev.op_stats[op];
@@ -1451,25 +1534,47 @@ class Mon
if (n > 0)
diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
}
for (const pool_id in st.inode_stats||{})
{
const pool_diff = diff.inode_stats[pool_id] = {};
for (const inode_num in st.inode_stats[pool_id])
{
const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
for (const op of [ 'read', 'write', 'delete' ])
{
const c = st.inode_stats[pool_id][inode_num][op];
const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
inode_diff[op] = {
bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
iops: n*1000n/timediff,
lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
};
}
}
}
return diff;
}
sum_op_stats(timestamp, prev_stats)
sum_op_stats()
{
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
if (!prev_stats || prev_stats.timestamp >= timestamp)
for (const osd in this.state.osd.stats)
{
return sum_diff;
const cur = { ...this.state.osd.stats[osd], inode_stats: this.state.osd.inodestats[osd]||{} };
this.prev_stats.osd_diff[osd] = this.derive_osd_stats(
cur, this.prev_stats.osd_stats[osd], this.prev_stats.osd_diff[osd]
);
this.prev_stats.osd_stats[osd] = cur;
}
const tm = BigInt(timestamp - (prev_stats.timestamp || 0));
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
// Sum derived values instead of deriving summed
for (const osd in this.state.osd.stats)
{
const derived = this.derive_osd_stats(this.state.osd.stats[osd],
this.prev_stats && this.prev_stats.osd_stats && this.prev_stats.osd_stats[osd]);
for (const type in derived)
const derived = this.prev_stats.osd_diff[osd];
for (const type in sum_diff)
{
for (const op in derived[type])
for (const op in derived[type]||{})
{
for (const k in derived[type][op])
{
@@ -1497,10 +1602,14 @@ class Mon
break;
}
}
const pool_cfg = (this.state.config.pools[pool_id]||{});
if (!object_size)
{
object_size = (this.state.config.pools[pool_id]||{}).block_size ||
this.config.block_size || 131072;
object_size = pool_cfg.block_size || this.config.block_size || 131072;
}
if (pool_cfg.scheme !== 'replicated')
{
object_size *= ((pool_cfg.pg_size||0) - (pool_cfg.parity_chunks||0));
}
object_size = BigInt(object_size);
for (const pg_num in this.state.pg.stats[pool_id])
@@ -1522,14 +1631,14 @@ class Mon
return { object_counts, object_bytes };
}
sum_inode_stats(prev_stats, timestamp, prev_timestamp)
sum_inode_stats()
{
const inode_stats = {};
const inode_stub = () => ({
raw_used: 0n,
read: { count: 0n, usec: 0n, bytes: 0n },
write: { count: 0n, usec: 0n, bytes: 0n },
delete: { count: 0n, usec: 0n, bytes: 0n },
read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
});
const seen_pools = {};
for (const pool_id in this.state.config.pools)
@@ -1581,11 +1690,25 @@ class Mon
}
}
}
if (prev_stats && prev_timestamp >= timestamp)
for (const osd in this.prev_stats.osd_diff)
{
prev_stats = null;
for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
{
for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
{
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
for (const op of [ 'read', 'write', 'delete' ])
{
const op_diff = this.prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
const op_st = inode_stats[pool_id][inode_num][op];
op_st.bps += op_diff.bps;
op_st.iops += op_diff.iops;
op_st.lat += op_diff.lat;
op_st.n_osd = (op_st.n_osd || 0) + 1;
}
}
}
}
const tm = prev_stats ? BigInt(timestamp - prev_timestamp) : 0;
for (const pool_id in inode_stats)
{
for (const inode_num in inode_stats[pool_id])
@@ -1594,11 +1717,12 @@ class Mon
for (const op of [ 'read', 'write', 'delete' ])
{
const op_st = inode_stats[pool_id][inode_num][op];
const prev_st = prev_stats && prev_stats[pool_id] && prev_stats[pool_id][inode_num] && prev_stats[pool_id][inode_num][op];
op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0;
op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0;
op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0;
if (op_st.bps > 0 || op_st.iops > 0 || op_st.lat > 0)
if (op_st.n_osd)
{
op_st.lat /= BigInt(op_st.n_osd);
delete op_st.n_osd;
}
if (op_st.bps > 0 || op_st.iops > 0)
nonzero = true;
}
if (!nonzero && (!this.state.config.inode[pool_id] || !this.state.config.inode[pool_id][inode_num]))
@@ -1608,7 +1732,7 @@ class Mon
}
}
}
return inode_stats;
return { inode_stats, seen_pools };
}
serialize_bigints(obj)
@@ -1631,15 +1755,9 @@ class Mon
async update_total_stats()
{
const txn = [];
const timestamp = Date.now();
const { object_counts, object_bytes } = this.sum_object_counts();
let stats = this.sum_op_stats(timestamp, this.prev_stats);
let inode_stats = this.sum_inode_stats(
this.prev_stats ? this.prev_stats.inode_stats : null,
timestamp, this.prev_stats ? this.prev_stats.timestamp : null
);
this.prev_stats = { timestamp, inode_stats, osd_stats: { ...this.state.osd.stats } };
this.stat_time = Date.now();
let stats = this.sum_op_stats();
let { inode_stats, seen_pools } = this.sum_inode_stats();
stats.object_counts = object_counts;
stats.object_bytes = object_bytes;
stats = this.serialize_bigints(stats);
@@ -1669,12 +1787,22 @@ class Mon
}
for (const pool_id in this.state.pool.stats)
{
const pool_stats = { ...this.state.pool.stats[pool_id] };
this.serialize_bigints(pool_stats);
txn.push({ requestPut: {
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
value: b64(JSON.stringify(pool_stats)),
} });
if (!seen_pools[pool_id])
{
txn.push({ requestDeleteRange: {
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
} });
delete this.state.pool.stats[pool_id];
}
else
{
const pool_stats = { ...this.state.pool.stats[pool_id] };
this.serialize_bigints(pool_stats);
txn.push({ requestPut: {
key: b64(this.etcd_prefix+'/pool/stats/'+pool_id),
value: b64(JSON.stringify(pool_stats)),
} });
}
}
if (txn.length)
{
@@ -1774,10 +1902,18 @@ class Mon
{
retries = 1;
}
const tried = {};
while (retries < 0 || retry < retries)
{
retry++;
const base = this.pick_next_etcd();
let now = Date.now();
if (tried[base] && now-tried[base] < timeout)
{
await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
now = Date.now();
}
tried[base] = now;
const res = await POST(base+path, body, timeout);
if (res.error)
{

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "1.0.0",
"version": "1.2.0",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '0.9.2'
VERSION = '1.2.0'
LOG = logging.getLogger(__name__)

View File

@@ -0,0 +1,176 @@
diff --git a/block/Makefile.objs b/block/Makefile.objs
index d644bac60a..e404236291 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -19,6 +19,7 @@ block-obj-$(if $(CONFIG_LIBISCSI),y,n) += iscsi-opts.o
block-obj-$(CONFIG_LIBNFS) += nfs.o
block-obj-$(CONFIG_CURL) += curl.o
block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_VITASTOR) += vitastor.o
block-obj-$(CONFIG_GLUSTERFS) += gluster.o
block-obj-$(CONFIG_VXHS) += vxhs.o
block-obj-$(CONFIG_LIBSSH2) += ssh.o
@@ -39,6 +40,8 @@ curl.o-cflags := $(CURL_CFLAGS)
curl.o-libs := $(CURL_LIBS)
rbd.o-cflags := $(RBD_CFLAGS)
rbd.o-libs := $(RBD_LIBS)
+vitastor.o-cflags := $(VITASTOR_CFLAGS)
+vitastor.o-libs := $(VITASTOR_LIBS)
gluster.o-cflags := $(GLUSTERFS_CFLAGS)
gluster.o-libs := $(GLUSTERFS_LIBS)
vxhs.o-libs := $(VXHS_LIBS)
diff --git a/configure b/configure
index 0a19b033bc..58b7fbf24c 100755
--- a/configure
+++ b/configure
@@ -398,6 +398,7 @@ trace_backends="log"
trace_file="trace"
spice=""
rbd=""
+vitastor=""
smartcard=""
libusb=""
usb_redir=""
@@ -1213,6 +1214,10 @@ for opt do
;;
--enable-rbd) rbd="yes"
;;
+ --disable-vitastor) vitastor="no"
+ ;;
+ --enable-vitastor) vitastor="yes"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1601,6 +1606,7 @@ disabled with --disable-FEATURE, default is enabled if available:
vhost-crypto vhost-crypto acceleration support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -3594,6 +3600,27 @@ EOF
fi
fi
+##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+ cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+}
+EOF
+ vitastor_libs="-lvitastor_client"
+ if compile_prog "" "$vitastor_libs" ; then
+ vitastor=yes
+ else
+ if test "$vitastor" = "yes" ; then
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
+ fi
+ vitastor=no
+ fi
+fi
+
##########################################
# libssh2 probe
min_libssh2_version=1.2.8
@@ -5837,6 +5864,7 @@ echo "Trace output file $trace_file-<pid>"
fi
echo "spice support $spice $(echo_version $spice $spice_protocol_version/$spice_server_version)"
echo "rbd support $rbd"
+echo "vitastor support $vitastor"
echo "xfsctl support $xfs"
echo "smartcard support $smartcard"
echo "libusb $libusb"
@@ -6416,6 +6444,11 @@ if test "$rbd" = "yes" ; then
echo "RBD_CFLAGS=$rbd_cflags" >> $config_host_mak
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
fi
+if test "$vitastor" = "yes" ; then
+ echo "CONFIG_VITASTOR=m" >> $config_host_mak
+ echo "VITASTOR_CFLAGS=$vitastor_cflags" >> $config_host_mak
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
if test "$coroutine_pool" = "yes" ; then
diff --git a/qapi/block-core.json b/qapi/block-core.json
index c50517bff3..c780bb2c1c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2514,7 +2514,7 @@
'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
- 'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
+ 'quorum', 'raw', 'rbd', 'vitastor', 'replication', 'sheepdog', 'ssh',
'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
##
@@ -3217,6 +3217,28 @@
'*snap-id': 'uint32',
'*tag': 'str' } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -3547,6 +3569,7 @@
'rbd': 'BlockdevOptionsRbd',
'replication':'BlockdevOptionsReplication',
'sheepdog': 'BlockdevOptionsSheepdog',
+ 'vitastor': 'BlockdevOptionsVitastor',
'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
@@ -3991,6 +4014,17 @@
'*subformat': 'BlockdevVhdxSubformat',
'*block-state-zero': 'bool' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVpcSubformat:
#
@@ -4074,6 +4108,7 @@
'rbd': 'BlockdevCreateOptionsRbd',
'replication': 'BlockdevCreateNotSupported',
'sheepdog': 'BlockdevCreateOptionsSheepdog',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'ssh': 'BlockdevCreateOptionsSsh',
'throttle': 'BlockdevCreateNotSupported',
'vdi': 'BlockdevCreateOptionsVdi',

View File

@@ -0,0 +1,181 @@
Index: qemu-5.2+dfsg/qapi/block-core.json
===================================================================
--- qemu-5.2+dfsg.orig/qapi/block-core.json
+++ qemu-5.2+dfsg/qapi/block-core.json
@@ -2831,7 +2831,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
- 'sheepdog',
+ 'sheepdog', 'vitastor',
'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
##
@@ -3668,6 +3668,28 @@
'*tag': 'str' } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4015,6 +4037,7 @@
'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'defined(CONFIG_REPLICATION)' },
'sheepdog': 'BlockdevOptionsSheepdog',
+ 'vitastor': 'BlockdevOptionsVitastor',
'ssh': 'BlockdevOptionsSsh',
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
@@ -4404,6 +4427,17 @@
'*cluster-size' : 'size' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -4665,6 +4699,7 @@
'qed': 'BlockdevCreateOptionsQed',
'rbd': 'BlockdevCreateOptionsRbd',
'sheepdog': 'BlockdevCreateOptionsSheepdog',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
Index: qemu-5.2+dfsg/block/meson.build
===================================================================
--- qemu-5.2+dfsg.orig/block/meson.build
+++ qemu-5.2+dfsg/block/meson.build
@@ -76,6 +76,7 @@ foreach m : [
['CONFIG_LIBNFS', 'nfs', libnfs, 'nfs.c'],
['CONFIG_LIBSSH', 'ssh', libssh, 'ssh.c'],
['CONFIG_RBD', 'rbd', rbd, 'rbd.c'],
+ ['CONFIG_VITASTOR', 'vitastor', vitastor, 'vitastor.c'],
]
if config_host.has_key(m[0])
if enable_modules
Index: qemu-5.2+dfsg/configure
===================================================================
--- qemu-5.2+dfsg.orig/configure
+++ qemu-5.2+dfsg/configure
@@ -372,6 +372,7 @@ trace_backends="log"
trace_file="trace"
spice=""
rbd=""
+vitastor=""
smartcard=""
u2f="auto"
libusb=""
@@ -1263,6 +1264,10 @@ for opt do
;;
--enable-rbd) rbd="yes"
;;
+ --disable-vitastor) vitastor="no"
+ ;;
+ --enable-vitastor) vitastor="yes"
+ ;;
--disable-xfsctl) xfs="no"
;;
--enable-xfsctl) xfs="yes"
@@ -1827,6 +1832,7 @@ disabled with --disable-FEATURE, default
vhost-vdpa vhost-vdpa kernel backend support
spice spice
rbd rados block device (rbd)
+ vitastor vitastor block device
libiscsi iscsi support
libnfs nfs support
smartcard smartcard support (libcacard)
@@ -3719,6 +3725,27 @@ EOF
fi
##########################################
+# vitastor probe
+if test "$vitastor" != "no" ; then
+ cat > $TMPC <<EOF
+#include <vitastor_c.h>
+int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+}
+EOF
+ vitastor_libs="-lvitastor_client"
+ if compile_prog "" "$vitastor_libs" ; then
+ vitastor=yes
+ else
+ if test "$vitastor" = "yes" ; then
+ feature_not_found "vitastor block device" "Install vitastor-client-dev"
+ fi
+ vitastor=no
+ fi
+fi
+
+##########################################
# libssh probe
if test "$libssh" != "no" ; then
if $pkg_config --exists libssh; then
@@ -6456,6 +6483,10 @@ if test "$rbd" = "yes" ; then
echo "CONFIG_RBD=y" >> $config_host_mak
echo "RBD_LIBS=$rbd_libs" >> $config_host_mak
fi
+if test "$vitastor" = "yes" ; then
+ echo "CONFIG_VITASTOR=y" >> $config_host_mak
+ echo "VITASTOR_LIBS=$vitastor_libs" >> $config_host_mak
+fi
echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak
if test "$coroutine_pool" = "yes" ; then
Index: qemu-5.2+dfsg/meson.build
===================================================================
--- qemu-5.2+dfsg.orig/meson.build
+++ qemu-5.2+dfsg/meson.build
@@ -596,6 +596,10 @@ rbd = not_found
if 'CONFIG_RBD' in config_host
rbd = declare_dependency(link_args: config_host['RBD_LIBS'].split())
endif
+vitastor = not_found
+if 'CONFIG_VITASTOR' in config_host
+ vitastor = declare_dependency(link_args: config_host['VITASTOR_LIBS'].split())
+endif
glusterfs = not_found
if 'CONFIG_GLUSTERFS' in config_host
glusterfs = declare_dependency(compile_args: config_host['GLUSTERFS_CFLAGS'].split(),
@@ -2145,6 +2149,7 @@ endif
# TODO: add back protocol and server version
summary_info += {'spice support': config_host.has_key('CONFIG_SPICE')}
summary_info += {'rbd support': config_host.has_key('CONFIG_RBD')}
+summary_info += {'vitastor support': config_host.has_key('CONFIG_VITASTOR')}
summary_info += {'xfsctl support': config_host.has_key('CONFIG_XFS')}
summary_info += {'smartcard support': config_host.has_key('CONFIG_SMARTCARD')}
summary_info += {'U2F support': u2f.found()}

View File

@@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-0.9.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-0.9.2$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-1.2.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.2.0$(rpm --eval '%dist').tar.gz *

View File

@@ -22,7 +22,7 @@
Name: qemu-kvm
Version: 4.2.0
-Release: 29.vitastor%{?dist}.6
+Release: 32.vitastor%{?dist}.6
+Release: 34.vitastor%{?dist}.6
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY

View File

@@ -13,7 +13,7 @@
Name: qemu-kvm
Version: 4.2.0
-Release: 29%{?dist}.6
+Release: 32.vitastor%{?dist}.6
+Release: 33.vitastor%{?dist}.6
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY

View File

@@ -0,0 +1,103 @@
--- qemu-kvm-6.2.spec.orig 2023-07-18 13:52:57.636625440 +0000
+++ qemu-kvm-6.2.spec 2023-07-18 13:52:19.011683886 +0000
@@ -73,6 +73,7 @@ Requires: %{name}-hw-usbredir = %{epoch}
%endif \
Requires: %{name}-block-iscsi = %{epoch}:%{version}-%{release} \
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
Requires: %{name}-block-ssh = %{epoch}:%{version}-%{release}
# Macro to properly setup RHEL/RHEV conflict handling
@@ -83,7 +84,7 @@ Obsoletes: %1-rhev <= %{epoch}:%{version
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
Version: 6.2.0
-Release: 32%{?rcrel}%{?dist}
+Release: 32.vitastor%{?rcrel}%{?dist}
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
Epoch: 15
License: GPLv2 and GPLv2+ and CC-BY
@@ -122,6 +123,7 @@ Source37: tests_data_acpi_pc_SSDT.dimmpx
Source38: tests_data_acpi_q35_FACP.slic
Source39: tests_data_acpi_q35_SSDT.dimmpxm
Source40: tests_data_acpi_virt_SSDT.memhp
+Source41: qemu-vitastor.c
Patch0001: 0001-redhat-Adding-slirp-to-the-exploded-tree.patch
Patch0005: 0005-Initial-redhat-build.patch
@@ -652,6 +654,7 @@ Patch255: kvm-scsi-protect-req-aiocb-wit
Patch256: kvm-dma-helpers-prevent-dma_blk_cb-vs-dma_aio_cancel-rac.patch
# For bz#2090990 - qemu crash with error scsi_req_unref(SCSIRequest *): Assertion `req->refcount > 0' failed or scsi_dma_complete(void *, int): Assertion `r->req.aiocb != NULL' failed [8.7.0]
Patch257: kvm-virtio-scsi-reset-SCSI-devices-from-main-loop-thread.patch
+Patch258: qemu-6.2-vitastor.patch
BuildRequires: wget
BuildRequires: rpm-build
@@ -689,6 +692,7 @@ BuildRequires: libcurl-devel
BuildRequires: libssh-devel
BuildRequires: librados-devel
BuildRequires: librbd-devel
+BuildRequires: vitastor-client-devel
%if %{have_gluster}
# For gluster block driver
BuildRequires: glusterfs-api-devel
@@ -926,6 +930,14 @@ Install this package if you want to acce
using the rbd protocol.
+%package block-vitastor
+Summary: QEMU Vitastor block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-vitastor
+This package provides the additional Vitastor block driver for QEMU.
+
+
%package block-ssh
Summary: QEMU SSH block driver
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
@@ -979,6 +991,7 @@ This package provides usbredir support.
rm -fr slirp
mkdir slirp
%autopatch -p1
+cp %{SOURCE41} ./block/vitastor.c
%global qemu_kvm_build qemu_kvm_build
mkdir -p %{qemu_kvm_build}
@@ -994,7 +1007,7 @@ cp -f %{SOURCE40} tests/data/acpi/virt/S
# --build-id option is used for giving info to the debug packages.
buildldflags="VL_LDFLAGS=-Wl,--build-id"
-%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle
+%global block_drivers_list qcow2,raw,file,host_device,nbd,iscsi,rbd,vitastor,blkdebug,luks,null-co,nvme,copy-on-read,throttle
%if 0%{have_gluster}
%global block_drivers_list %{block_drivers_list},gluster
@@ -1149,9 +1162,7 @@ pushd %{qemu_kvm_build}
--firmwarepath=%{_prefix}/share/qemu-firmware \
--meson="git" \
--target-list="%{buildarch}" \
- --block-drv-rw-whitelist=%{block_drivers_list} \
--audio-drv-list= \
- --block-drv-ro-whitelist=vmdk,vhdx,vpc,https,ssh \
--with-coroutine=ucontext \
--with-git=git \
--tls-priority=@QEMU,SYSTEM \
@@ -1197,6 +1208,7 @@ pushd %{qemu_kvm_build}
%endif
--enable-pie \
--enable-rbd \
+ --enable-vitastor \
%if 0%{have_librdma}
--enable-rdma \
%endif
@@ -1794,6 +1806,9 @@ sh %{_sysconfdir}/sysconfig/modules/kvm.
%files block-rbd
%{_libdir}/qemu-kvm/block-rbd.so
+%files block-vitastor
+%{_libdir}/qemu-kvm/block-vitastor.so
+
%files block-ssh
%{_libdir}/qemu-kvm/block-ssh.so

View File

@@ -0,0 +1,93 @@
--- qemu-kvm-7.2.spec.orig 2023-06-22 13:56:19.000000000 +0000
+++ qemu-kvm-7.2.spec 2023-07-18 07:55:22.347090196 +0000
@@ -100,8 +100,6 @@
%endif
%global target_list %{kvm_target}-softmmu
-%global block_drivers_rw_list qcow2,raw,file,host_device,nbd,iscsi,rbd,blkdebug,luks,null-co,nvme,copy-on-read,throttle,compress
-%global block_drivers_ro_list vdi,vmdk,vhdx,vpc,https
%define qemudocdir %{_docdir}/%{name}
%global firmwaredirs "%{_datadir}/qemu-firmware:%{_datadir}/ipxe/qemu:%{_datadir}/seavgabios:%{_datadir}/seabios"
@@ -126,6 +124,7 @@ Requires: %{name}-device-usb-host = %{ep
Requires: %{name}-device-usb-redirect = %{epoch}:%{version}-%{release} \
%endif \
Requires: %{name}-block-rbd = %{epoch}:%{version}-%{release} \
+Requires: %{name}-block-vitastor = %{epoch}:%{version}-%{release}\
Requires: %{name}-audio-pa = %{epoch}:%{version}-%{release}
# Since SPICE is removed from RHEL-9, the following Obsoletes:
@@ -148,7 +147,7 @@ Obsoletes: %{name}-block-ssh <= %{epoch}
Summary: QEMU is a machine emulator and virtualizer
Name: qemu-kvm
Version: 7.2.0
-Release: 14%{?rcrel}%{?dist}%{?cc_suffix}.1
+Release: 14.vitastor%{?rcrel}%{?dist}%{?cc_suffix}.1
# Epoch because we pushed a qemu-1.0 package. AIUI this can't ever be dropped
# Epoch 15 used for RHEL 8
# Epoch 17 used for RHEL 9 (due to release versioning offset in RHEL 8.5)
@@ -171,6 +170,7 @@ Source28: 95-kvm-memlock.conf
Source30: kvm-s390x.conf
Source31: kvm-x86.conf
Source36: README.tests
+Source37: qemu-vitastor.c
Patch0004: 0004-Initial-redhat-build.patch
@@ -418,6 +418,7 @@ Patch134: kvm-target-i386-Fix-BZHI-instr
Patch135: kvm-intel-iommu-fail-DEVIOTLB_UNMAP-without-dt-mode.patch
# For bz#2203745 - Disk detach is unsuccessful while the guest is still booting [rhel-9.2.0.z]
Patch136: kvm-acpi-pcihp-allow-repeating-hot-unplug-requests.patch
+Patch137: qemu-7.2-vitastor.patch
%if %{have_clang}
BuildRequires: clang
@@ -449,6 +450,7 @@ BuildRequires: libcurl-devel
%if %{have_block_rbd}
BuildRequires: librbd-devel
%endif
+BuildRequires: vitastor-client-devel
# We need both because the 'stap' binary is probed for by configure
BuildRequires: systemtap
BuildRequires: systemtap-sdt-devel
@@ -642,6 +644,14 @@ using the rbd protocol.
%endif
+%package block-vitastor
+Summary: QEMU Vitastor block driver
+Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
+
+%description block-vitastor
+This package provides the additional Vitastor block driver for QEMU.
+
+
%package audio-pa
Summary: QEMU PulseAudio audio driver
Requires: %{name}-common%{?_isa} = %{epoch}:%{version}-%{release}
@@ -719,6 +729,7 @@ This package provides usbredir support.
%prep
%setup -q -n qemu-%{version}%{?rcstr}
%autopatch -p1
+cp %{SOURCE37} ./block/vitastor.c
%global qemu_kvm_build qemu_kvm_build
mkdir -p %{qemu_kvm_build}
@@ -946,6 +957,7 @@ run_configure \
%if %{have_block_rbd}
--enable-rbd \
%endif
+ --enable-vitastor \
%if %{have_librdma}
--enable-rdma \
%endif
@@ -1426,6 +1438,9 @@ useradd -r -u 107 -g qemu -G kvm -d / -s
%files block-rbd
%{_libdir}/%{name}/block-rbd.so
%endif
+%files block-vitastor
+%{_libdir}/%{name}/block-vitastor.so
+
%files audio-pa
%{_libdir}/%{name}/audio-pa.so

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.9.2.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.2.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.9.2
Version: 1.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.9.2.el7.tar.gz
Source0: vitastor-1.2.0.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.9.2.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.2.0.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.9.2
Version: 1.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.9.2.el8.tar.gz
Source0: vitastor-1.2.0.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-0.9.2.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.2.0.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 0.9.2
Version: 1.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-0.9.2.el9.tar.gz
Source0: vitastor-1.2.0.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="0.9.2")
add_definitions(-DVERSION="1.2.0")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
@@ -137,6 +137,7 @@ endif (${WITH_FIO})
add_library(vitastor_client SHARED
cluster_client.cpp
cluster_client_list.cpp
cluster_client_wb.cpp
vitastor_c.cpp
cli_common.cpp
cli_alloc_osd.cpp
@@ -300,7 +301,7 @@ target_link_libraries(test_crc32
add_executable(test_cluster_client
EXCLUDE_FROM_ALL
test_cluster_client.cpp
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
pg_states.cpp osd_ops.cpp cluster_client.cpp cluster_client_list.cpp cluster_client_wb.cpp msgr_op.cpp mock/messenger.cpp msgr_stop.cpp
etcd_state_client.cpp timerfd_manager.cpp str_util.cpp ../json11/json11.cpp
)
target_compile_definitions(test_cluster_client PUBLIC -D__MOCK__)

View File

@@ -19,8 +19,8 @@ bool string_to_addr(std::string str, bool parse_port, int default_port, struct s
if (p != std::string::npos && !(str.length() > 0 && str[p-1] == ']')) // "[ipv6]" which contains ':'
{
char null_byte = 0;
int n = sscanf(str.c_str()+p+1, "%d%c", &default_port, &null_byte);
if (n != 1 || default_port >= 0x10000)
int scanned = sscanf(str.c_str()+p+1, "%d%c", &default_port, &null_byte);
if (scanned != 1 || default_port >= 0x10000)
return false;
str = str.substr(0, p);
}

View File

@@ -143,34 +143,83 @@ uint64_t allocator::get_free_count()
return free;
}
// FIXME: Move to utils?
void bitmap_set(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity)
{
if (start == 0)
if (start == 0 && len == 32*bitmap_granularity)
*((uint32_t*)bitmap) = UINT32_MAX;
else if (start == 0 && len == 64*bitmap_granularity)
*((uint64_t*)bitmap) = UINT64_MAX;
else
{
if (len == 32*bitmap_granularity)
unsigned bit_start = start / bitmap_granularity;
unsigned bit_end = ((start + len) + bitmap_granularity - 1) / bitmap_granularity;
while (bit_start < bit_end)
{
*((uint32_t*)bitmap) = UINT32_MAX;
return;
}
else if (len == 64*bitmap_granularity)
{
*((uint64_t*)bitmap) = UINT64_MAX;
return;
}
}
unsigned bit_start = start / bitmap_granularity;
unsigned bit_end = ((start + len) + bitmap_granularity - 1) / bitmap_granularity;
while (bit_start < bit_end)
{
if (!(bit_start & 7) && bit_end >= bit_start+8)
{
((uint8_t*)bitmap)[bit_start / 8] = UINT8_MAX;
bit_start += 8;
}
else
{
((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);
bit_start++;
if (!(bit_start & 7) && bit_end >= bit_start+8)
{
((uint8_t*)bitmap)[bit_start / 8] = UINT8_MAX;
bit_start += 8;
}
else
{
((uint8_t*)bitmap)[bit_start / 8] |= 1 << (bit_start % 8);
bit_start++;
}
}
}
}
void bitmap_clear(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity)
{
if (start == 0 && len == 32*bitmap_granularity)
*((uint32_t*)bitmap) = 0;
else if (start == 0 && len == 64*bitmap_granularity)
*((uint64_t*)bitmap) = 0;
else
{
unsigned bit_start = start / bitmap_granularity;
unsigned bit_end = ((start + len) + bitmap_granularity - 1) / bitmap_granularity;
while (bit_start < bit_end)
{
if (!(bit_start & 7) && bit_end >= bit_start+8)
{
((uint8_t*)bitmap)[bit_start / 8] = 0;
bit_start += 8;
}
else
{
((uint8_t*)bitmap)[bit_start / 8] &= (0xFF ^ (1 << (bit_start % 8)));
bit_start++;
}
}
}
}
bool bitmap_check(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity)
{
bool r = false;
if (start == 0 && len == 32*bitmap_granularity)
r = !!*((uint32_t*)bitmap);
else if (start == 0 && len == 64*bitmap_granularity)
r = !!*((uint64_t*)bitmap);
else
{
unsigned bit_start = start / bitmap_granularity;
unsigned bit_end = ((start + len) + bitmap_granularity - 1) / bitmap_granularity;
while (bit_start < bit_end)
{
if (!(bit_start & 7) && bit_end >= bit_start+8)
{
r = r || !!((uint8_t*)bitmap)[bit_start / 8];
bit_start += 8;
}
else
{
r = r || (((uint8_t*)bitmap)[bit_start / 8] & (1 << (bit_start % 8)));
bit_start++;
}
}
}
return r;
}

View File

@@ -23,3 +23,5 @@ public:
};
void bitmap_set(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);
void bitmap_clear(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);
bool bitmap_check(void *bitmap, uint64_t start, uint64_t len, uint64_t bitmap_granularity);

View File

@@ -77,6 +77,7 @@ Output:
-EINVAL = invalid input parameters
-ENOENT = requested object/version does not exist for reads
-ENOSPC = no space left in the store for writes
-EDOM = checksum error.
- version = the version actually read or written
## BS_OP_DELETE

View File

@@ -40,10 +40,49 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
data_block_size = parse_size(config["block_size"]);
journal_device = config["journal_device"];
journal_offset = parse_size(config["journal_offset"]);
disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10);
journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
disk_alignment = parse_size(config["disk_alignment"]);
journal_block_size = parse_size(config["journal_block_size"]);
meta_block_size = parse_size(config["meta_block_size"]);
bitmap_granularity = parse_size(config["bitmap_granularity"]);
meta_format = stoull_full(config["meta_format"]);
if (config.find("data_io") == config.end() &&
config.find("meta_io") == config.end() &&
config.find("journal_io") == config.end())
{
bool cached_io_data = config["cached_io_data"] == "true" || config["cached_io_data"] == "yes" || config["cached_io_data"] == "1";
bool cached_io_meta = cached_io_data && (meta_device == data_device || meta_device == "") &&
config.find("cached_io_meta") == config.end() ||
config["cached_io_meta"] == "true" || config["cached_io_meta"] == "yes" || config["cached_io_meta"] == "1";
bool cached_io_journal = cached_io_meta && (journal_device == meta_device || journal_device == "") &&
config.find("cached_io_journal") == config.end() ||
config["cached_io_journal"] == "true" || config["cached_io_journal"] == "yes" || config["cached_io_journal"] == "1";
data_io = cached_io_data ? "cached" : "direct";
meta_io = cached_io_meta ? "cached" : "direct";
journal_io = cached_io_journal ? "cached" : "direct";
}
else
{
data_io = config.find("data_io") != config.end() ? config["data_io"] : "direct";
meta_io = config.find("meta_io") != config.end()
? config["meta_io"]
: (meta_device == data_device || meta_device == "" ? data_io : "direct");
journal_io = config.find("journal_io") != config.end()
? config["journal_io"]
: (journal_device == meta_device || journal_device == "" ? meta_io : "direct");
}
if (config["data_csum_type"] == "crc32c")
{
data_csum_type = BLOCKSTORE_CSUM_CRC32C;
}
else if (config["data_csum_type"] == "" || config["data_csum_type"] == "none")
{
data_csum_type = BLOCKSTORE_CSUM_NONE;
}
else
{
throw std::runtime_error("data_csum_type="+config["data_csum_type"]+" is unsupported, only \"crc32c\" and \"none\" are supported");
}
csum_block_size = parse_size(config["csum_block_size"]);
// Validate
if (!data_block_size)
{
@@ -91,7 +130,23 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
}
if (data_block_size % bitmap_granularity)
{
throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
throw std::runtime_error("Data block size must be a multiple of sparse write tracking granularity");
}
if (!data_csum_type)
{
csum_block_size = 0;
}
else if (!csum_block_size)
{
csum_block_size = bitmap_granularity;
}
if (csum_block_size && (csum_block_size % bitmap_granularity))
{
throw std::runtime_error("Checksum block size must be a multiple of sparse write tracking granularity");
}
if (csum_block_size && (data_block_size % csum_block_size))
{
throw std::runtime_error("Checksum block size must be a divisor of data block size");
}
if (meta_device == "")
{
@@ -110,7 +165,9 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
}
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
}
void blockstore_disk_t::calc_lengths(bool skip_meta_check)
@@ -160,6 +217,25 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
// required metadata size
block_count = data_len / data_block_size;
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
if (meta_format == BLOCKSTORE_META_FORMAT_V1 ||
!meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type)
{
uint64_t clean_entry_v0_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
uint64_t meta_v0_len = (1 + (block_count - 1 + meta_block_size / clean_entry_v0_size)
/ (meta_block_size / clean_entry_v0_size)) * meta_block_size;
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || meta_area_size >= meta_v0_len)
{
// Old metadata fits.
printf("Warning: Using old metadata format without checksums because the new format doesn't fit into provided area\n");
clean_entry_size = clean_entry_v0_size;
meta_len = meta_v0_len;
meta_format = BLOCKSTORE_META_FORMAT_V1;
}
else
meta_format = BLOCKSTORE_META_FORMAT_V2;
}
else
meta_format = BLOCKSTORE_META_FORMAT_V2;
if (!skip_meta_check && meta_area_size < meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
@@ -214,9 +290,19 @@ static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string n
}
}
static int bs_openmode(const std::string & mode)
{
if (mode == "directsync")
return O_DIRECT|O_SYNC;
else if (mode == "cached")
return O_SYNC;
else
return O_DIRECT;
}
void blockstore_disk_t::open_data()
{
data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
data_fd = open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
if (data_fd == -1)
{
throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
@@ -241,9 +327,9 @@ void blockstore_disk_t::open_data()
void blockstore_disk_t::open_meta()
{
if (meta_device != data_device)
if (meta_device != data_device || meta_io != data_io)
{
meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
meta_fd = open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
if (meta_fd == -1)
{
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
@@ -253,7 +339,7 @@ void blockstore_disk_t::open_meta()
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
}
if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
if (!disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
}
@@ -279,15 +365,15 @@ void blockstore_disk_t::open_meta()
void blockstore_disk_t::open_journal()
{
if (journal_device != meta_device)
if (journal_device != meta_device || journal_io != meta_io)
{
journal_fd = open(journal_device.c_str(), O_DIRECT|O_RDWR);
journal_fd = open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
if (journal_fd == -1)
{
throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
}
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
if (!disable_flock && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
if (!disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
}

View File

@@ -8,6 +8,10 @@
#include <string>
#include <map>
#define BLOCKSTORE_CSUM_NONE 0
// Lower byte of checksum type is its length
#define BLOCKSTORE_CSUM_CRC32C 0x104
struct blockstore_disk_t
{
std::string data_device, meta_device, journal_device;
@@ -21,17 +25,24 @@ struct blockstore_disk_t
uint64_t meta_block_size = 4096;
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
uint64_t bitmap_granularity = 4096;
// Data checksum type, BLOCKSTORE_CSUM_NONE or BLOCKSTORE_CSUM_CRC32C
uint32_t data_csum_type = BLOCKSTORE_CSUM_NONE;
// Checksum block size, must be a multiple of bitmap_granularity
uint32_t csum_block_size = 4096;
// By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
bool disable_flock = false;
// I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
// O_SYNC without O_DIRECT = use Linux page cache for reads and writes
std::string data_io, meta_io, journal_io;
int meta_fd = -1, data_fd = -1, journal_fd = -1;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
uint64_t data_offset, data_device_sect, data_device_size, data_len;
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
uint32_t block_order;
uint64_t block_count;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, clean_dyn_size = 0;
void parse_config(std::map<std::string, std::string> & config);
void open_data();
@@ -39,4 +50,13 @@ struct blockstore_disk_t
void open_journal();
void calc_lengths(bool skip_meta_check = false);
void close_all();
inline uint64_t dirty_dyn_size(uint64_t offset, uint64_t len)
{
// Checksums may be partial if write is not aligned with csum_block_size
return clean_entry_bitmap_size + (csum_block_size && len > 0
? ((offset+len+csum_block_size-1)/csum_block_size - offset/csum_block_size)
* (data_csum_type & 0xFF)
: 0);
}
};

File diff suppressed because it is too large Load Diff

View File

@@ -1,10 +1,22 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#define COPY_BUF_JOURNAL 1
#define COPY_BUF_DATA 2
#define COPY_BUF_ZERO 4
#define COPY_BUF_CSUM_FILL 8
#define COPY_BUF_COALESCED 16
#define COPY_BUF_META_BLOCK 32
#define COPY_BUF_JOURNALED_BIG 64
struct copy_buffer_t
{
uint64_t offset, len;
int copy_flags;
uint64_t offset, len, disk_offset;
uint64_t journal_sector; // only for reads: sector+1 if used and !journal.inmemory, otherwise 0
void *buf;
uint8_t *csum_buf;
int *dyn_data;
};
struct meta_sector_t
@@ -37,7 +49,7 @@ class journal_flusher_co
{
blockstore_impl_t *bs;
journal_flusher_t *flusher;
int wait_state, wait_count;
int wait_state, wait_count, wait_journal_count;
struct io_uring_sqe *sqe;
struct ring_data_t *data;
@@ -46,28 +58,39 @@ class journal_flusher_co
obj_ver_id cur;
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;
bool skip_copy, has_delete, has_writes;
std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it;
int i;
bool fill_incomplete, cleared_incomplete;
int read_to_fill_incomplete;
int copy_count;
uint64_t clean_loc, old_clean_loc;
uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
flusher_meta_write_t meta_old, meta_new;
bool clean_init_bitmap;
uint64_t clean_bitmap_offset, clean_bitmap_len;
void *new_clean_bitmap;
uint8_t *clean_init_dyn_ptr;
uint8_t *new_clean_bitmap;
uint64_t new_trim_pos;
// local: scan_dirty()
uint64_t offset, end_offset, submit_offset, submit_len;
friend class journal_flusher_t;
bool scan_dirty(int wait_base);
void scan_dirty();
bool read_dirty(int wait_base);
bool modify_meta_do_reads(int wait_base);
bool wait_meta_reads(int wait_base);
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
bool clear_incomplete_csum_block_bits(int wait_base);
void calc_block_checksums(uint32_t *new_data_csums, bool skip_overwrites);
void update_metadata_entry();
bool write_meta_block(flusher_meta_write_t & meta_block, int wait_base);
void update_clean_db();
void free_data_blocks();
bool fsync_batch(bool fsync_meta, int wait_base);
bool trim_journal(int wait_base);
void free_buffers();
public:
journal_flusher_co();
bool loop();
@@ -95,14 +118,16 @@ class journal_flusher_t
std::map<uint64_t, meta_sector_t> meta_sectors;
std::deque<object_id> flush_queue;
std::map<object_id, uint64_t> flush_versions;
std::map<object_id, uint64_t> flush_versions; // FIXME: consider unordered_map?
bool try_find_older(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
bool try_find_other(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
public:
journal_flusher_t(blockstore_impl_t *bs);
~journal_flusher_t();
void loop();
bool is_trim_wanted() { return trim_wanted; }
bool is_active();
void mark_trim_possible();
void request_trim();
@@ -111,4 +136,5 @@ public:
void unshift_flush(obj_ver_id oid, bool force);
void remove_flush(object_id oid);
void dump_diagnostics();
bool is_mutated(uint64_t clean_loc);
};

View File

@@ -13,6 +13,7 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
initialized = 0;
parse_config(config, true);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
try
{
dsk.open_data();
@@ -38,8 +39,8 @@ blockstore_impl_t::~blockstore_impl_t()
dsk.close_all();
if (metadata_buffer)
free(metadata_buffer);
if (clean_bitmap)
free(clean_bitmap);
if (clean_bitmaps)
free(clean_bitmaps);
}
bool blockstore_impl_t::is_started()
@@ -383,6 +384,10 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return;
}
if (op->opcode == BS_OP_SYNC)
{
unsynced_queued_ops = 0;
}
init_op(op);
submit_queue.push_back(op);
ringloop->wakeup();
@@ -392,6 +397,7 @@ void blockstore_impl_t::init_op(blockstore_op_t *op)
{
// Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->wait_for = 0;
PRIV(op)->op_state = 0;
PRIV(op)->pending_ops = 0;

View File

@@ -93,11 +93,10 @@
// "VITAstor"
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_VERSION_V1 1
#define BLOCKSTORE_META_FORMAT_V1 1
#define BLOCKSTORE_META_FORMAT_V2 2
// metadata header (superblock)
// FIXME: After adding the OSD superblock, add a key to metadata
// and journal headers to check if they belong to the same OSD
struct __attribute__((__packed__)) blockstore_meta_header_v1_t
{
uint64_t zero;
@@ -108,14 +107,29 @@ struct __attribute__((__packed__)) blockstore_meta_header_v1_t
uint32_t bitmap_granularity;
};
struct __attribute__((__packed__)) blockstore_meta_header_v2_t
{
uint64_t zero;
uint64_t magic;
uint64_t version;
uint32_t meta_block_size;
uint32_t data_block_size;
uint32_t bitmap_granularity;
uint32_t data_csum_type;
uint32_t csum_block_size;
uint32_t header_csum;
};
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
// per "clean" entry on disk with fixed metadata tables
// FIXME: maybe add crc32's to metadata
struct __attribute__((__packed__)) clean_disk_entry
{
object_id oid;
uint64_t version;
uint8_t bitmap[];
// Two more fields come after bitmap in metadata version 2:
// uint32_t data_csum[];
// uint32_t entry_csum;
};
// 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
@@ -125,7 +139,7 @@ struct __attribute__((__packed__)) clean_entry
uint64_t location;
};
// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry)
// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
struct __attribute__((__packed__)) dirty_entry
{
uint32_t state;
@@ -134,7 +148,7 @@ struct __attribute__((__packed__)) dirty_entry
uint32_t offset; // data offset within object (stripe)
uint32_t len; // data length
uint64_t journal_sector; // journal sector used for this entry
void* bitmap; // either external bitmap itself when it fits, or a pointer to it when it doesn't
void* dyn_data; // dynamic data: external bitmap and data block checksums. may be a pointer to the in-memory journal
};
// - Sync must be submitted after previous writes/deletes (not before!)
@@ -163,12 +177,23 @@ struct __attribute__((__packed__)) dirty_entry
// Suspend operation until there is some free space on the data device
#define WAIT_FREE 5
struct fulfill_read_t
struct used_clean_obj_t
{
uint64_t offset, len;
uint64_t journal_sector; // sector+1 if used and !journal.inmemory, otherwise 0
int refs;
bool was_freed; // was freed by a parallel flush?
bool was_changed; // was changed by a parallel flush?
};
// https://github.com/algorithm-ninja/cpp-btree
// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
#include "blockstore_init.h"
#include "blockstore_flush.h"
#define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); std::function<void (blockstore_op_t*)>(op->callback)(op)
@@ -181,10 +206,11 @@ struct blockstore_op_private_t
int op_state;
// Read
std::vector<fulfill_read_t> read_vec;
uint64_t clean_block_used;
std::vector<copy_buffer_t> read_vec;
// Sync, write
int min_flushed_journal_sector, max_flushed_journal_sector;
uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
// Write
struct iovec iov_zerofill[3];
@@ -194,19 +220,8 @@ struct blockstore_op_private_t
// Sync
std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
int sync_small_checked, sync_big_checked;
};
// https://github.com/algorithm-ninja/cpp-btree
// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
#include "blockstore_init.h"
#include "blockstore_flush.h"
typedef uint32_t pool_id_t;
typedef uint64_t pool_pg_id_t;
@@ -247,17 +262,20 @@ class blockstore_impl_t
int throttle_target_parallelism = 1;
// Minimum difference in microseconds between target and real execution times to throttle the response
int throttle_threshold_us = 50;
// Maximum writes between automatically added fsync operations
uint64_t autosync_writes = 128;
/******* END OF OPTIONS *******/
struct ring_consumer_t ring_consumer;
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
uint8_t *clean_bitmap = NULL;
uint8_t *clean_bitmaps = NULL;
blockstore_dirty_db_t dirty_db;
std::vector<blockstore_op_t*> submit_queue;
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
int unsynced_big_write_count = 0;
int unsynced_queued_ops = 0;
allocator *data_alloc = NULL;
uint8_t *zero_object;
@@ -267,6 +285,10 @@ class blockstore_impl_t
journal_flusher_t *flusher;
int big_to_flush = 0;
int write_iodepth = 0;
bool alloc_dyn_data = false;
// clean data blocks referenced by read operations
std::map<uint64_t, used_clean_obj_t> used_clean_objects;
bool live = false, queue_stall = false;
ring_loop_t *ringloop;
@@ -310,8 +332,30 @@ class blockstore_impl_t
// Read
int dequeue_read(blockstore_op_t *read_op);
int fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector);
void find_holes(std::vector<copy_buffer_t> & read_vec, uint32_t item_start, uint32_t item_end,
std::function<int(int, bool, uint32_t, uint32_t)> callback);
int fulfill_read(blockstore_op_t *read_op,
uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location,
uint64_t journal_sector, uint8_t *csum, int *dyn_data);
bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data,
uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, int *dyn_data,
uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf);
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
uint32_t item_state, uint64_t item_version);
void handle_read_event(ring_data_t *data, blockstore_op_t *op);
@@ -342,6 +386,7 @@ class blockstore_impl_t
int continue_rollback(blockstore_op_t *op);
void mark_rolled_back(const obj_ver_id & ov);
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
void free_dirty_dyn_data(dirty_entry & e);
// List
void process_list(blockstore_op_t *op);

View File

@@ -77,13 +77,20 @@ resume_1:
if (iszero((uint64_t*)metadata_buffer, bs->dsk.meta_block_size / sizeof(uint64_t)))
{
{
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)metadata_buffer;
blockstore_meta_header_v2_t *hdr = (blockstore_meta_header_v2_t *)metadata_buffer;
hdr->zero = 0;
hdr->magic = BLOCKSTORE_META_MAGIC_V1;
hdr->version = BLOCKSTORE_META_VERSION_V1;
hdr->version = bs->dsk.meta_format;
hdr->meta_block_size = bs->dsk.meta_block_size;
hdr->data_block_size = bs->dsk.data_block_size;
hdr->bitmap_granularity = bs->dsk.bitmap_granularity;
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
{
hdr->data_csum_type = bs->dsk.data_csum_type;
hdr->csum_block_size = bs->dsk.csum_block_size;
hdr->header_csum = 0;
hdr->header_csum = crc32c(0, hdr, sizeof(*hdr));
}
}
if (bs->readonly)
{
@@ -109,28 +116,62 @@ resume_1:
}
else
{
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)metadata_buffer;
if (hdr->zero != 0 ||
hdr->magic != BLOCKSTORE_META_MAGIC_V1 ||
hdr->version != BLOCKSTORE_META_VERSION_V1)
blockstore_meta_header_v2_t *hdr = (blockstore_meta_header_v2_t *)metadata_buffer;
if (hdr->zero != 0 || hdr->magic != BLOCKSTORE_META_MAGIC_V1 || hdr->version < BLOCKSTORE_META_FORMAT_V1)
{
printf(
"Metadata is corrupt or old version.\n"
" If this is a new OSD please zero out the metadata area before starting it.\n"
" If you need to upgrade from 0.5.x please request it via the issue tracker.\n"
"Metadata is corrupt or too old (pre-0.6.x).\n"
" If this is a new OSD, please zero out the metadata area before starting it.\n"
" If you need to upgrade from 0.5.x, convert metadata with vitastor-disk.\n"
);
exit(1);
}
if (hdr->version == BLOCKSTORE_META_FORMAT_V2)
{
uint32_t csum = hdr->header_csum;
hdr->header_csum = 0;
if (crc32c(0, hdr, sizeof(*hdr)) != csum)
{
printf("Metadata header is corrupt (checksum mismatch).\n");
exit(1);
}
hdr->header_csum = csum;
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V2;
}
else if (hdr->version == BLOCKSTORE_META_FORMAT_V1)
{
hdr->data_csum_type = 0;
hdr->csum_block_size = 0;
hdr->header_csum = 0;
// Enable compatibility mode - entries without checksums
bs->dsk.clean_entry_size = sizeof(clean_disk_entry) + bs->dsk.clean_entry_bitmap_size*2;
bs->dsk.meta_len = (1 + (bs->dsk.block_count - 1 + bs->dsk.meta_block_size / bs->dsk.clean_entry_size)
/ (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V1;
printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
}
else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
{
printf(
"Metadata format is too new for me (stored version is %lu, max supported %u).\n",
hdr->version, BLOCKSTORE_META_FORMAT_V2
);
exit(1);
}
if (hdr->meta_block_size != bs->dsk.meta_block_size ||
hdr->data_block_size != bs->dsk.data_block_size ||
hdr->bitmap_granularity != bs->dsk.bitmap_granularity)
hdr->bitmap_granularity != bs->dsk.bitmap_granularity ||
hdr->data_csum_type != bs->dsk.data_csum_type ||
hdr->csum_block_size != bs->dsk.csum_block_size)
{
printf(
"Configuration stored in metadata superblock"
" (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u)"
" differs from OSD configuration (%lu/%u/%lu).\n",
" (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u, data_csum_type=%u, csum_block_size=%u)"
" differs from OSD configuration (%lu/%u/%lu, %u/%u).\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity
hdr->data_csum_type, hdr->csum_block_size,
bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity,
bs->dsk.data_csum_type, bs->dsk.csum_block_size
);
exit(1);
}
@@ -279,12 +320,22 @@ bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_
for (uint64_t i = 0; i < max_i; i++)
{
clean_disk_entry *entry = (clean_disk_entry*)(buf + i*bs->dsk.clean_entry_size);
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
{
memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->dsk.clean_entry_bitmap_size, &entry->bitmap, 2*bs->dsk.clean_entry_bitmap_size);
}
if (entry->oid.inode > 0)
{
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
{
// Check entry crc32
uint32_t *entry_csum = (uint32_t*)((uint8_t*)entry + bs->dsk.clean_entry_size - 4);
if (*entry_csum != crc32c(0, entry, bs->dsk.clean_entry_size - 4))
{
printf("Metadata entry %lu is corrupt (checksum mismatch), skipping\n", done_cnt+i);
continue;
}
}
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
{
memcpy(bs->clean_bitmaps + (done_cnt+i) * 2 * bs->dsk.clean_entry_bitmap_size, &entry->bitmap, 2 * bs->dsk.clean_entry_bitmap_size);
}
auto & clean_db = bs->clean_db_shard(entry->oid);
auto clean_it = clean_db.find(entry->oid);
if (clean_it == clean_db.end() || clean_it->second.version < entry->version)
@@ -440,7 +491,9 @@ resume_1:
.size = sizeof(journal_entry_start),
.reserved = 0,
.journal_start = bs->journal.block_size,
.version = JOURNAL_VERSION,
.version = JOURNAL_VERSION_V2,
.data_csum_type = bs->dsk.data_csum_type,
.csum_block_size = bs->dsk.csum_block_size,
};
((journal_entry_start*)submitted_buf)->crc32 = je_crc32((journal_entry*)submitted_buf);
if (bs->readonly)
@@ -492,18 +545,37 @@ resume_1:
if (je_start->magic != JOURNAL_MAGIC ||
je_start->type != JE_START ||
je_crc32((journal_entry*)je_start) != je_start->crc32 ||
je_start->size != sizeof(journal_entry_start) && je_start->size != JE_START_LEGACY_SIZE)
je_start->size != JE_START_V0_SIZE && je_start->size != JE_START_V1_SIZE && je_start->size != JE_START_V2_SIZE)
{
// Entry is corrupt
fprintf(stderr, "First entry of the journal is corrupt\n");
fprintf(stderr, "First entry of the journal is corrupt or unsupported\n");
exit(1);
}
if (je_start->size == JE_START_LEGACY_SIZE || je_start->version != JOURNAL_VERSION)
if (je_start->size == JE_START_V0_SIZE ||
(je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
{
fprintf(
stderr, "The code only supports journal version %d, but it is %lu on disk."
" Please use the previous version to flush the journal before upgrading OSD\n",
JOURNAL_VERSION, je_start->size == JE_START_LEGACY_SIZE ? 0 : je_start->version
stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
" Please use vitastor-disk to rewrite the journal\n",
je_start->size == JE_START_V0_SIZE ? 0 : je_start->version
);
exit(1);
}
if (je_start->version == JOURNAL_VERSION_V1 ||
je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
{
je_start->data_csum_type = 0;
je_start->csum_block_size = 0;
}
if (je_start->data_csum_type != bs->dsk.data_csum_type ||
je_start->csum_block_size != bs->dsk.csum_block_size)
{
printf(
"Configuration stored in journal superblock (data_csum_type=%u, csum_block_size=%u)"
" differs from OSD configuration (%u/%u).\n",
je_start->data_csum_type, je_start->csum_block_size,
bs->dsk.data_csum_type, bs->dsk.csum_block_size
);
exit(1);
}
@@ -705,11 +777,14 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
snprintf(err, 1024, "BUG: calculated journal data offset (%08lx) != stored journal data offset (%08lx)", location, je->small_write.data_offset);
throw std::runtime_error(err);
}
uint32_t data_crc32 = 0;
small_write_data.clear();
if (location >= done_pos && location+je->small_write.len <= done_pos+len)
{
// data is within this buffer
data_crc32 = crc32c(0, (uint8_t*)buf + location - done_pos, je->small_write.len);
small_write_data.push_back((iovec){
.iov_base = (uint8_t*)buf + location - done_pos,
.iov_len = je->small_write.len,
});
}
else
{
@@ -724,7 +799,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
? location+je->small_write.len : done[i].pos+done[i].len);
uint64_t part_begin = (location < done[i].pos ? done[i].pos : location);
covered += part_end - part_begin;
data_crc32 = crc32c(data_crc32, (uint8_t*)done[i].buf + part_begin - done[i].pos, part_end - part_begin);
small_write_data.push_back((iovec){
.iov_base = (uint8_t*)done[i].buf + part_begin - done[i].pos,
.iov_len = part_end - part_begin,
});
}
}
if (covered < je->small_write.len)
@@ -734,12 +812,102 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
return 2;
}
}
if (data_crc32 != je->small_write.crc32_data)
bool data_csum_valid = true;
if (!bs->dsk.csum_block_size)
{
uint32_t data_crc32 = 0;
for (auto & sd: small_write_data)
{
data_crc32 = crc32c(data_crc32, sd.iov_base, sd.iov_len);
}
data_csum_valid = data_crc32 == je->small_write.crc32_data;
if (!data_csum_valid)
{
printf(
"Journal entry data is corrupt for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - data crc32 %x != %x\n",
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
je->small_write.offset, je->small_write.len,
data_crc32, je->small_write.crc32_data
);
}
}
else if (je->small_write.len > 0)
{
// FIXME: deduplicate with disk_tool_journal.cpp
// like in enqueue_write()
uint32_t start = je->small_write.offset / bs->dsk.csum_block_size;
uint32_t end = (je->small_write.offset+je->small_write.len-1) / bs->dsk.csum_block_size;
uint32_t data_csum_size = (end-start+1) * (bs->dsk.data_csum_type & 0xFF);
uint32_t required_size = sizeof(journal_entry_small_write) + bs->dsk.clean_entry_bitmap_size + data_csum_size;
if (je->size != required_size)
{
printf(
"Journal entry data has invalid size for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - should be %u bytes but is %u bytes\n",
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
je->small_write.offset, je->small_write.len,
required_size, je->size
);
data_csum_valid = false;
}
else
{
int sd_num = 0;
size_t sd_pos = 0;
uint32_t *block_csums = (uint32_t*)((uint8_t*)je + sizeof(journal_entry_small_write) + bs->dsk.clean_entry_bitmap_size);
for (uint32_t pos = start; pos <= end; pos++, block_csums++)
{
size_t block_left = (pos == start
? (start == end
? je->small_write.len
: bs->dsk.csum_block_size - je->small_write.offset%bs->dsk.csum_block_size)
: (pos < end
? bs->dsk.csum_block_size
: (je->small_write.offset + je->small_write.len)%bs->dsk.csum_block_size));
if (pos > start && pos == end && block_left == 0)
{
// full last block
block_left = bs->dsk.csum_block_size;
}
uint32_t block_crc32 = 0;
while (block_left > 0)
{
assert(sd_num < small_write_data.size());
if (small_write_data[sd_num].iov_len >= sd_pos+block_left)
{
block_crc32 = crc32c(block_crc32, (uint8_t*)small_write_data[sd_num].iov_base+sd_pos, block_left);
sd_pos += block_left;
break;
}
else
{
block_crc32 = crc32c(block_crc32, (uint8_t*)small_write_data[sd_num].iov_base+sd_pos, small_write_data[sd_num].iov_len-sd_pos);
block_left -= (small_write_data[sd_num].iov_len-sd_pos);
sd_pos = 0;
sd_num++;
}
}
if (block_crc32 != *block_csums)
{
printf(
"Journal entry data is corrupt for small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u - block %u crc32 %x != %x\n",
je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
je->small_write.oid.inode, je->small_write.oid.stripe, je->small_write.version,
je->small_write.offset, je->small_write.len,
pos, block_crc32, *block_csums
);
data_csum_valid = false;
break;
}
}
}
}
if (!data_csum_valid)
{
// journal entry is corrupt, stop here
// interesting thing is that we must clear the corrupt entry if we're not readonly,
// because we don't write next entries in the same journal block
printf("Journal entry data is corrupt (data crc32 %x != %x)\n", data_crc32, je->small_write.crc32_data);
memset((uint8_t*)buf + proc_pos - done_pos + pos, 0, bs->journal.block_size - pos);
bs->journal.next_free = prev_free;
init_write_buf = (uint8_t*)buf + proc_pos - done_pos;
@@ -755,11 +923,14 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.oid = je->small_write.oid,
.version = je->small_write.version,
};
void *bmp = NULL;
void *bmp_from = (uint8_t*)je + sizeof(journal_entry_small_write);
if (bs->dsk.clean_entry_bitmap_size <= sizeof(void*))
uint64_t dyn_size = bs->dsk.dirty_dyn_size(je->small_write.offset, je->small_write.len);
void *dyn = NULL;
void *dyn_from = (uint8_t*)je + sizeof(journal_entry_small_write);
if (!bs->alloc_dyn_data)
{
memcpy(&bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
// Bitmap without checksum is only 4 bytes for 128k objects, save it inline
// It can even contain 4 byte bitmap + 4 byte CRC32 for 4 kb writes :)
memcpy(&dyn, dyn_from, dyn_size);
}
else
{
@@ -767,8 +938,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
// allocations for entry bitmaps. This can only be fixed by using
// a patched map with dynamic entry size, but not the btree_map,
// because it doesn't keep iterators valid all the time.
bmp = malloc_or_die(bs->dsk.clean_entry_bitmap_size);
memcpy(bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
dyn = malloc_or_die(dyn_size+sizeof(int));
*((int*)dyn) = 1;
memcpy((uint8_t*)dyn+sizeof(int), dyn_from, dyn_size);
}
bs->dirty_db.emplace(ov, (dirty_entry){
.state = (BS_ST_SMALL_WRITE | BS_ST_SYNCED),
@@ -777,7 +949,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.offset = je->small_write.offset,
.len = je->small_write.len,
.journal_sector = proc_pos,
.bitmap = bmp,
.dyn_data = dyn,
});
bs->journal.used_sectors[proc_pos]++;
#ifdef BLOCKSTORE_DEBUG
@@ -836,11 +1008,13 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.oid = je->big_write.oid,
.version = je->big_write.version,
};
void *bmp = NULL;
void *bmp_from = (uint8_t*)je + sizeof(journal_entry_big_write);
if (bs->dsk.clean_entry_bitmap_size <= sizeof(void*))
uint64_t dyn_size = bs->dsk.dirty_dyn_size(je->big_write.offset, je->big_write.len);
void *dyn = NULL;
void *dyn_from = (uint8_t*)je + sizeof(journal_entry_big_write);
if (!bs->alloc_dyn_data)
{
memcpy(&bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
// Bitmap without checksum is only 4 bytes for 128k objects, save it inline
memcpy(&dyn, dyn_from, dyn_size);
}
else
{
@@ -848,8 +1022,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
// allocations for entry bitmaps. This can only be fixed by using
// a patched map with dynamic entry size, but not the btree_map,
// because it doesn't keep iterators valid all the time.
bmp = malloc_or_die(bs->dsk.clean_entry_bitmap_size);
memcpy(bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
dyn = malloc_or_die(dyn_size+sizeof(int));
*((int*)dyn) = 1;
memcpy((uint8_t*)dyn+sizeof(int), dyn_from, dyn_size);
}
auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){
.state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
@@ -858,7 +1033,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.offset = je->big_write.offset,
.len = je->big_write.len,
.journal_sector = proc_pos,
.bitmap = bmp,
.dyn_data = dyn,
}).first;
if (bs->data_alloc->get(je->big_write.location >> bs->dsk.block_order))
{

View File

@@ -50,6 +50,7 @@ class blockstore_init_journal
uint64_t next_free;
std::vector<bs_init_journal_done> done;
std::vector<obj_ver_id> double_allocs;
std::vector<iovec> small_write_data;
uint64_t journal_pos = 0;
uint64_t continue_pos = 0;
void *init_write_buf = NULL;

View File

@@ -17,6 +17,7 @@ blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
// Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
{
uint64_t prev_next = next_sector;
int required = entries_required;
while (1)
{
@@ -35,11 +36,19 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries
}
required -= fits;
next_in_pos += fits * size;
sectors_to_write++;
if (next_sector != prev_next || !sectors_to_write)
{
// Except the previous call to this function
sectors_to_write++;
}
}
else if (bs->journal.sector_info[next_sector].dirty)
{
sectors_to_write++;
if (next_sector != prev_next || !sectors_to_write)
{
// Except the previous call to this function
sectors_to_write++;
}
}
if (required <= 0)
{
@@ -189,6 +198,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
priv->pending_ops++;
if (!priv->min_flushed_journal_sector)
priv->min_flushed_journal_sector = 1+cur_sector;
assert(priv->min_flushed_journal_sector <= journal.sector_count);
priv->max_flushed_journal_sector = 1+cur_sector;
}
@@ -289,3 +299,31 @@ void journal_t::dump_diagnostics()
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
);
}
static uint64_t zero_page[4096];
uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad)
{
uint32_t r = prev_crc;
while (left_pad >= 4096)
{
r = crc32c(r, zero_page, 4096);
left_pad -= 4096;
}
if (left_pad > 0)
r = crc32c(r, zero_page, left_pad);
r = crc32c(r, buf, len);
while (right_pad >= 4096)
{
r = crc32c(r, zero_page, 4096);
right_pad -= 4096;
}
if (left_pad > 0)
r = crc32c(r, zero_page, right_pad);
return r;
}
uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad)
{
return crc32c(0, buf, len);
}

View File

@@ -8,7 +8,8 @@
#define MIN_JOURNAL_SIZE 4*1024*1024
#define JOURNAL_MAGIC 0x4A33
#define JOURNAL_VERSION 1
#define JOURNAL_VERSION_V1 1
#define JOURNAL_VERSION_V2 2
#define JOURNAL_BUFFER_SIZE 4*1024*1024
#define JOURNAL_ENTRY_HEADER_SIZE 16
@@ -32,7 +33,7 @@
#define JE_BIG_WRITE_INSTANT 0x08
#define JE_MAX 0x08
// crc32c comes first to ease calculation and is equal to crc32()
// crc32c comes first to ease calculation
struct __attribute__((__packed__)) journal_entry_start
{
uint32_t crc32;
@@ -42,8 +43,12 @@ struct __attribute__((__packed__)) journal_entry_start
uint32_t reserved;
uint64_t journal_start;
uint64_t version;
uint32_t data_csum_type;
uint32_t csum_block_size;
};
#define JE_START_LEGACY_SIZE 24
#define JE_START_V0_SIZE 24
#define JE_START_V1_SIZE 32
#define JE_START_V2_SIZE 40
struct __attribute__((__packed__)) journal_entry_small_write
{
@@ -59,10 +64,12 @@ struct __attribute__((__packed__)) journal_entry_small_write
// small_write entries contain <len> bytes of data which is stored in next sectors
// data_offset is its offset within journal
uint64_t data_offset;
uint32_t crc32_data;
uint32_t crc32_data; // zero when data_csum_type != 0
// small_write and big_write entries are followed by the "external" bitmap
// its size is dynamic and included in journal entry's <size> field
uint8_t bitmap[];
// and then data checksums if data_csum_type != 0
// uint32_t data_crc32c[];
};
struct __attribute__((__packed__)) journal_entry_big_write
@@ -80,6 +87,8 @@ struct __attribute__((__packed__)) journal_entry_big_write
// small_write and big_write entries are followed by the "external" bitmap
// its size is dynamic and included in journal entry's <size> field
uint8_t bitmap[];
// and then data checksums if data_csum_type != 0
// uint32_t data_crc32c[];
};
struct __attribute__((__packed__)) journal_entry_stable
@@ -218,3 +227,6 @@ struct blockstore_journal_check_t
};
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);
uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);

View File

@@ -19,6 +19,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
if (config.find("autosync_writes") != config.end())
{
autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
}
if (!max_flusher_count)
{
max_flusher_count = 256;
@@ -85,11 +89,13 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
immediate_commit = IMMEDIATE_SMALL;
}
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
inmemory_meta = config["inmemory_metadata"] != "false";
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
config["inmemory_metadata"] != "no";
journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
journal.inmemory = config["inmemory_journal"] != "false";
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
config["inmemory_journal"] != "no";
// Validate
if (journal.sector_count < 2)
{
@@ -133,19 +139,24 @@ void blockstore_impl_t::calc_lengths()
{
metadata_buffer = memalign(MEM_ALIGNMENT, dsk.meta_len);
if (!metadata_buffer)
throw std::runtime_error("Failed to allocate memory for the metadata");
throw std::runtime_error("Failed to allocate memory for the metadata ("+std::to_string(dsk.meta_len/1024/1024)+" MB)");
}
else if (dsk.clean_entry_bitmap_size)
else if (dsk.clean_entry_bitmap_size || dsk.data_csum_type)
{
clean_bitmap = (uint8_t*)malloc(dsk.block_count * 2*dsk.clean_entry_bitmap_size);
if (!clean_bitmap)
throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
clean_bitmaps = (uint8_t*)malloc(dsk.block_count * 2 * dsk.clean_entry_bitmap_size);
if (!clean_bitmaps)
{
throw std::runtime_error(
"Failed to allocate memory for the metadata sparse write bitmap ("+
std::to_string(dsk.block_count * 2 * dsk.clean_entry_bitmap_size / 1024 / 1024)+" MB)"
);
}
}
if (journal.inmemory)
{
journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
if (!journal.buffer)
throw std::runtime_error("Failed to allocate memory for journal");
throw std::runtime_error("Failed to allocate memory for journal ("+std::to_string(journal.len/1024/1024)+" MB)");
}
else
{

View File

@@ -1,6 +1,7 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <limits.h>
#include "blockstore_impl.h"
int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
@@ -8,12 +9,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
{
if (!len)
{
// Zero-length version - skip
return 1;
}
else if (IS_IN_FLIGHT(item_state))
{
// Write not finished yet - skip
// Zero-length read
return 1;
}
else if (IS_DELETE(item_state))
@@ -22,6 +18,7 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
memset(buf, 0, len);
return 1;
}
assert(!IS_IN_FLIGHT(item_state));
if (journal.inmemory && IS_JOURNAL(item_state))
{
memcpy(buf, (uint8_t*)journal.buffer + offset, len);
@@ -40,59 +37,115 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
return 1;
}
// FIXME I've seen a bug here so I want some tests
int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location, uint64_t journal_sector)
void blockstore_impl_t::find_holes(std::vector<copy_buffer_t> & read_vec,
uint32_t item_start, uint32_t item_end,
std::function<int(int, bool, uint32_t, uint32_t)> callback)
{
uint32_t cur_start = item_start;
if (cur_start < read_op->offset + read_op->len && item_end > read_op->offset)
auto cur_start = item_start;
int i = 0;
while (cur_start < item_end)
{
cur_start = cur_start < read_op->offset ? read_op->offset : cur_start;
item_end = item_end > read_op->offset + read_op->len ? read_op->offset + read_op->len : item_end;
auto it = PRIV(read_op)->read_vec.begin();
while (1)
// COPY_BUF_CSUM_FILL items are fake items inserted in the end, their offsets aren't in order
if (i >= read_vec.size() || read_vec[i].copy_flags & COPY_BUF_CSUM_FILL || read_vec[i].offset >= item_end)
{
for (; it != PRIV(read_op)->read_vec.end(); it++)
{
if (it->offset >= cur_start)
{
break;
}
else if (it->offset + it->len > cur_start)
{
cur_start = it->offset + it->len;
if (cur_start >= item_end)
{
goto endwhile;
}
}
}
if (it == PRIV(read_op)->read_vec.end() || it->offset > cur_start)
{
fulfill_read_t el = {
.offset = cur_start,
.len = it == PRIV(read_op)->read_vec.end() || it->offset >= item_end ? item_end-cur_start : it->offset-cur_start,
.journal_sector = journal_sector,
};
it = PRIV(read_op)->read_vec.insert(it, el);
if (!fulfill_read_push(read_op,
(uint8_t*)read_op->buf + el.offset - read_op->offset,
item_location + el.offset - item_start,
el.len, item_state, item_version))
{
return 0;
}
fulfilled += el.len;
}
cur_start = it->offset + it->len;
if (it == PRIV(read_op)->read_vec.end() || cur_start >= item_end)
{
break;
}
// Hole (at end): cur_start .. item_end
i += callback(i, false, cur_start, item_end);
break;
}
else if (read_vec[i].offset > cur_start)
{
// Hole: cur_start .. min(read_vec[i].offset, item_end)
auto cur_end = read_vec[i].offset > item_end ? item_end : read_vec[i].offset;
i += callback(i, false, cur_start, cur_end);
cur_start = cur_end;
}
else if (read_vec[i].offset + read_vec[i].len > cur_start)
{
// Allocated: cur_start .. min(read_vec[i].offset + read_vec[i].len, item_end)
auto cur_end = read_vec[i].offset + read_vec[i].len;
cur_end = cur_end > item_end ? item_end : cur_end;
i += callback(i, true, cur_start, cur_end);
cur_start = cur_end;
i++;
}
else
i++;
}
endwhile:
return 1;
}
int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op,
uint64_t &fulfilled, uint32_t item_start, uint32_t item_end, // FIXME: Rename item_* to dirty_*
uint32_t item_state, uint64_t item_version, uint64_t item_location,
uint64_t journal_sector, uint8_t *csum, int *dyn_data)
{
int r = 1;
if (item_start < read_op->offset + read_op->len && item_end > read_op->offset)
{
auto & rv = PRIV(read_op)->read_vec;
auto rd_start = item_start < read_op->offset ? read_op->offset : item_start;
auto rd_end = item_end > read_op->offset + read_op->len ? read_op->offset + read_op->len : item_end;
find_holes(rv, rd_start, rd_end, [&](int pos, bool alloc, uint32_t start, uint32_t end)
{
if (!r || alloc)
return 0;
if (!journal.inmemory && dsk.csum_block_size > dsk.bitmap_granularity && IS_JOURNAL(item_state) && !IS_DELETE(item_state))
{
uint32_t blk_begin = (start/dsk.csum_block_size) * dsk.csum_block_size;
blk_begin = blk_begin < item_start ? item_start : blk_begin;
uint32_t blk_end = ((end-1) / dsk.csum_block_size + 1) * dsk.csum_block_size;
blk_end = blk_end > item_end ? item_end : blk_end;
rv.push_back((copy_buffer_t){
.copy_flags = COPY_BUF_JOURNAL|COPY_BUF_CSUM_FILL,
.offset = blk_begin,
.len = blk_end-blk_begin,
.csum_buf = (csum + (blk_begin/dsk.csum_block_size -
item_start/dsk.csum_block_size) * (dsk.data_csum_type & 0xFF)),
.dyn_data = dyn_data,
});
if (dyn_data)
{
(*dyn_data)++;
}
// Submit the journal checksum block read
if (!read_checksum_block(read_op, 1, fulfilled, item_location - item_start))
{
r = 0;
}
return 0;
}
copy_buffer_t el = {
.copy_flags = (IS_JOURNAL(item_state) ? COPY_BUF_JOURNAL : COPY_BUF_DATA),
.offset = start,
.len = end-start,
.disk_offset = item_location + start - item_start,
.journal_sector = (IS_JOURNAL(item_state) ? journal_sector : 0),
.csum_buf = !csum ? NULL : (csum + (start - item_start) / dsk.csum_block_size * (dsk.data_csum_type & 0xFF)),
.dyn_data = dyn_data,
};
if (dyn_data)
{
(*dyn_data)++;
}
if (IS_BIG_WRITE(item_state))
{
// If we don't track it then we may IN THEORY read another object's data:
// submit read -> remove the object -> flush remove -> overwrite with another object -> finish read
// Very improbable, but possible
PRIV(read_op)->clean_block_used = 1;
}
rv.insert(rv.begin() + pos, el);
fulfilled += el.len;
if (!fulfill_read_push(read_op,
(uint8_t*)read_op->buf + el.offset - read_op->offset,
item_location + el.offset - item_start,
el.len, item_state, item_version))
{
r = 0;
}
return 1;
});
}
return r;
}
uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
@@ -106,10 +159,225 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
}
else
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
clean_entry_bitmap = (uint8_t*)(clean_bitmaps + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
return clean_entry_bitmap;
}
int blockstore_impl_t::fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end)
{
if (read_end == read_offset)
return 0;
int required = 0;
read_buf -= read_offset;
uint32_t last_block = (read_end-1)/dsk.csum_block_size;
uint32_t start_block = read_offset/dsk.csum_block_size;
uint32_t end_block = 0;
while (start_block <= last_block)
{
if (read_range_fulfilled(rv, fulfilled, read_buf, clean_entry_bitmap,
start_block*dsk.csum_block_size < read_offset ? read_offset : start_block*dsk.csum_block_size,
(start_block+1)*dsk.csum_block_size > read_end ? read_end : (start_block+1)*dsk.csum_block_size))
{
// read_range_fulfilled() also adds zero-filled areas
start_block++;
}
else
{
// Find a sequence of checksum blocks required to be read
end_block = start_block;
while ((end_block+1)*dsk.csum_block_size < read_end &&
!read_range_fulfilled(rv, fulfilled, read_buf, clean_entry_bitmap,
(end_block+1)*dsk.csum_block_size < read_offset ? read_offset : (end_block+1)*dsk.csum_block_size,
(end_block+2)*dsk.csum_block_size > read_end ? read_end : (end_block+2)*dsk.csum_block_size))
{
end_block++;
}
end_block++;
// OK, mark this range as required
rv.push_back((copy_buffer_t){
.copy_flags = COPY_BUF_CSUM_FILL | (from_journal ? COPY_BUF_JOURNALED_BIG : 0),
.offset = start_block*dsk.csum_block_size,
.len = (end_block-start_block)*dsk.csum_block_size,
// save clean_entry_bitmap if we're reading clean data from the journal
.csum_buf = from_journal ? clean_entry_bitmap : NULL,
.dyn_data = dyn_data,
});
if (dyn_data)
{
(*dyn_data)++;
}
start_block = end_block;
required++;
}
}
return required;
}
// read_buf should be == op->buf - op->offset
bool blockstore_impl_t::read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end)
{
bool all_done = true;
find_holes(rv, item_start, item_end, [&](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end)
{
if (alloc)
return 0;
int diff = 0;
uint32_t bmp_start = cur_start/dsk.bitmap_granularity;
uint32_t bmp_end = cur_end/dsk.bitmap_granularity;
uint32_t bmp_pos = bmp_start;
while (bmp_pos < bmp_end)
{
while (bmp_pos < bmp_end && !(clean_entry_bitmap[bmp_pos >> 3] & (1 << (bmp_pos & 0x7))))
bmp_pos++;
if (bmp_pos > bmp_start)
{
// zero fill
copy_buffer_t el = {
.copy_flags = COPY_BUF_ZERO,
.offset = bmp_start*dsk.bitmap_granularity,
.len = (bmp_pos-bmp_start)*dsk.bitmap_granularity,
};
rv.insert(rv.begin() + pos, el);
if (read_buf)
memset(read_buf + el.offset, 0, el.len);
fulfilled += el.len;
diff++;
}
bmp_start = bmp_pos;
while (bmp_pos < bmp_end && (clean_entry_bitmap[bmp_pos >> 3] & (1 << (bmp_pos & 0x7))))
bmp_pos++;
if (bmp_pos > bmp_start)
{
// something is to be read
all_done = false;
}
bmp_start = bmp_pos;
}
return diff;
});
return all_done;
}
bool blockstore_impl_t::read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc)
{
auto & rv = PRIV(op)->read_vec;
auto *vi = &rv[rv.size()-rv_pos];
uint32_t item_start = vi->offset, item_end = vi->offset+vi->len;
uint32_t fill_size = 0;
int n_iov = 0;
find_holes(rv, item_start, item_end, [&](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end)
{
if (alloc)
{
fill_size += cur_end-cur_start;
n_iov++;
}
else
{
if (cur_start < op->offset)
{
fill_size += op->offset-cur_start;
n_iov++;
cur_start = op->offset;
}
if (cur_end > op->offset+op->len)
{
fill_size += cur_end-(op->offset+op->len);
n_iov++;
cur_end = op->offset+op->len;
}
if (cur_end > cur_start)
{
n_iov++;
}
}
return 0;
});
void *buf = memalign_or_die(MEM_ALIGNMENT, fill_size + n_iov*sizeof(struct iovec));
iovec *iov = (struct iovec*)((uint8_t*)buf+fill_size);
n_iov = 0;
fill_size = 0;
find_holes(rv, item_start, item_end, [&](int pos, bool alloc, uint32_t cur_start, uint32_t cur_end)
{
int res = 0;
if (alloc)
{
iov[n_iov++] = (struct iovec){ (uint8_t*)buf+fill_size, cur_end-cur_start };
fill_size += cur_end-cur_start;
}
else
{
if (cur_start < op->offset)
{
iov[n_iov++] = (struct iovec){ (uint8_t*)buf+fill_size, op->offset-cur_start };
fill_size += op->offset-cur_start;
cur_start = op->offset;
}
auto lim_end = cur_end > op->offset+op->len ? op->offset+op->len : cur_end;
if (lim_end > cur_start)
{
iov[n_iov++] = (struct iovec){ (uint8_t*)op->buf+cur_start-op->offset, lim_end-cur_start };
rv.insert(rv.begin() + pos, (copy_buffer_t){
.copy_flags = COPY_BUF_DATA,
.offset = cur_start,
.len = lim_end-cur_start,
});
fulfilled += lim_end-cur_start;
res++;
}
if (cur_end > op->offset+op->len)
{
iov[n_iov++] = (struct iovec){ (uint8_t*)buf+fill_size, cur_end - (op->offset+op->len) };
fill_size += cur_end - (op->offset+op->len);
cur_end = op->offset+op->len;
}
}
return res;
});
vi = &rv[rv.size()-rv_pos];
// Save buf into read_vec too but in a creepy way
// FIXME: Shit, something else should be invented %)
*vi = (copy_buffer_t){
.copy_flags = vi->copy_flags,
.offset = vi->offset,
.len = ((uint64_t)n_iov << 32) | fill_size,
.disk_offset = clean_loc + item_start,
.buf = (uint8_t*)buf,
.csum_buf = vi->csum_buf,
.dyn_data = vi->dyn_data,
};
int submit_fd = (vi->copy_flags & COPY_BUF_JOURNAL ? dsk.journal_fd : dsk.data_fd);
uint64_t submit_offset = (vi->copy_flags & COPY_BUF_JOURNAL ? journal.offset : dsk.data_offset);
uint32_t d_pos = 0;
for (int n_pos = 0; n_pos < n_iov; n_pos += IOV_MAX)
{
int n_cur = n_iov-n_pos < IOV_MAX ? n_iov-n_pos : IOV_MAX;
BS_SUBMIT_GET_SQE(sqe, data);
PRIV(op)->pending_ops++;
my_uring_prep_readv(sqe, submit_fd, iov + n_pos, n_cur, submit_offset + clean_loc + item_start + d_pos);
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
if (n_pos > 0 || n_pos + IOV_MAX < n_iov)
{
uint32_t d_len = 0;
for (int i = 0; i < IOV_MAX; i++)
d_len += iov[n_pos+i].iov_len;
data->iov.iov_len = d_len;
d_pos += d_len;
}
else
data->iov.iov_len = item_end-item_start;
}
if (!(vi->copy_flags & COPY_BUF_JOURNAL))
{
// Reads running parallel to flushes of the same clean block may read
// a mixture of old and new data. So we don't verify checksums for such blocks.
PRIV(op)->clean_block_used = 1;
}
return true;
}
int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{
auto & clean_db = clean_db_shard(read_op->oid);
@@ -131,6 +399,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
}
uint64_t fulfilled = 0;
PRIV(read_op)->pending_ops = 0;
PRIV(read_op)->clean_block_used = 0;
auto & rv = PRIV(read_op)->read_vec;
uint64_t result_version = 0;
if (dirty_found)
{
@@ -148,23 +418,36 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
FINISH_OP(read_op);
return 2;
}
int *dyn_data = (int*)(dsk.csum_block_size > 0 && alloc_dyn_data ? dirty.dyn_data : NULL);
uint8_t *bmp_ptr = (alloc_dyn_data
? (uint8_t*)dirty.dyn_data + sizeof(int) : (uint8_t*)&dirty.dyn_data);
if (!result_version)
{
result_version = dirty_it->first.version;
if (read_op->bitmap)
{
void *bmp_ptr = (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
}
// If inmemory_journal is false, journal trim will have to wait until the read is completed
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset),
(IS_JOURNAL(dirty.state) ? dirty.journal_sector+1 : 0)))
if (!IS_JOURNAL(dirty.state))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
// Read from data disk, possibly checking checksums
if (!fulfill_clean_read(read_op, fulfilled, bmp_ptr, dyn_data,
dirty.offset, dirty.offset+dirty.len, dirty.location, dirty_it->first.version))
{
goto undo_read;
}
}
else
{
// Copy from memory or read from journal, possibly checking checksums
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
dirty.state, dirty_it->first.version, dirty.location, dirty.journal_sector+1,
journal.inmemory ? NULL : bmp_ptr+dsk.clean_entry_bitmap_size, dyn_data))
{
goto undo_read;
}
}
}
if (fulfilled == read_op->len || dirty_it == dirty_db.begin())
@@ -187,50 +470,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
}
if (fulfilled < read_op->len)
{
if (!dsk.clean_entry_bitmap_size)
if (!fulfill_clean_read(read_op, fulfilled, NULL, NULL, 0, dsk.data_block_size,
clean_it->second.location, clean_it->second.version))
{
if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size,
(BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location, 0))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
}
else
{
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0);
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
while (bmp_start < bmp_size)
{
while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
// fill with zeroes
assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
}
bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
clean_it->second.location + bmp_start * dsk.bitmap_granularity, 0))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
bmp_start = bmp_end;
}
}
goto undo_read;
}
}
}
@@ -242,11 +485,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
FINISH_OP(read_op);
return 2;
}
if (fulfilled < read_op->len)
{
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0));
assert(fulfilled == read_op->len);
}
assert(fulfilled == read_op->len);
read_op->version = result_version;
if (!PRIV(read_op)->pending_ops)
{
@@ -271,6 +510,309 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
}
read_op->retval = 0;
return 2;
undo_read:
// need to wait. undo added requests, don't dequeue op
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
for (auto & vec: rv)
{
if ((vec.copy_flags & COPY_BUF_CSUM_FILL) && vec.buf)
{
free(vec.buf);
vec.buf = NULL;
}
if (vec.dyn_data && --(*vec.dyn_data) == 0) // refcount
{
free(vec.dyn_data);
vec.dyn_data = NULL;
}
}
}
rv.clear();
return 0;
}
int blockstore_impl_t::pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
// FIXME Passing dirty_entry& would be nicer
uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, int *dyn_data,
uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf)
{
if (offset % dsk.csum_block_size || submit_len % dsk.csum_block_size)
{
if (offset < blk_end)
{
// Already being read as a part of the previous checksum block series
cp.buf = blk_buf + offset - blk_begin;
cp.copy_flags |= COPY_BUF_COALESCED;
if (offset+submit_len > blk_end)
cp.len = blk_end-offset;
return 2;
}
else
{
// We don't use fill_partial_checksum_blocks for journal because journal writes never have holes (internal bitmap)
blk_begin = (offset/dsk.csum_block_size) * dsk.csum_block_size;
blk_begin = blk_begin < dirty_offset ? dirty_offset : blk_begin;
blk_end = ((offset+submit_len-1)/dsk.csum_block_size + 1) * dsk.csum_block_size;
blk_end = blk_end > dirty_end ? dirty_end : blk_end;
if (blk_begin < offset || blk_end > offset+submit_len)
{
blk_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, blk_end-blk_begin);
cp.buf = blk_buf + offset - blk_begin;
cp.copy_flags |= COPY_BUF_COALESCED;
rv.push_back((copy_buffer_t){
.copy_flags = COPY_BUF_JOURNAL|COPY_BUF_CSUM_FILL,
.offset = blk_begin,
.len = blk_end-blk_begin,
.disk_offset = dirty_loc + blk_begin - dirty_offset,
.buf = blk_buf,
.csum_buf = (csum_ptr + (blk_begin/dsk.csum_block_size -
dirty_offset/dsk.csum_block_size) * (dsk.data_csum_type & 0xFF)),
.dyn_data = dyn_data,
});
if (dyn_data)
{
(*dyn_data)++;
}
return 1;
}
}
}
return 0;
}
bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver)
{
bool from_journal = clean_entry_bitmap != NULL;
if (!clean_entry_bitmap)
{
// NULL clean_entry_bitmap means we're reading from data, not from the journal,
// and the bitmap location is obvious
clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
}
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
auto & rv = PRIV(read_op)->read_vec;
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, from_journal,
(uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
if (!inmemory_meta && !from_journal && req > 0)
{
// Read checksums from disk
uint8_t *csum_buf = read_clean_meta_block(read_op, clean_loc, rv.size()-req);
for (int i = req; i > 0; i--)
{
rv[rv.size()-i].csum_buf = csum_buf;
}
}
for (int i = req; i > 0; i--)
{
if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
{
return false;
}
}
PRIV(read_op)->clean_block_used = req > 0;
}
else if (from_journal)
{
// Don't scan bitmap - journal writes don't have holes (internal bitmap)!
uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + dsk.clean_entry_bitmap_size +
item_start/dsk.csum_block_size*(dsk.data_csum_type & 0xFF));
if (!fulfill_read(read_op, fulfilled, item_start, item_end,
(BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_loc + item_start, 0, csum, dyn_data))
{
return false;
}
if (item_start > 0 && fulfilled < read_op->len)
{
// fill with zeroes
assert(fulfill_read(read_op, fulfilled, 0, item_start, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL, NULL));
}
if (item_end < dsk.data_block_size && fulfilled < read_op->len)
{
// fill with zeroes
assert(fulfill_read(read_op, fulfilled, item_end, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL, NULL));
}
}
else
{
bool csum_done = !dsk.csum_block_size || inmemory_meta;
uint8_t *csum_buf = clean_entry_bitmap;
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
while (bmp_start < bmp_size)
{
while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
// fill with zeroes
assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL, NULL));
}
bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
if (!csum_done)
{
// Read checksums from disk
csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
csum_done = true;
}
uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + 2*dsk.clean_entry_bitmap_size + bmp_start*(dsk.data_csum_type & 0xFF));
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
{
return false;
}
bmp_start = bmp_end;
}
}
}
// Increment reference counter if clean data is being read from the disk
if (PRIV(read_op)->clean_block_used)
{
auto & uo = used_clean_objects[clean_loc];
uo.refs++;
if (dsk.csum_block_size && flusher->is_mutated(clean_loc))
uo.was_changed = true;
PRIV(read_op)->clean_block_used = clean_loc;
}
return true;
}
uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t clean_loc, int rv_pos)
{
auto & rv = PRIV(op)->read_vec;
auto sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
auto pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
rv.insert(rv.begin()+rv_pos, (copy_buffer_t){
.copy_flags = COPY_BUF_META_BLOCK|COPY_BUF_CSUM_FILL,
.offset = pos,
.buf = buf,
});
BS_SUBMIT_GET_SQE(sqe, data);
data->iov = (struct iovec){ buf, dsk.meta_block_size };
PRIV(op)->pending_ops++;
my_uring_prep_readv(sqe, dsk.meta_fd, &data->iov, 1, dsk.meta_offset + dsk.meta_block_size + sector);
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
// return pointer to checksums + bitmap
return buf + pos + sizeof(clean_disk_entry);
}
bool blockstore_impl_t::verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
{
assert(!(offset % dsk.csum_block_size));
uint32_t *csums = (uint32_t*)csum_buf;
uint32_t block_csum = 0;
uint32_t block_done = 0;
uint32_t block_num = clean_entry_bitmap ? offset/dsk.csum_block_size : 0;
uint32_t bmp_pos = offset/dsk.bitmap_granularity;
for (int i = 0; i < n_iov; i++)
{
uint32_t pos = 0;
while (pos < iov[i].iov_len)
{
uint32_t start = pos;
uint8_t bit = (clean_entry_bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1;
while (pos < iov[i].iov_len && ((clean_entry_bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1) == bit)
{
pos += dsk.bitmap_granularity;
bmp_pos++;
}
uint32_t len = pos-start;
auto buf = (uint8_t*)iov[i].iov_base+start;
while (block_done+len >= dsk.csum_block_size)
{
auto cur_len = dsk.csum_block_size-block_done;
block_csum = crc32c_pad(block_csum, buf, bit ? cur_len : 0, bit ? 0 : cur_len, 0);
if (block_csum != csums[block_num])
{
if (bad_block_cb)
bad_block_cb(block_num*dsk.csum_block_size, block_csum, csums[block_num]);
else
return false;
}
block_num++;
buf += cur_len;
len -= cur_len;
block_done = block_csum = 0;
}
if (len > 0)
{
block_csum = crc32c_pad(block_csum, buf, bit ? len : 0, bit ? 0 : len, 0);
block_done += len;
}
}
}
assert(!block_done);
return true;
}
bool blockstore_impl_t::verify_journal_checksums(uint8_t *csums, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
{
uint32_t block_csum = 0;
uint32_t block_num = 0;
uint32_t block_done = offset%dsk.csum_block_size;
for (int i = 0; i < n_iov; i++)
{
uint32_t len = iov[i].iov_len;
auto buf = (uint8_t*)iov[i].iov_base;
while (block_done+len >= dsk.csum_block_size)
{
auto cur_len = dsk.csum_block_size-block_done;
block_csum = crc32c(block_csum, buf, cur_len);
if (block_csum != ((uint32_t*)csums)[block_num])
{
if (bad_block_cb)
bad_block_cb(block_num*dsk.csum_block_size, block_csum, ((uint32_t*)csums)[block_num]);
else
return false;
}
block_num++;
buf += cur_len;
len -= cur_len;
block_done = block_csum = 0;
}
if (len > 0)
{
block_csum = crc32c(block_csum, buf, len);
block_done += len;
}
}
if (block_done > 0 && block_csum != ((uint32_t*)csums)[block_num])
{
if (bad_block_cb)
bad_block_cb(block_num*dsk.csum_block_size, block_csum, ((uint32_t*)csums)[block_num]);
else
return false;
}
return true;
}
bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb)
{
uint32_t offset = clean_loc % dsk.data_block_size;
if (from_journal)
return verify_padded_checksums(dyn_data, dyn_data + dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order;
if (!dyn_data)
{
assert(inmemory_meta);
dyn_data = get_clean_entry_bitmap(clean_loc, 0);
}
return verify_padded_checksums(dyn_data, dyn_data + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
}
void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op)
@@ -284,6 +826,139 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
}
if (PRIV(op)->pending_ops == 0)
{
if (dsk.csum_block_size)
{
// verify checksums if required
auto & rv = PRIV(op)->read_vec;
void *meta_block = NULL;
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
for (int i = rv.size()-1; i >= 0 && (rv[i].copy_flags & COPY_BUF_CSUM_FILL); i--)
{
if (rv[i].copy_flags & COPY_BUF_META_BLOCK)
{
// Metadata read. Skip
assert(!meta_block);
meta_block = rv[i].buf;
rv[i].buf = NULL;
continue;
}
struct iovec *iov = (struct iovec*)((uint8_t*)rv[i].buf + (rv[i].len & 0xFFFFFFFF));
int n_iov = rv[i].len >> 32;
bool ok = true;
if (rv[i].copy_flags & COPY_BUF_JOURNAL)
{
// SMALL_WRITE from journal
verify_journal_checksums(
rv[i].csum_buf, rv[i].offset, iov, n_iov,
[&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
{
ok = false;
printf(
"Checksum mismatch in object %lx:%lx v%lu in journal at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
op->oid.inode, op->oid.stripe, op->version,
rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
);
}
);
}
else
{
// BIG_WRITE from journal or clean data
// Do not verify checksums if the data location is/was mutated by flushers
auto & uo = used_clean_objects.at((rv[i].disk_offset >> dsk.block_order) << dsk.block_order);
if (!uo.was_changed)
{
verify_clean_padded_checksums(
op, rv[i].disk_offset, rv[i].csum_buf, (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG), iov, n_iov,
[&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
{
ok = false;
printf(
"Checksum mismatch in object %lx:%lx v%lu in %s data at 0x%lx, checksum block #%u: got %08x, expected %08x\n",
op->oid.inode, op->oid.stripe, op->version,
(rv[i].copy_flags & COPY_BUF_JOURNALED_BIG ? "redirect-write" : "clean"),
rv[i].disk_offset, bad_block / dsk.csum_block_size, calc_csum, stored_csum
);
}
);
}
}
if (!ok)
{
op->retval = -EDOM;
}
free(rv[i].buf);
rv[i].buf = NULL;
if (rv[i].dyn_data && --(*rv[i].dyn_data) == 0) // refcount
{
free(rv[i].dyn_data);
rv[i].dyn_data = NULL;
}
}
}
else
{
for (auto & vec: rv)
{
if (vec.copy_flags & COPY_BUF_META_BLOCK)
{
// Metadata read. Skip
assert(!meta_block);
meta_block = vec.buf;
vec.buf = NULL;
continue;
}
if (vec.csum_buf)
{
uint32_t *csum = (uint32_t*)vec.csum_buf;
for (size_t p = 0; p < vec.len; p += dsk.csum_block_size, csum++)
{
if (crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size) != *csum)
{
// checksum error
printf(
"Checksum mismatch in object %lx:%lx v%lu in %s area at offset 0x%lx+0x%lx: %08x vs %08x\n",
op->oid.inode, op->oid.stripe, op->version,
(vec.copy_flags & COPY_BUF_JOURNAL) ? "journal" : "data", vec.disk_offset, p,
crc32c(0, (uint8_t*)op->buf + vec.offset - op->offset + p, dsk.csum_block_size), *csum
);
op->retval = -EDOM;
break;
}
}
}
if (vec.dyn_data && --(*vec.dyn_data) == 0) // refcount
{
free(vec.dyn_data);
vec.dyn_data = NULL;
}
}
}
if (meta_block)
{
// Free after checking
free(meta_block);
meta_block = NULL;
}
}
if (PRIV(op)->clean_block_used)
{
// Release clean data block
auto uo_it = used_clean_objects.find(PRIV(op)->clean_block_used);
if (uo_it != used_clean_objects.end())
{
uo_it->second.refs--;
if (uo_it->second.refs <= 0)
{
if (uo_it->second.was_freed)
{
data_alloc->set(PRIV(op)->clean_block_used, false);
}
used_clean_objects.erase(uo_it);
}
}
}
if (!journal.inmemory)
{
// Release journal sector usage
@@ -324,8 +999,9 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
*result_version = dirty_it->first.version;
if (bitmap)
{
void *bmp_ptr = (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
void *dyn_ptr = (alloc_dyn_data
? (uint8_t*)dirty_it->second.dyn_data + sizeof(int) : (uint8_t*)&dirty_it->second.dyn_data);
memcpy(bitmap, dyn_ptr, dsk.clean_entry_bitmap_size);
}
return 0;
}

View File

@@ -218,7 +218,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
auto used = --journal.used_sectors[dirty_it->second.journal_sector];
#ifdef BLOCKSTORE_DEBUG
printf(
"remove usage of journal offset %08lx by %lx:%lx v%lu (%d refs)\n", dirty_it->second.journal_sector,
"remove usage of journal offset %08lx by %lx:%lx v%lu (%lu refs)\n", dirty_it->second.journal_sector,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
);
#endif
@@ -227,11 +227,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
journal.used_sectors.erase(dirty_it->second.journal_sector);
flusher->mark_trim_possible();
}
if (dsk.clean_entry_bitmap_size > sizeof(void*))
{
free(dirty_it->second.bitmap);
dirty_it->second.bitmap = NULL;
}
free_dirty_dyn_data(dirty_it->second);
if (dirty_it == dirty_start)
{
break;
@@ -240,3 +236,18 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
}
dirty_db.erase(dirty_start, dirty_end);
}
void blockstore_impl_t::free_dirty_dyn_data(dirty_entry & e)
{
if (e.dyn_data)
{
if (alloc_dyn_data &&
--*((int*)e.dyn_data) == 0) // refcount
{
// dyn_data contains the bitmap and checksums
// free it if it doesn't refer to the in-memory journal
free(e.dyn_data);
}
e.dyn_data = NULL;
}
}

View File

@@ -27,8 +27,6 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
unsynced_big_write_count -= unsynced_big_writes.size();
PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
PRIV(op)->sync_small_checked = 0;
PRIV(op)->sync_big_checked = 0;
unsynced_big_writes.clear();
unsynced_small_writes.clear();
if (PRIV(op)->sync_big_writes.size() > 0)
@@ -78,7 +76,23 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
// 2nd step: Data device is synced, prepare & write journal entries
// Check space in the journal and journal memory buffers
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
if (dsk.csum_block_size)
{
// More complex check because all journal entries have different lengths
int left = PRIV(op)->sync_big_writes.size();
for (auto & sbw: PRIV(op)->sync_big_writes)
{
left--;
auto & dirty_entry = dirty_db.at(sbw);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
left == 0 ? JOURNAL_STABILIZE_RESERVATION : 0))
{
return 0;
}
}
}
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
{
return 0;
@@ -90,16 +104,17 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
int s = 0;
while (it != PRIV(op)->sync_big_writes.end())
{
if (!journal.entry_fits(sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size) &&
auto & dirty_entry = dirty_db.at(*it);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!journal.entry_fits(sizeof(journal_entry_big_write) + dyn_size) &&
journal.sector_info[journal.cur_sector].dirty)
{
prepare_journal_sector_write(journal.cur_sector, op);
s++;
}
auto & dirty_entry = dirty_db.at(*it);
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
sizeof(journal_entry_big_write) + dyn_size
);
dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -115,8 +130,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
je->offset = dirty_entry.offset;
je->len = dirty_entry.len;
je->location = dirty_entry.location;
memcpy((void*)(je+1), (dsk.clean_entry_bitmap_size > sizeof(void*)
? dirty_entry.bitmap : &dirty_entry.bitmap), dsk.clean_entry_bitmap_size);
memcpy((void*)(je+1), (alloc_dyn_data
? (uint8_t*)dirty_entry.dyn_data+sizeof(int) : (uint8_t*)&dirty_entry.dyn_data), dyn_size);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
it++;

Some files were not shown because too many files have changed in this diff Show More