Compare commits

..

73 Commits

Author SHA1 Message Date
06f4e0fcce K/V control prints (for debug only) O:-)
Some checks failed
Test / test_move_reappear (push) Successful in 18s
Test / test_rm (push) Successful in 12s
Test / test_snapshot_chain (push) Successful in 59s
Test / test_snapshot_down (push) Successful in 19s
Test / test_snapshot_ec (push) Failing after 3m5s
Test / test_splitbrain (push) Successful in 12s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_snapshot_down_ec (push) Failing after 3m10s
Test / test_rebalance_verify_ec (push) Failing after 43s
Test / test_rebalance_verify_imm (push) Successful in 3m0s
Test / test_rebalance_verify (push) Successful in 3m29s
Test / test_rebalance_verify_ec_imm (push) Successful in 2m48s
Test / test_write_no_same (push) Successful in 12s
Test / test_interrupted_rebalance_ec (push) Failing after 10m5s
Test / test_write (push) Failing after 3m6s
Test / test_write_xor (push) Failing after 3m5s
Test / test_heal_pg_size_2 (push) Failing after 3m46s
Test / test_heal_csum_32k_dj (push) Successful in 4m48s
Test / test_heal_csum_32k_dmj (push) Failing after 4m50s
Test / test_heal_csum_32k (push) Successful in 5m9s
Test / test_heal_ec (push) Failing after 10m14s
Test / test_heal_csum_4k_dj (push) Successful in 5m28s
Test / test_scrub (push) Successful in 1m11s
Test / test_heal_csum_4k_dmj (push) Failing after 5m36s
Test / test_scrub_zero_osd_2 (push) Successful in 45s
Test / test_scrub_pg_size_3 (push) Successful in 1m2s
Test / test_scrub_xor (push) Failing after 3m8s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Failing after 3m8s
Test / test_scrub_ec (push) Failing after 3m6s
Test / test_heal_csum_4k (push) Failing after 10m19s
2023-12-01 02:33:04 +03:00
f285cfc483 Fix eviction when random_pos selects the end
Some checks failed
Test / test_move_reappear (push) Successful in 18s
Test / test_rm (push) Successful in 13s
Test / test_snapshot_chain (push) Successful in 1m1s
Test / test_snapshot_down (push) Successful in 20s
Test / test_snapshot_ec (push) Failing after 3m5s
Test / test_splitbrain (push) Successful in 12s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_snapshot_down_ec (push) Failing after 3m6s
Test / test_rebalance_verify_ec (push) Failing after 45s
Test / test_rebalance_verify (push) Successful in 2m34s
Test / test_rebalance_verify_imm (push) Successful in 2m5s
Test / test_write (push) Successful in 54s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m7s
Test / test_write_xor (push) Failing after 3m6s
Test / test_interrupted_rebalance_ec (push) Failing after 10m6s
Test / test_heal_pg_size_2 (push) Failing after 10m10s
Test / test_heal_ec (push) Failing after 10m7s
Test / test_heal_csum_32k_dmj (push) Failing after 10m10s
Test / test_heal_csum_32k_dj (push) Failing after 10m15s
Test / test_heal_csum_32k (push) Failing after 3m29s
Test / test_scrub (push) Successful in 1m9s
Test / test_scrub_zero_osd_2 (push) Successful in 1m18s
Test / test_heal_csum_4k_dmj (push) Failing after 5m56s
Test / test_scrub_pg_size_3 (push) Successful in 31s
Test / test_scrub_xor (push) Failing after 3m12s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Failing after 3m6s
Test / test_heal_csum_4k_dj (push) Failing after 10m10s
Test / test_scrub_ec (push) Failing after 3m6s
Test / test_heal_csum_4k (push) Failing after 10m21s
2023-12-01 01:43:03 +03:00
12b50b421d Implement min/max list_count to make listings during performance test reasonable 2023-12-01 01:17:04 +03:00
9f6d09428d Fix and improve parallel allocation
- Do not try to allocate more DB blocks in an inode block until it's "confirmed" and "locked" by the first write
- Do not recheck for new zero DB blocks on first write into an inode block - a CAS failure means someone else is already writing into it
- Throw new allocation blocks away regardless of whether the known_version is 0 on a CAS failure
2023-12-01 01:17:04 +03:00
580025cfc9 Implement key_prefix for K/V stress test 2023-12-01 01:17:04 +03:00
13e2d3ce7c More fixes
- do not overwrite a block with older version if known version is newer
  (read may start before update and end after update)
- invalidated block versions can't be remembered and trusted
- right boundary for split blocks is right_half when diving down, not key_lt
- restart update also when block is "invalidated", not just on version mismatch
- copy callback in listings to avoid closure destruction bugs too
2023-12-01 01:17:04 +03:00
c5b00f897a Add logging and one more assert 2023-12-01 01:17:04 +03:00
e847e26912 Make get_block() wait for updating when unrelated block is found along the path 2023-12-01 01:17:04 +03:00
3393463466 Fix a race condition where changed blocks were parsed over existing cached blocks, resulting in a mix of data 2023-12-01 01:17:04 +03:00
bd96a6194a Simplify code by removing an unneeded "optimisation" 2023-12-01 01:17:04 +03:00
601fe10c28 Add kv_log_level, print warnings on level 1, trace ops on level 10 2023-12-01 01:17:04 +03:00
63dbc9ca85 Fix duplicate keys in listings on parallel updates -- do not rewind key "iterator position" 2023-12-01 01:17:04 +03:00
aa0c363c39 Implement key suffix to avoid collisions of multiple test workers 2023-12-01 01:17:04 +03:00
ce52c5589e Do not complain on empty first block 2023-12-01 01:17:04 +03:00
aee20ab1ee Add JSON output for stress-tester 2023-12-01 01:17:04 +03:00
bb81992fac Print total stats 2023-12-01 01:17:04 +03:00
a28f401aff Do not send more than op_count operations (fix segfault on finish) 2023-12-01 01:17:04 +03:00
4ac7e096fd Add some more resiliency to serialize() 2023-12-01 01:17:04 +03:00
b6171a4599 Invalidate blocks being updated too 2023-12-01 01:17:03 +03:00
28045f230c Change new block allocation method: make each writer choose multiple empty PG blocks and place blocks in them 2023-12-01 01:17:03 +03:00
10e867880f Remove blocks from cache on unsuccessful updates 2023-12-01 01:17:03 +03:00
012462171a Allow tracking multiple updates per block (it should never happen though) 2023-12-01 01:17:03 +03:00
904793cdab Do not call stop_updating after failed write_new_block and after clear_block (both delete the item) 2023-12-01 01:17:03 +03:00
45c01db2de Track versions of parent blocks and recheck if changed during update 2023-12-01 01:17:03 +03:00
8c9206cecd Fix resume_split condition (key_lt can also be "") 2023-12-01 01:17:03 +03:00
e8c46ededa Experiment: transform offsets for better sharding 2023-12-01 01:17:03 +03:00
e9b321a0e0 More post-stress-test fixes
- Prevent _split types of new blocks
- Stop updating new blocks only after the whole update, otherwise pointers
  may become invalid
- Use recheck_none for updates initially
- Use UINT64_MAX as initial block version when postponing ops, otherwise the
  check fails when the block is initially empty. This for example leads to
  writing both leaf items & block pointers (which is incorrect) into the root
  block when starting stress-test with --parallelism 32
- Fix -EINTR comparison
2023-12-01 01:17:03 +03:00
09a77991ae Print operation statistics 2023-12-01 01:17:03 +03:00
29d8c9b6f3 K/V fixes after stress-test :-)
- track block versions correctly - per inode block (128kb) instead of tree block (4kb)
- prevent multiple parallel CAS writes of the same inode block
- add logging for EILSEQ which means invalid data in the tree
- fix get_block updated flag which was true for blocks already in cache and was leading to infinite loops on "unrelated block" errors
- apply changes to blocks in cache only after successful writes (using "virtual changes")
- do not replace cached block with an older version from disk
- recheck "unrelated blocks" (read/update collisions) until data stops changing
- track tree path correctly - do not treat split block as parent of its right half
- correctly move blocks when finding new empty place on disk
- restart updates from the beginning when one of blocks is changed by a parallel update
- fix delete using SET opcode and setting key to the empty value instead
- prevent changing the same key more than 1 time in parallel
- fix listing verification
- resume continue_updates in update_find (required because it uses continue_update itself)
- add allow_old_cached parameter to get()
2023-12-01 01:17:03 +03:00
20321aaaef Implement K/V DB stress tester 2023-12-01 01:17:03 +03:00
987b005356 Evict blocks based on memory limit & block usage 2023-12-01 01:17:03 +03:00
41754b748b Track blocks per level 2023-12-01 01:17:03 +03:00
31913256f3 Track block level 2023-12-01 01:17:03 +03:00
0ee36baed7 Experimental B-Tree Vitastor embedded K/V database implementation! 2023-12-01 01:17:03 +03:00
19e2d9d6fa Fix crash on unknown long argument to vitastor-disk
Some checks failed
Test / test_move_reappear (push) Successful in 18s
Test / test_rm (push) Successful in 11s
Test / test_snapshot_chain (push) Successful in 59s
Test / test_snapshot_down (push) Successful in 19s
Test / test_snapshot_ec (push) Failing after 3m5s
Test / test_splitbrain (push) Successful in 11s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_rebalance_verify_imm (push) Failing after 22s
Test / test_rebalance_verify_ec (push) Failing after 44s
Test / test_snapshot_down_ec (push) Failing after 3m5s
Test / test_rebalance_verify (push) Successful in 2m53s
Test / test_rebalance_verify_ec_imm (push) Successful in 2m14s
Test / test_write_no_same (push) Successful in 12s
Test / test_write (push) Failing after 3m19s
Test / test_interrupted_rebalance_ec (push) Failing after 10m7s
Test / test_write_xor (push) Failing after 3m8s
Test / test_heal_pg_size_2 (push) Failing after 3m9s
Test / test_heal_csum_32k_dmj (push) Successful in 4m45s
Test / test_heal_csum_32k_dj (push) Successful in 5m1s
Test / test_heal_csum_4k_dmj (push) Successful in 4m12s
Test / test_heal_ec (push) Failing after 10m13s
Test / test_heal_csum_4k_dj (push) Failing after 4m13s
Test / test_scrub_zero_osd_2 (push) Successful in 48s
Test / test_scrub (push) Successful in 51s
Test / test_scrub_pg_size_3 (push) Successful in 55s
Test / test_heal_csum_32k (push) Failing after 10m12s
Test / test_heal_csum_4k (push) Failing after 3m55s
Test / test_scrub_xor (push) Failing after 3m7s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Failing after 3m14s
Test / test_scrub_ec (push) Failing after 3m11s
2023-12-01 00:55:51 +03:00
bfc7e61909 Add more notes + performance comparison about VDUSE 2023-11-25 02:25:56 +03:00
7da4868b37 Fix monitor statistics aggregation in case of empty /osd/stats keys
Some checks failed
Test / test_move_reappear (push) Successful in 18s
Test / test_rm (push) Successful in 13s
Test / test_snapshot_chain (push) Successful in 1m1s
Test / test_snapshot_down (push) Successful in 20s
Test / test_snapshot_ec (push) Failing after 3m6s
Test / test_splitbrain (push) Successful in 12s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_snapshot_down_ec (push) Failing after 3m7s
Test / test_rebalance_verify_ec (push) Failing after 48s
Test / test_rebalance_verify (push) Successful in 3m30s
Test / test_rebalance_verify_imm (push) Successful in 3m3s
Test / test_rebalance_verify_ec_imm (push) Successful in 2m36s
Test / test_write_no_same (push) Successful in 12s
Test / test_interrupted_rebalance_ec (push) Failing after 10m7s
Test / test_write (push) Failing after 3m7s
Test / test_write_xor (push) Failing after 3m6s
Test / test_heal_csum_32k_dmj (push) Failing after 4m5s
Test / test_heal_csum_32k (push) Failing after 3m16s
Test / test_heal_pg_size_2 (push) Failing after 10m7s
Test / test_heal_ec (push) Failing after 10m10s
Test / test_heal_csum_32k_dj (push) Failing after 10m14s
Test / test_scrub (push) Successful in 59s
Test / test_scrub_zero_osd_2 (push) Successful in 54s
Test / test_heal_csum_4k_dj (push) Successful in 4m27s
Test / test_scrub_pg_size_3 (push) Successful in 41s
Test / test_heal_csum_4k (push) Failing after 4m56s
Test / test_scrub_xor (push) Failing after 3m11s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Failing after 3m5s
Test / test_scrub_ec (push) Failing after 3m5s
Test / test_heal_csum_4k_dmj (push) Failing after 10m8s
2023-11-24 01:05:21 +03:00
b5c020ce0b Use io_uring SQ size for ringloop capacity - otherwise get_sqe could return NULL when space_left() was > 0 under load
Some checks failed
Test / test_rm (push) Successful in 11s
Test / test_move_reappear (push) Successful in 17s
Test / test_snapshot_chain (push) Successful in 1m0s
Test / test_snapshot_down (push) Successful in 19s
Test / test_snapshot_ec (push) Failing after 3m5s
Test / test_splitbrain (push) Successful in 12s
Test / test_snapshot_chain_ec (push) Failing after 3m5s
Test / test_snapshot_down_ec (push) Failing after 3m6s
Test / test_rebalance_verify_ec (push) Failing after 42s
Test / test_rebalance_verify_imm (push) Successful in 2m47s
Test / test_rebalance_verify (push) Successful in 3m14s
Test / test_rebalance_verify_ec_imm (push) Successful in 2m38s
Test / test_interrupted_rebalance_ec (push) Failing after 10m5s
Test / test_write_no_same (push) Successful in 11s
Test / test_write (push) Failing after 3m6s
Test / test_write_xor (push) Failing after 3m5s
Test / test_heal_pg_size_2 (push) Failing after 3m45s
Test / test_heal_csum_32k_dj (push) Failing after 4m41s
Test / test_heal_csum_32k (push) Successful in 4m28s
Test / test_heal_ec (push) Failing after 10m12s
Test / test_heal_csum_4k_dmj (push) Failing after 4m27s
Test / test_scrub (push) Successful in 53s
Test / test_heal_csum_32k_dmj (push) Failing after 10m16s
Test / test_scrub_zero_osd_2 (push) Successful in 59s
Test / test_heal_csum_4k_dj (push) Failing after 4m24s
Test / test_scrub_pg_size_3 (push) Successful in 1m7s
Test / test_scrub_xor (push) Failing after 3m14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Failing after 3m6s
Test / test_scrub_ec (push) Failing after 3m7s
Test / test_heal_csum_4k (push) Failing after 10m9s
Raise default io_uring size to 1024 for the same effective capacity as previously
2023-11-20 03:04:06 +03:00
6b33ae973d %d -> %lu 2023-11-20 03:02:26 +03:00
cf36445359 Reserve journal space for stabilize requests dynamically to prevent stalls 2023-11-20 03:01:57 +03:00
3fd873d263 Add -fno-omit-frame-pointer by default 2023-11-20 02:59:54 +03:00
a00e8ae9ed Fix mismatched journal position format in vitastor-disk
Some checks failed
Test / test_interrupted_rebalance_ec (push) Successful in 1m47s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_ec (push) Successful in 35s
Test / test_snapshot_down (push) Successful in 22s
Test / test_move_reappear (push) Failing after 50s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m12s
Test / test_snapshot_chain_ec (push) Successful in 3m0s
Test / test_rebalance_verify_imm (push) Successful in 3m42s
Test / test_write (push) Successful in 35s
Test / test_rebalance_verify (push) Successful in 4m23s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec (push) Successful in 4m45s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m49s
Test / test_write_xor (push) Failing after 3m9s
Test / test_heal_pg_size_2 (push) Successful in 3m20s
Test / test_heal_csum_32k_dmj (push) Successful in 4m41s
Test / test_heal_ec (push) Successful in 6m24s
Test / test_heal_csum_32k_dj (push) Successful in 5m53s
Test / test_heal_csum_32k (push) Successful in 6m10s
Test / test_heal_csum_4k_dmj (push) Successful in 6m21s
Test / test_scrub (push) Failing after 3m13s
Test / test_scrub_zero_osd_2 (push) Successful in 56s
Test / test_scrub_xor (push) Successful in 44s
Test / test_heal_csum_4k_dj (push) Successful in 5m48s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m0s
Test / test_scrub_ec (push) Successful in 52s
Test / test_heal_csum_4k (push) Successful in 5m43s
Test / test_scrub_pg_size_3 (push) Successful in 1m8s
2023-11-19 15:19:54 +03:00
75674545dc Limit the number of printed object versions in slow op dump (otherwise it may overflow the fixed buffer)
Some checks failed
Test / test_interrupted_rebalance_ec (push) Successful in 1m55s
Test / test_snapshot_ec (push) Successful in 35s
Test / test_rm (push) Successful in 17s
Test / test_snapshot_down (push) Successful in 22s
Test / test_move_reappear (push) Failing after 49s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 21s
Test / test_snapshot_chain (push) Successful in 2m17s
Test / test_snapshot_chain_ec (push) Successful in 2m58s
Test / test_rebalance_verify_imm (push) Successful in 3m12s
Test / test_write (push) Successful in 35s
Test / test_rebalance_verify (push) Successful in 3m52s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 57s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m18s
Test / test_rebalance_verify_ec (push) Successful in 5m21s
Test / test_heal_pg_size_2 (push) Successful in 4m0s
Test / test_heal_ec (push) Successful in 4m48s
Test / test_heal_csum_32k_dmj (push) Successful in 5m8s
Test / test_heal_csum_32k_dj (push) Successful in 5m29s
Test / test_heal_csum_32k (push) Successful in 6m21s
Test / test_scrub (push) Successful in 1m15s
Test / test_heal_csum_4k_dmj (push) Successful in 6m42s
Test / test_scrub_zero_osd_2 (push) Successful in 1m1s
Test / test_scrub_xor (push) Successful in 49s
Test / test_heal_csum_4k_dj (push) Successful in 6m22s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m6s
Test / test_scrub_pg_size_3 (push) Successful in 1m37s
Test / test_scrub_ec (push) Successful in 35s
Test / test_heal_csum_4k (push) Successful in 6m16s
2023-11-13 01:10:28 +03:00
225eb2fe3d Support RDMA without ODP by stupidly copying memory. Disable ODP by default
ODP is slower than regular RDMA even with memory copy overhead

Example numbers:
- 3950000 random read iops without ODP vs 240000 iops with ODP
- 1447000 random write iops without ODP vs 101000 iops with ODP

Reference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
2023-11-12 15:03:47 +03:00
7e82573ed0 Fix RDMA connection leak which was preventing stable functioning of RDMA :)
All checks were successful
Test / test_minsize_1 (push) Successful in 14s
Test / test_snapshot_ec (push) Successful in 38s
Test / test_rm (push) Successful in 16s
Test / test_move_reappear (push) Successful in 19s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m19s
Test / test_snapshot_chain_ec (push) Successful in 3m0s
Test / test_rebalance_verify_imm (push) Successful in 2m45s
Test / test_rebalance_verify (push) Successful in 3m36s
Test / test_write (push) Successful in 37s
Test / test_write_no_same (push) Successful in 15s
Test / test_write_xor (push) Successful in 53s
Test / test_rebalance_verify_ec (push) Successful in 4m54s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m8s
Test / test_heal_pg_size_2 (push) Successful in 4m5s
Test / test_heal_ec (push) Successful in 4m48s
Test / test_heal_csum_32k_dj (push) Successful in 5m25s
Test / test_heal_csum_32k_dmj (push) Successful in 5m29s
Test / test_heal_csum_32k (push) Successful in 6m22s
Test / test_scrub (push) Successful in 1m7s
Test / test_heal_csum_4k_dmj (push) Successful in 6m39s
Test / test_scrub_zero_osd_2 (push) Successful in 1m9s
Test / test_scrub_xor (push) Successful in 47s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m3s
Test / test_heal_csum_4k_dj (push) Successful in 6m12s
Test / test_heal_csum_4k (push) Successful in 6m11s
Test / test_scrub_pg_size_3 (push) Successful in 1m29s
Test / test_scrub_ec (push) Successful in 33s
2023-11-11 23:40:47 +03:00
12a6bed2d5 Restore the new json11 commit ("allow trailing comma") that was accidentally rolled back
Some checks failed
Test / test_snapshot_ec (push) Successful in 28s
Test / test_move_reappear (push) Successful in 19s
Test / test_interrupted_rebalance_ec (push) Successful in 1m51s
Test / test_rm (push) Successful in 18s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m22s
Test / test_rebalance_verify_imm (push) Successful in 3m4s
Test / test_rebalance_verify (push) Successful in 3m48s
Test / test_write (push) Successful in 39s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m14s
Test / test_rebalance_verify_ec (push) Successful in 4m2s
Test / test_heal_pg_size_2 (push) Successful in 3m44s
Test / test_heal_csum_32k_dmj (push) Successful in 4m43s
Test / test_heal_ec (push) Successful in 6m47s
Test / test_heal_csum_32k_dj (push) Successful in 5m57s
Test / test_heal_csum_32k (push) Successful in 6m11s
Test / test_scrub (push) Successful in 1m10s
Test / test_scrub_zero_osd_2 (push) Successful in 1m17s
Test / test_heal_csum_4k_dmj (push) Successful in 6m19s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m10s
Test / test_scrub_pg_size_3 (push) Successful in 1m47s
Test / test_heal_csum_4k_dj (push) Successful in 6m3s
Test / test_heal_csum_4k (push) Successful in 6m3s
Test / test_scrub_ec (push) Successful in 28s
Test / test_scrub_xor (push) Successful in 23s
Test / test_snapshot_chain_ec (push) Successful in 2m25s
Test / test_write_xor (push) Failing after 3m6s
2023-11-07 15:49:23 +03:00
5524dbdab7 Release 1.2.0
Some checks failed
Test / test_snapshot_ec (push) Successful in 25s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 1m18s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 18s
Test / test_snapshot_chain (push) Successful in 2m13s
Test / test_snapshot_chain_ec (push) Successful in 2m57s
Test / test_rebalance_verify_imm (push) Successful in 2m51s
Test / test_write (push) Successful in 38s
Test / test_rebalance_verify (push) Successful in 3m39s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec (push) Successful in 3m56s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m6s
Test / test_heal_pg_size_2 (push) Successful in 3m43s
Test / test_heal_csum_32k_dmj (push) Successful in 4m35s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 5m50s
Test / test_heal_csum_4k_dmj (push) Successful in 5m44s
Test / test_scrub_zero_osd_2 (push) Successful in 57s
Test / test_scrub (push) Successful in 1m0s
Test / test_scrub_xor (push) Successful in 1m5s
Test / test_heal_csum_4k_dj (push) Successful in 5m9s
Test / test_scrub_pg_size_3 (push) Successful in 1m38s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 54s
Test / test_scrub_ec (push) Successful in 52s
Test / test_heal_csum_4k (push) Successful in 5m8s
Test / test_heal_ec (push) Successful in 3m17s
Test / test_write_xor (push) Successful in 35s
Test / test_move_reappear (push) Failing after 48s
New features:

- Implement CSI volume expansion
- Implement CSI volume snapshots
- CSI driver now requires Kubernetes >= 1.20

Bug fixes:

- Important bug fix for EC: fix EC n+k, k>=2 read recovery in ISA-L version returning
  incorrect data when reading at least the second chunk out of multiple missing chunks
  without reading the first one. All users of EC n+k, k>=2 should upgrade as soon as
  possible, and upgrade should be conducted with downtime: first stop all clients
  (VMs/containers), then all OSDs, then upgrade and restart everything.
- Fix unstable statistics aggregation in monitor (affecting vitastor-cli status and df)
- Make udev not wait for OSDs to start during boot
- Do not report negative numbers of offline PGs in vitastor-cli status when changing PG count
- Report both old and new PG counts in vitastor-cli df when changing it
- Fix OSDs sometimes not starting with "The code only supports journal versions 1 and 2,
  but it is 2 on disk" error after upgrading from pre-1.0 versions and letting OSDs run
  for some time
- Fix monitors sometimes returning old PG count back after OSD configuration changes
- Make monitor PG changes more stable and timeout errors less probable
2023-11-05 01:48:57 +03:00
cd3dec06ac Remove spaces from old->new PG count in df
All checks were successful
Test / test_interrupted_rebalance_ec (push) Successful in 1m50s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 14s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_snapshot_down (push) Successful in 29s
Test / test_snapshot_down_ec (push) Successful in 32s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m34s
Test / test_rebalance_verify_imm (push) Successful in 3m9s
Test / test_rebalance_verify (push) Successful in 4m9s
Test / test_write (push) Successful in 40s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m15s
Test / test_rebalance_verify_ec (push) Successful in 4m29s
Test / test_heal_pg_size_2 (push) Successful in 3m21s
Test / test_heal_csum_32k_dmj (push) Successful in 5m38s
Test / test_heal_ec (push) Successful in 6m14s
Test / test_heal_csum_32k_dj (push) Successful in 6m22s
Test / test_heal_csum_32k (push) Successful in 6m40s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 1m12s
Test / test_scrub_xor (push) Successful in 1m16s
Test / test_heal_csum_4k_dj (push) Successful in 6m4s
Test / test_heal_csum_4k_dmj (push) Successful in 6m34s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m4s
Test / test_heal_csum_4k (push) Successful in 5m37s
Test / test_scrub_ec (push) Successful in 43s
Test / test_scrub_pg_size_3 (push) Successful in 1m14s
Test / test_write_xor (push) Successful in 1m11s
Test / test_snapshot_chain_ec (push) Successful in 2m43s
2023-11-05 01:45:45 +03:00
371d79e059 Document vitastor-csi features 2023-11-05 01:05:26 +03:00
0e888e6c60 Prevent spamming etcd with last_clean_pgs update requests
All checks were successful
Test / test_snapshot_ec (push) Successful in 34s
Test / test_interrupted_rebalance_ec (push) Successful in 1m49s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 14s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m23s
Test / test_snapshot_chain_ec (push) Successful in 3m4s
Test / test_rebalance_verify_imm (push) Successful in 2m54s
Test / test_rebalance_verify (push) Successful in 3m48s
Test / test_write (push) Successful in 35s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 55s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m15s
Test / test_rebalance_verify_ec (push) Successful in 5m3s
Test / test_heal_pg_size_2 (push) Successful in 3m59s
Test / test_heal_ec (push) Successful in 4m56s
Test / test_heal_csum_32k_dmj (push) Successful in 5m48s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 6m35s
Test / test_scrub (push) Successful in 1m14s
Test / test_heal_csum_4k_dmj (push) Successful in 6m54s
Test / test_scrub_zero_osd_2 (push) Successful in 1m2s
Test / test_scrub_xor (push) Successful in 49s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m9s
Test / test_scrub_pg_size_3 (push) Successful in 1m54s
Test / test_heal_csum_4k_dj (push) Successful in 6m17s
Test / test_heal_csum_4k (push) Successful in 6m18s
Test / test_scrub_ec (push) Successful in 37s
2023-11-05 00:12:00 +03:00
408c21d8f0 Scale last_clean_pgs PG count even if current PGs already contain the new number of PGs
Some checks failed
Test / test_interrupted_rebalance_ec (push) Successful in 1m49s
Test / test_snapshot_ec (push) Successful in 31s
Test / test_rm (push) Successful in 13s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 50s
Test / test_snapshot_down_ec (push) Successful in 22s
Test / test_splitbrain (push) Successful in 20s
Test / test_snapshot_chain (push) Successful in 2m15s
Test / test_snapshot_chain_ec (push) Successful in 2m56s
Test / test_rebalance_verify_imm (push) Successful in 2m59s
Test / test_write (push) Successful in 34s
Test / test_rebalance_verify (push) Successful in 3m44s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 52s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m5s
Test / test_rebalance_verify_ec (push) Successful in 5m1s
Test / test_heal_pg_size_2 (push) Successful in 4m1s
Test / test_heal_ec (push) Successful in 5m3s
Test / test_heal_csum_32k_dmj (push) Successful in 5m13s
Test / test_heal_csum_32k_dj (push) Successful in 5m37s
Test / test_heal_csum_32k (push) Successful in 6m19s
Test / test_scrub (push) Successful in 1m11s
Test / test_heal_csum_4k_dmj (push) Successful in 6m13s
Test / test_scrub_zero_osd_2 (push) Successful in 1m5s
Test / test_scrub_xor (push) Successful in 48s
Test / test_heal_csum_4k_dj (push) Successful in 6m11s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m10s
Test / test_scrub_pg_size_3 (push) Successful in 1m29s
Test / test_heal_csum_4k (push) Successful in 6m9s
Test / test_scrub_ec (push) Successful in 35s
2023-11-04 23:45:59 +03:00
43cb9ae212 Prevent multiple parallel recheck_pgs in case of timeouts
Some checks failed
Test / test_snapshot_ec (push) Successful in 37s
Test / test_minsize_1 (push) Successful in 13s
Test / test_rm (push) Successful in 12s
Test / test_move_reappear (push) Successful in 17s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m25s
Test / test_snapshot_chain_ec (push) Failing after 3m7s
Test / test_rebalance_verify_imm (push) Successful in 3m0s
Test / test_rebalance_verify (push) Successful in 3m54s
Test / test_write (push) Successful in 34s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 52s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m6s
Test / test_rebalance_verify_ec (push) Successful in 5m10s
Test / test_heal_pg_size_2 (push) Successful in 4m1s
Test / test_heal_ec (push) Successful in 4m21s
Test / test_heal_csum_32k_dmj (push) Successful in 5m10s
Test / test_heal_csum_32k_dj (push) Successful in 5m51s
Test / test_heal_csum_32k (push) Successful in 6m54s
Test / test_heal_csum_4k_dmj (push) Successful in 6m38s
Test / test_scrub (push) Successful in 1m9s
Test / test_scrub_zero_osd_2 (push) Successful in 1m2s
Test / test_scrub_xor (push) Successful in 43s
Test / test_heal_csum_4k_dj (push) Successful in 6m24s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m16s
Test / test_scrub_pg_size_3 (push) Successful in 1m38s
Test / test_scrub_ec (push) Successful in 37s
Test / test_heal_csum_4k (push) Successful in 6m2s
2023-11-04 20:59:56 +03:00
e15b6e7805 Fix "cannot be narrowed" in clang
Some checks failed
Test / test_snapshot_ec (push) Successful in 44s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m10s
Test / test_rm (push) Successful in 16s
Test / test_move_reappear (push) Failing after 51s
Test / test_snapshot_down (push) Successful in 22s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m32s
Test / test_snapshot_chain_ec (push) Successful in 3m2s
Test / test_rebalance_verify_imm (push) Successful in 3m0s
Test / test_write (push) Successful in 33s
Test / test_rebalance_verify (push) Successful in 3m53s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m23s
Test / test_rebalance_verify_ec (push) Successful in 4m11s
Test / test_write_xor (push) Failing after 3m12s
Test / test_heal_pg_size_2 (push) Successful in 3m47s
Test / test_heal_csum_32k_dmj (push) Successful in 5m17s
Test / test_heal_ec (push) Successful in 5m34s
Test / test_heal_csum_32k_dj (push) Successful in 6m43s
Test / test_heal_csum_32k (push) Successful in 6m30s
Test / test_scrub (push) Successful in 1m18s
Test / test_scrub_zero_osd_2 (push) Successful in 1m11s
Test / test_heal_csum_4k_dmj (push) Successful in 6m24s
Test / test_heal_csum_4k_dj (push) Successful in 6m23s
Test / test_scrub_xor (push) Successful in 54s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m1s
Test / test_scrub_ec (push) Successful in 54s
Test / test_scrub_pg_size_3 (push) Successful in 1m25s
Test / test_heal_csum_4k (push) Successful in 6m10s
2023-11-04 18:14:44 +03:00
31017d8412 Allow to start with V2 journal with header size from V1, as incorrectly updated by previous versions 2023-11-04 18:13:42 +03:00
4819854064 Fix OSDs incorrectly updating journal superblock after upgrade to 1.x from pre-1.x and refusing to start after it
Some checks failed
Test / test_interrupted_rebalance_imm (push) Successful in 3m38s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 47s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 21s
Test / test_snapshot_chain (push) Successful in 2m31s
Test / test_snapshot_chain_ec (push) Successful in 3m7s
Test / test_rebalance_verify_imm (push) Successful in 2m54s
Test / test_write (push) Successful in 32s
Test / test_rebalance_verify (push) Successful in 3m46s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 37s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m56s
Test / test_rebalance_verify_ec (push) Successful in 5m0s
Test / test_heal_pg_size_2 (push) Failing after 4m18s
Test / test_heal_ec (push) Successful in 5m3s
Test / test_heal_csum_32k_dmj (push) Successful in 5m19s
Test / test_heal_csum_32k_dj (push) Successful in 5m44s
Test / test_heal_csum_32k (push) Successful in 6m37s
Test / test_heal_csum_4k_dmj (push) Successful in 6m46s
Test / test_scrub (push) Successful in 1m5s
Test / test_scrub_zero_osd_2 (push) Successful in 48s
Test / test_scrub_xor (push) Successful in 45s
Test / test_heal_csum_4k_dj (push) Successful in 6m37s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m17s
Test / test_scrub_pg_size_3 (push) Successful in 1m40s
Test / test_scrub_ec (push) Successful in 34s
Test / test_heal_csum_4k (push) Successful in 7m13s
2023-11-04 15:02:24 +03:00
1f509cca77 Fix unused capture warnings and void* arithmetic (clang)
Some checks failed
Test / test_minsize_1 (push) Successful in 14s
Test / test_snapshot_ec (push) Successful in 40s
Test / test_rm (push) Successful in 16s
Test / test_move_reappear (push) Successful in 18s
Test / test_snapshot_down (push) Successful in 31s
Test / test_snapshot_down_ec (push) Successful in 33s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m31s
Test / test_snapshot_chain_ec (push) Failing after 3m7s
Test / test_rebalance_verify_imm (push) Successful in 3m6s
Test / test_write (push) Successful in 39s
Test / test_rebalance_verify (push) Successful in 4m7s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m23s
Test / test_rebalance_verify_ec (push) Successful in 4m20s
Test / test_write_xor (push) Failing after 3m9s
Test / test_heal_pg_size_2 (push) Successful in 3m55s
Test / test_heal_csum_32k_dmj (push) Successful in 4m44s
Test / test_heal_csum_32k_dj (push) Successful in 6m8s
Test / test_heal_csum_32k (push) Successful in 5m58s
Test / test_heal_ec (push) Failing after 10m16s
Test / test_heal_csum_4k_dmj (push) Successful in 5m57s
Test / test_scrub (push) Successful in 1m8s
Test / test_scrub_zero_osd_2 (push) Successful in 59s
Test / test_scrub_xor (push) Successful in 47s
Test / test_heal_csum_4k_dj (push) Successful in 5m30s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m7s
Test / test_scrub_pg_size_3 (push) Successful in 1m34s
Test / test_heal_csum_4k (push) Successful in 5m21s
Test / test_scrub_ec (push) Successful in 43s
2023-11-04 14:55:12 +03:00
aa8e8e8271 Add version info to --help output
Some checks failed
Test / test_minsize_1 (push) Successful in 16s
Test / test_snapshot_ec (push) Successful in 39s
Test / test_move_reappear (push) Successful in 21s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 32s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m31s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_rebalance_verify_imm (push) Successful in 2m47s
Test / test_rebalance_verify (push) Successful in 3m38s
Test / test_write (push) Successful in 38s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m19s
Test / test_rebalance_verify_ec (push) Successful in 4m6s
Test / test_write_xor (push) Failing after 3m10s
Test / test_heal_pg_size_2 (push) Successful in 3m54s
Test / test_heal_csum_32k_dmj (push) Successful in 5m6s
Test / test_heal_ec (push) Successful in 5m48s
Test / test_heal_csum_32k_dj (push) Successful in 6m22s
Test / test_heal_csum_32k (push) Successful in 6m30s
Test / test_scrub (push) Successful in 1m18s
Test / test_scrub_zero_osd_2 (push) Successful in 1m12s
Test / test_heal_csum_4k_dmj (push) Successful in 6m38s
Test / test_heal_csum_4k_dj (push) Successful in 6m14s
Test / test_scrub_xor (push) Successful in 1m0s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 56s
Test / test_scrub_ec (push) Successful in 51s
Test / test_scrub_pg_size_3 (push) Successful in 1m20s
Test / test_heal_csum_4k (push) Successful in 5m58s
2023-11-04 13:32:12 +03:00
4d79e531c5 Do not print "-X offline" in status when changing pool PG count, print it in df instead
Some checks failed
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m20s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_ec (push) Successful in 36s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 48s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 21s
Test / test_snapshot_chain (push) Successful in 2m21s
Test / test_snapshot_chain_ec (push) Successful in 3m1s
Test / test_rebalance_verify_imm (push) Successful in 3m6s
Test / test_write (push) Successful in 35s
Test / test_rebalance_verify (push) Successful in 3m46s
Test / test_write_no_same (push) Successful in 13s
Test / test_rebalance_verify_ec (push) Successful in 4m7s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m23s
Test / test_write_xor (push) Failing after 3m8s
Test / test_heal_pg_size_2 (push) Successful in 4m15s
Test / test_heal_csum_32k_dmj (push) Successful in 4m39s
Test / test_heal_ec (push) Successful in 6m35s
Test / test_heal_csum_32k_dj (push) Successful in 6m5s
Test / test_heal_csum_32k (push) Successful in 6m45s
Test / test_scrub (push) Successful in 1m8s
Test / test_heal_csum_4k_dmj (push) Successful in 6m24s
Test / test_scrub_zero_osd_2 (push) Successful in 1m6s
Test / test_scrub_xor (push) Successful in 41s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m18s
Test / test_heal_csum_4k_dj (push) Successful in 6m29s
Test / test_scrub_pg_size_3 (push) Successful in 1m34s
Test / test_heal_csum_4k (push) Successful in 6m7s
Test / test_scrub_ec (push) Successful in 30s
2023-11-04 13:12:13 +03:00
30dff8893f Fix ISA-L version EC recovery with first missing data chunk not being read
Some checks failed
Test / test_snapshot (push) Successful in 44s
Test / test_snapshot_ec (push) Successful in 28s
Test / test_move_reappear (push) Successful in 19s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m20s
Test / test_snapshot_chain_ec (push) Successful in 3m1s
Test / test_rebalance_verify_imm (push) Successful in 2m49s
Test / test_rebalance_verify (push) Successful in 3m37s
Test / test_write (push) Successful in 42s
Test / test_write_no_same (push) Successful in 14s
Test / test_write_xor (push) Successful in 54s
Test / test_rebalance_verify_ec (push) Successful in 4m55s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m13s
Test / test_heal_pg_size_2 (push) Successful in 4m4s
Test / test_heal_ec (push) Successful in 5m2s
Test / test_heal_csum_32k_dmj (push) Failing after 5m54s
Test / test_heal_csum_32k_dj (push) Successful in 6m6s
Test / test_heal_csum_32k (push) Successful in 6m59s
Test / test_scrub (push) Successful in 1m16s
Test / test_heal_csum_4k_dmj (push) Successful in 6m56s
Test / test_scrub_xor (push) Successful in 51s
Test / test_scrub_zero_osd_2 (push) Successful in 1m1s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m25s
Test / test_heal_csum_4k (push) Successful in 6m9s
Test / test_heal_csum_4k_dj (push) Successful in 6m33s
Test / test_scrub_pg_size_3 (push) Successful in 1m37s
Test / test_scrub_ec (push) Successful in 26s
(Yes, all EC n + k with k >= 2 users should upgrade as soon as possible)
2023-11-04 01:34:18 +03:00
becf14a705 Add a test for EC with multiple missing data chunks, but without recovery of first of them 2023-11-04 01:34:18 +03:00
64388788c1 Implement CSI volume expansion
Some checks failed
Test / test_snapshot_ec (push) Successful in 35s
Test / test_minsize_1 (push) Successful in 16s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 22s
Test / test_move_reappear (push) Failing after 50s
Test / test_snapshot_down_ec (push) Successful in 23s
Test / test_splitbrain (push) Successful in 19s
Test / test_snapshot_chain (push) Successful in 2m24s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_rebalance_verify_imm (push) Successful in 3m15s
Test / test_write (push) Successful in 41s
Test / test_rebalance_verify (push) Successful in 4m13s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 50s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m28s
Test / test_rebalance_verify_ec (push) Successful in 5m30s
Test / test_heal_pg_size_2 (push) Successful in 4m5s
Test / test_heal_ec (push) Successful in 4m57s
Test / test_heal_csum_32k_dmj (push) Successful in 6m13s
Test / test_heal_csum_32k_dj (push) Successful in 6m10s
Test / test_heal_csum_32k (push) Successful in 6m40s
Test / test_heal_csum_4k_dmj (push) Successful in 6m24s
Test / test_scrub (push) Successful in 1m7s
Test / test_scrub_xor (push) Successful in 47s
Test / test_scrub_zero_osd_2 (push) Successful in 53s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m25s
Test / test_scrub_pg_size_3 (push) Successful in 1m59s
Test / test_heal_csum_4k_dj (push) Successful in 6m10s
Test / test_heal_csum_4k (push) Successful in 6m2s
Test / test_scrub_ec (push) Successful in 40s
2023-11-01 12:46:20 +03:00
37653abe4b Implement CSI volume snapshots 2023-11-01 12:46:20 +03:00
7c054c6f10 Add "id" to df --json output 2023-11-01 12:46:16 +03:00
bb7709e824 Support listening on non-127.0.0.1 in tests 2023-11-01 12:45:27 +03:00
ebeace5a2d Add cmake and pkg-config to debian build depends 2023-11-01 12:45:27 +03:00
a378789f10 Rollback erroneous go.mod changes in 1.1.0 O:-)
All checks were successful
Test / test_snapshot_ec (push) Successful in 30s
Test / test_move_reappear (push) Successful in 19s
Test / test_interrupted_rebalance_ec (push) Successful in 1m52s
Test / test_rm (push) Successful in 16s
Test / test_snapshot_down (push) Successful in 32s
Test / test_snapshot_down_ec (push) Successful in 31s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m23s
Test / test_snapshot_chain_ec (push) Successful in 3m2s
Test / test_rebalance_verify_imm (push) Successful in 3m1s
Test / test_rebalance_verify (push) Successful in 3m49s
Test / test_write (push) Successful in 41s
Test / test_write_no_same (push) Successful in 13s
Test / test_write_xor (push) Successful in 43s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m15s
Test / test_rebalance_verify_ec (push) Successful in 5m9s
Test / test_heal_pg_size_2 (push) Successful in 4m13s
Test / test_heal_ec (push) Successful in 4m31s
Test / test_heal_csum_32k_dmj (push) Successful in 5m59s
Test / test_heal_csum_32k_dj (push) Successful in 6m14s
Test / test_heal_csum_32k (push) Successful in 6m47s
Test / test_heal_csum_4k_dmj (push) Successful in 6m47s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 1m0s
Test / test_scrub_xor (push) Successful in 52s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m19s
Test / test_scrub_pg_size_3 (push) Successful in 1m52s
Test / test_heal_csum_4k_dj (push) Successful in 6m2s
Test / test_heal_csum_4k (push) Successful in 5m46s
Test / test_scrub_ec (push) Successful in 25s
2023-10-30 18:47:48 +03:00
1fe678e57b Add --no-block to udev rule
Some checks failed
Test / test_minsize_1 (push) Successful in 13s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m0s
Test / test_move_reappear (push) Successful in 21s
Test / test_rm (push) Successful in 15s
Test / test_snapshot_down (push) Successful in 30s
Test / test_snapshot_down_ec (push) Successful in 32s
Test / test_splitbrain (push) Successful in 23s
Test / test_snapshot_chain (push) Successful in 2m29s
Test / test_snapshot_chain_ec (push) Successful in 3m4s
Test / test_rebalance_verify_ec_imm (push) Failing after 18s
Test / test_write (push) Successful in 29s
Test / test_rebalance_verify_imm (push) Successful in 2m53s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify (push) Successful in 3m46s
Test / test_write_xor (push) Failing after 3m6s
Test / test_rebalance_verify_ec (push) Successful in 5m1s
Test / test_heal_pg_size_2 (push) Successful in 4m50s
Test / test_heal_ec (push) Successful in 4m34s
Test / test_heal_csum_32k_dmj (push) Successful in 5m5s
Test / test_heal_csum_32k_dj (push) Successful in 5m57s
Test / test_heal_csum_32k (push) Successful in 6m56s
Test / test_heal_csum_4k_dmj (push) Successful in 7m28s
Test / test_scrub (push) Successful in 1m10s
Test / test_scrub_zero_osd_2 (push) Successful in 57s
Test / test_scrub_xor (push) Successful in 53s
Test / test_heal_csum_4k_dj (push) Successful in 6m34s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m8s
Test / test_scrub_pg_size_3 (push) Successful in 1m37s
Test / test_scrub_ec (push) Successful in 41s
Test / test_heal_csum_4k (push) Successful in 6m6s
2023-10-30 12:18:29 +03:00
2e592a2f22 Fix undefined variable "timeout"
Some checks failed
Test / test_snapshot_ec (push) Successful in 44s
Test / test_rm (push) Successful in 17s
Test / test_interrupted_rebalance_ec_imm (push) Successful in 2m9s
Test / test_snapshot_down (push) Successful in 23s
Test / test_move_reappear (push) Failing after 51s
Test / test_snapshot_down_ec (push) Successful in 25s
Test / test_splitbrain (push) Successful in 24s
Test / test_snapshot_chain (push) Successful in 2m26s
Test / test_snapshot_chain_ec (push) Failing after 3m6s
Test / test_rebalance_verify_imm (push) Successful in 3m2s
Test / test_write (push) Successful in 35s
Test / test_rebalance_verify (push) Successful in 3m56s
Test / test_write_no_same (push) Successful in 12s
Test / test_write_xor (push) Successful in 38s
Test / test_rebalance_verify_ec (push) Successful in 5m2s
Test / test_rebalance_verify_ec_imm (push) Successful in 4m13s
Test / test_heal_pg_size_2 (push) Successful in 4m17s
Test / test_heal_ec (push) Successful in 5m2s
Test / test_heal_csum_32k_dmj (push) Successful in 5m43s
Test / test_heal_csum_32k_dj (push) Successful in 5m36s
Test / test_heal_csum_32k (push) Successful in 7m4s
Test / test_heal_csum_4k_dmj (push) Successful in 6m47s
Test / test_scrub (push) Successful in 1m11s
Test / test_scrub_zero_osd_2 (push) Successful in 59s
Test / test_scrub_pg_size_3 (push) Successful in 1m26s
Test / test_heal_csum_4k_dj (push) Successful in 6m32s
Test / test_heal_csum_4k (push) Successful in 6m31s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 27s
Test / test_scrub_ec (push) Successful in 25s
Test / test_scrub_xor (push) Failing after 3m7s
2023-10-29 01:30:55 +03:00
b92f644e3a Fix statistics aggregation, calculate inode stats by first deriving per-OSD stats, too 2023-10-29 01:30:55 +03:00
890ea3dbc0 Forgot to add new parameter page to README 2023-10-28 13:39:53 +03:00
06630369bf Plans++ 2023-10-28 13:38:04 +03:00
b4740acf62 Fix operations paused for 0.5-1 second when it happens that io_uring submit is not triggered
Some checks failed
Test / test_snapshot (push) Successful in 24s
Test / test_snapshot_ec (push) Successful in 33s
Test / test_minsize_1 (push) Successful in 15s
Test / test_rm (push) Successful in 17s
Test / test_move_reappear (push) Failing after 48s
Test / test_snapshot_down_ec (push) Successful in 24s
Test / test_splitbrain (push) Successful in 22s
Test / test_snapshot_chain (push) Successful in 2m30s
Test / test_snapshot_chain_ec (push) Successful in 3m4s
Test / test_rebalance_verify_imm (push) Successful in 2m41s
Test / test_write (push) Successful in 48s
Test / test_rebalance_verify (push) Successful in 3m42s
Test / test_write_no_same (push) Successful in 12s
Test / test_rebalance_verify_ec_imm (push) Successful in 3m17s
Test / test_rebalance_verify_ec (push) Successful in 4m11s
Test / test_write_xor (push) Failing after 3m8s
Test / test_heal_pg_size_2 (push) Successful in 3m40s
Test / test_heal_csum_32k_dmj (push) Successful in 5m9s
Test / test_heal_ec (push) Successful in 6m31s
Test / test_heal_csum_32k_dj (push) Successful in 6m30s
Test / test_heal_csum_32k (push) Successful in 6m22s
Test / test_scrub (push) Successful in 1m14s
Test / test_scrub_zero_osd_2 (push) Successful in 1m20s
Test / test_heal_csum_4k_dmj (push) Successful in 6m23s
Test / test_scrub_xor (push) Successful in 1m4s
Test / test_heal_csum_4k_dj (push) Successful in 6m2s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 59s
Test / test_scrub_ec (push) Successful in 50s
Test / test_scrub_pg_size_3 (push) Successful in 1m35s
Test / test_heal_csum_4k (push) Successful in 6m1s
2023-10-28 13:18:21 +03:00
eae81bbda6 Fix typo 2023-10-28 01:09:20 +03:00
83 changed files with 4314 additions and 326 deletions

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VERSION "1.1.0")
set(VERSION "1.2.0")
add_subdirectory(src)

View File

@@ -50,6 +50,7 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
- Параметры
- [Общие](docs/config/common.ru.md)
- [Сетевые](docs/config/network.ru.md)
- [Клиентский код](docs/config/client.ru.md)
- [Глобальные дисковые параметры](docs/config/layout-cluster.ru.md)
- [Дисковые параметры OSD](docs/config/layout-osd.ru.md)
- [Прочие параметры OSD](docs/config/osd.ru.md)

View File

@@ -50,6 +50,7 @@ Read more details below in the documentation.
- Parameter Reference
- [Common](docs/config/common.en.md)
- [Network](docs/config/network.en.md)
- [Client](docs/config/client.en.md)
- [Global Disk Layout](docs/config/layout-cluster.en.md)
- [OSD Disk Layout](docs/config/layout-osd.en.md)
- [OSD Runtime Parameters](docs/config/osd.en.md)

View File

@@ -1,4 +1,4 @@
VERSION ?= v1.1.0
VERSION ?= v1.2.0
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v1.1.0
image: vitalif/vitastor-csi:v1.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -35,10 +35,13 @@ rules:
verbs: ["get", "list", "watch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshots"]
verbs: ["get", "list"]
verbs: ["get", "list", "patch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshots/status"]
verbs: ["get", "list", "patch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotcontents"]
verbs: ["create", "get", "list", "watch", "update", "delete"]
verbs: ["create", "get", "list", "watch", "update", "delete", "patch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotclasses"]
verbs: ["get", "list", "watch"]
@@ -53,7 +56,7 @@ rules:
verbs: ["get", "list", "watch"]
- apiGroups: ["snapshot.storage.k8s.io"]
resources: ["volumesnapshotcontents/status"]
verbs: ["update"]
verbs: ["update", "patch"]
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get"]

View File

@@ -23,6 +23,11 @@ metadata:
name: csi-vitastor-provisioner
spec:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
maxSurge: 0
selector:
matchLabels:
app: csi-vitastor-provisioner
@@ -46,7 +51,7 @@ spec:
priorityClassName: system-cluster-critical
containers:
- name: csi-provisioner
image: k8s.gcr.io/sig-storage/csi-provisioner:v2.2.0
image: k8s.gcr.io/sig-storage/csi-provisioner:v3.0.0
args:
- "--csi-address=$(ADDRESS)"
- "--v=5"
@@ -116,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v1.1.0
image: vitalif/vitastor-csi:v1.2.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -17,3 +17,4 @@ parameters:
# multiple etcdUrls may be specified, delimited by comma
#etcdUrl: "http://192.168.7.2:2379"
#etcdPrefix: "/vitastor"
allowVolumeExpansion: true

View File

@@ -0,0 +1,7 @@
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshotClass
metadata:
name: vitastor-snapclass
driver: csi.vitastor.io
deletionPolicy: Delete
parameters:

View File

@@ -0,0 +1,16 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: test-vitastor-clone
spec:
storageClassName: vitastor
dataSource:
name: snap1
kind: VolumeSnapshot
apiGroup: snapshot.storage.k8s.io
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi

View File

@@ -0,0 +1,8 @@
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
name: snap1
spec:
volumeSnapshotClassName: vitastor-snapclass
source:
persistentVolumeClaimName: test-vitastor-pvc

View File

@@ -9,7 +9,8 @@ require (
golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/grpc v1.33.1
k8s.io/klog v1.1.0
google.golang.org/protobuf v1.24.0
k8s.io/klog v1.0.0
k8s.io/utils v0.0.0-20210305010621-2afb4311ab10
)

View File

@@ -6,9 +6,9 @@ cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTj
cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0=
cloud.google.com/go v0.51.0/go.mod h1:hWtGJ6gnXH+KgDv+V0zFGDvpi07n3z8ZNj3T1RW0Gcw=
cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o=
cloud.google.com/go/datastore v1.1.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE=
cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I=
cloud.google.com/go/storage v1.1.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw=
cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78/go.mod h1:LmzpDX56iTiv29bbRTIsUNlaFfuhWRQBWjQdVyAevI8=
github.com/Azure/go-autorest/autorest v0.9.0/go.mod h1:xyHB1BMZT0cuDHU7I0+g046+BFDTQ8rEZB0s4Yfa6bI=
@@ -25,14 +25,14 @@ github.com/Azure/go-autorest/tracing v0.5.0/go.mod h1:r/s2XiOKccPW3HrqB+W0TQzfbt
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
github.com/PuerkitoBio/purell v1.1.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.1.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/blang/semver v3.5.0+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
@@ -92,13 +92,13 @@ github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QD
github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0=
github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
github.com/google/btree v1.1.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
@@ -112,7 +112,7 @@ github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3i
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hpcloud/tail v1.1.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/imdario/mergo v0.3.5/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
@@ -121,7 +121,7 @@ github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1
github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk=
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
github.com/kisielk/gotool v1.1.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc=
@@ -153,10 +153,10 @@ github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.1.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.1.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v1.1.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
@@ -326,13 +326,13 @@ google.golang.org/protobuf v1.24.0 h1:UhZDfRO8JRQru4/+LlLE0BRKGF8L+PICnvYZmx/fEG
google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4=
gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.1.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.1.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.1.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/tomb.v1 v1.1.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
@@ -351,8 +351,8 @@ k8s.io/apimachinery v0.19.0/go.mod h1:DnPGDnARWFvYa3pMHgSxtbZb7gpzzAZ1pTfaUNDVlm
k8s.io/client-go v0.19.0/go.mod h1:H9E/VT95blcFQnlyShFgnFT9ZnJOAceiUHM3MlRC+mU=
k8s.io/component-base v0.19.0/go.mod h1:dKsY8BxkA+9dZIAh2aWJLL/UdASFDNtGYTCItL4LM7Y=
k8s.io/gengo v0.0.0-20200413195148-3a45101e95ac/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0=
k8s.io/klog v1.1.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8=
k8s.io/klog v1.1.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8=
k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
k8s.io/klog/v2 v2.0.0/go.mod h1:PBfzABfn139FHAV07az/IF9Wp1bkk3vpT2XSJ76fSDE=
k8s.io/klog/v2 v2.2.0 h1:XRvcwJozkgZ1UQJmfMGpvRthQHOvihEhYtDfAaxMz/A=
k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y=

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "1.1.0"
vitastorCSIDriverVersion = "1.2.0"
)
// Config struct fills the parameters of request or user input

View File

@@ -20,6 +20,7 @@ import (
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/timestamppb"
"github.com/container-storage-interface/spec/lib/go/csi"
)
@@ -45,6 +46,7 @@ type InodeConfig struct
ParentPool uint64 `json:"parent_pool,omitempty"`
ParentId uint64 `json:"parent_id,omitempty"`
Readonly bool `json:"readonly,omitempty"`
CreateTs uint64 `json:"create_ts,omitempty"`
}
type ControllerServer struct
@@ -178,27 +180,43 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
return nil, status.Error(codes.InvalidArgument, "no etcdUrl in storage class configuration and no etcd_address in vitastor.conf")
}
args := []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) }
// Support creation from snapshot
var src *csi.VolumeContentSource
if (req.VolumeContentSource.GetSnapshot() != nil)
{
snapId := req.VolumeContentSource.GetSnapshot().GetSnapshotId()
if (snapId != "")
{
snapVars := make(map[string]string)
err := json.Unmarshal([]byte(snapId), &snapVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
args = append(args, "--parent", snapVars["name"]+"@"+snapVars["snapshot"])
src = &csi.VolumeContentSource{
Type: &csi.VolumeContentSource_Snapshot{
Snapshot: &csi.VolumeContentSource_SnapshotSource{
SnapshotId: snapId,
},
},
}
}
}
// Create image using vitastor-cli
_, err := invokeCLI(ctxVars, []string{ "create", volName, "-s", fmt.Sprintf("%v", volSize), "--pool", fmt.Sprintf("%v", poolId) })
_, err := invokeCLI(ctxVars, args)
if (err != nil)
{
if (strings.Index(err.Error(), "already exists") > 0)
{
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", volName })
inodeCfg, err := invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
var inodeCfg []InodeConfig
err = json.Unmarshal(stat, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
}
if (len(inodeCfg) == 0)
{
return nil, status.Error(codes.Internal, "vitastor-cli create said that image already exists, but ls can't find it")
}
if (inodeCfg[0].Size < uint64(volSize))
{
return nil, status.Error(codes.Internal, "image "+volName+" is already created, but size is less than expected")
@@ -217,6 +235,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol
// Ugly, but VolumeContext isn't passed to DeleteVolume :-(
VolumeId: string(volumeIdJson),
CapacityBytes: volSize,
ContentSource: src,
},
}, nil
}
@@ -230,15 +249,15 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := ctxVars["name"]
volName := volVars["name"]
ctxVars, _, _ = GetConnectionParams(ctxVars)
ctxVars, _, _ := GetConnectionParams(volVars)
_, err = invokeCLI(ctxVars, []string{ "rm", volName })
if (err != nil)
@@ -344,6 +363,8 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
csi.ControllerServiceCapability_RPC_LIST_VOLUMES,
csi.ControllerServiceCapability_RPC_EXPAND_VOLUME,
csi.ControllerServiceCapability_RPC_CREATE_DELETE_SNAPSHOT,
csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
// TODO: csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
} {
controllerServerCapabilities = append(controllerServerCapabilities, functionControllerServerCapabilities(capability))
}
@@ -353,28 +374,214 @@ func (cs *ControllerServer) ControllerGetCapabilities(ctx context.Context, req *
}, nil
}
func invokeList(ctxVars map[string]string, pattern string, expectExist bool) ([]InodeConfig, error)
{
stat, err := invokeCLI(ctxVars, []string{ "ls", "--json", pattern })
if (err != nil)
{
return nil, err
}
var inodeCfg []InodeConfig
err = json.Unmarshal(stat, &inodeCfg)
if (err != nil)
{
return nil, status.Error(codes.Internal, "Invalid JSON in vitastor-cli ls: "+err.Error())
}
if (expectExist && len(inodeCfg) == 0)
{
return nil, status.Error(codes.Internal, "Can't find expected image "+pattern+" via vitastor-cli ls")
}
return inodeCfg, nil
}
// CreateSnapshot creates a snapshot of an existing volume via vitastor-cli.
//
// The source volume ID is expected to be a JSON object (string map) that
// contains at least a "name" key. The returned snapshot ID is the same map
// with an extra "snapshot" key, serialized back to JSON.
//
// Returns codes.InvalidArgument when the request is nil or lacks the source
// volume ID / snapshot name, codes.Internal when the volume ID is not valid
// JSON, and otherwise propagates errors from invokeCLI / invokeList.
func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshotRequest) (*csi.CreateSnapshotResponse, error)
{
// NOTE(review): this unconditional return looks like the old "unimplemented"
// stub line carried over from the diff view; if it is live code, everything
// below it is unreachable - confirm against the actual source file.
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller create snapshot request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
}
if (req.SourceVolumeId == "" || req.Name == "")
{
return nil, status.Error(codes.InvalidArgument, "source volume ID and snapshot name are required fields")
}
// snapshot name
snapName := req.Name
// req.SourceVolumeId is an ugly json string in our case :)
// NOTE(review): unlike DeleteSnapshot/ListSnapshots, the decoded map is passed
// to invokeCLI directly, without going through GetConnectionParams - confirm
// that this is intended.
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SourceVolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := ctxVars["name"]
// Create the snapshot using vitastor-cli. An "already exists" error is
// tolerated so that a repeated request falls through to the check below.
// NOTE(review): `<= 0` also treats a message that *starts* with
// "already exists" (index 0) as a real failure - verify that is acceptable.
_, err = invokeCLI(ctxVars, []string{ "create", "--snapshot", snapName, volName })
if (err != nil && strings.Index(err.Error(), "already exists") <= 0)
{
return nil, err
}
// Check created snapshot: list "<volume>@<snapshot>" and require at least one entry
inodeCfg, err := invokeList(ctxVars, volName+"@"+snapName, true)
if (err != nil)
{
return nil, err
}
// Use ugly JSON snapshot ID again, DeleteSnapshot doesn't have context :-(
ctxVars["snapshot"] = snapName
snapIdJson, _ := json.Marshal(ctxVars)
// Size and creation time are taken from the first listed entry
return &csi.CreateSnapshotResponse{
Snapshot: &csi.Snapshot{
SizeBytes: int64(inodeCfg[0].Size),
SnapshotId: string(snapIdJson),
SourceVolumeId: req.SourceVolumeId,
CreationTime: &timestamppb.Timestamp{ Seconds: int64(inodeCfg[0].CreateTs) },
ReadyToUse: true,
},
}, nil
}
// DeleteSnapshot deletes the snapshot identified by req.SnapshotId.
//
// The snapshot ID is expected to be a JSON object (string map) holding the
// "name" of the volume and the "snapshot" name; the image removed is
// "<name>@<snapshot>".
//
// Returns codes.InvalidArgument when the request is nil or has no snapshot
// ID, codes.Internal when the ID is not valid JSON, and otherwise propagates
// the error from invokeCLI.
func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteSnapshotRequest) (*csi.DeleteSnapshotResponse, error)
{
// NOTE(review): this unconditional return looks like the old "unimplemented"
// stub line carried over from the diff view; if it is live code, everything
// below it is unreachable - confirm against the actual source file.
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller delete snapshot request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Errorf(codes.InvalidArgument, "request cannot be empty")
}
if (req.SnapshotId == "")
{
return nil, status.Error(codes.InvalidArgument, "snapshot ID is a required field")
}
// Decode the JSON snapshot ID into a plain string map
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SnapshotId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "snapshot ID not in JSON format")
}
volName := volVars["name"]
snapName := volVars["snapshot"]
// Only the first result of GetConnectionParams is used; the other two are discarded
ctxVars, _, _ := GetConnectionParams(volVars)
// Remove the snapshot image "<volume>@<snapshot>" using vitastor-cli
_, err = invokeCLI(ctxVars, []string{ "rm", volName+"@"+snapName })
if (err != nil)
{
return nil, err
}
return &csi.DeleteSnapshotResponse{}, nil
}
// ListSnapshots lists the snapshots of the volume given in req.SourceVolumeId.
//
// The volume ID is expected to be a JSON object (string map) with a "name"
// key; snapshots are found by listing "<name>@*". Paging uses the snapshot
// name as the token: entries whose name compares less than req.StartingToken
// are skipped, at most req.MaxEntries entries are returned (0 means no
// limit), and NextToken is set to the name of the first entry that did not fit.
//
// Returns codes.InvalidArgument for a nil request, codes.Internal when the
// volume ID is not valid JSON, and otherwise propagates the invokeList error.
func (cs *ControllerServer) ListSnapshots(ctx context.Context, req *csi.ListSnapshotsRequest) (*csi.ListSnapshotsResponse, error)
{
// NOTE(review): this unconditional return looks like the old "unimplemented"
// stub line carried over from the diff view; if it is live code, everything
// below it is unreachable - confirm against the actual source file.
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller list snapshots request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
// NOTE(review): an empty SourceVolumeId fails JSON decoding and is reported
// as codes.Internal; req.SnapshotId is not consulted here - confirm that
// unfiltered / by-snapshot-ID listing is intentionally unsupported.
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.SourceVolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
// Only the first result of GetConnectionParams is used; the other two are discarded
ctxVars, _, _ := GetConnectionParams(volVars)
// expectExist=false: a volume without snapshots yields an empty response, not an error
inodeCfg, err := invokeList(ctxVars, volName+"@*", false)
if (err != nil)
{
return nil, err
}
resp := &csi.ListSnapshotsResponse{}
for _, ino := range inodeCfg
{
// Drop the "<volName>@" prefix to get the bare snapshot name
// (assumes every listed name starts with that prefix - TODO confirm)
snapName := ino.Name[len(volName)+1:]
if (len(req.StartingToken) > 0 && snapName < req.StartingToken)
{
// Entry is before the requested starting token: skip it
}
else if (req.MaxEntries == 0 || len(resp.Entries) < int(req.MaxEntries))
{
// Snapshot ID is the volume's variable map plus the "snapshot" key, as JSON
volVars["snapshot"] = snapName
snapIdJson, _ := json.Marshal(volVars)
resp.Entries = append(resp.Entries, &csi.ListSnapshotsResponse_Entry{
Snapshot: &csi.Snapshot{
SizeBytes: int64(ino.Size),
SnapshotId: string(snapIdJson),
SourceVolumeId: req.SourceVolumeId,
CreationTime: &timestamppb.Timestamp{ Seconds: int64(ino.CreateTs) },
ReadyToUse: true,
},
})
}
else
{
// Page is full: remember where the next page starts and stop.
// NOTE(review): token-based paging presumably relies on the listing
// being sorted by name - verify against vitastor-cli ls output.
resp.NextToken = snapName
break
}
}
return resp, nil
}
// ControllerExpandVolume increases the size of a volume.
//
// The volume ID is expected to be a JSON object (string map) with a "name"
// key. If the current image size is below the requested RequiredBytes, the
// image is resized to RequiredBytes rounded up to a multiple of 4096 and then
// listed again; the response reports the size from the latest listing.
//
// Returns codes.InvalidArgument when the request is nil or lacks VolumeId /
// CapacityRange / RequiredBytes, codes.Internal when the volume ID is not
// valid JSON, and otherwise propagates errors from invokeList / invokeCLI.
func (cs *ControllerServer) ControllerExpandVolume(ctx context.Context, req *csi.ControllerExpandVolumeRequest) (*csi.ControllerExpandVolumeResponse, error)
{
// NOTE(review): this unconditional return looks like the old "unimplemented"
// stub line carried over from the diff view; if it is live code, everything
// below it is unreachable - confirm against the actual source file.
return nil, status.Error(codes.Unimplemented, "")
klog.Infof("received controller expand volume request %+v", protosanitizer.StripSecrets(req))
if (req == nil)
{
return nil, status.Error(codes.InvalidArgument, "request cannot be empty")
}
if (req.VolumeId == "" || req.CapacityRange == nil || req.CapacityRange.RequiredBytes == 0)
{
return nil, status.Error(codes.InvalidArgument, "VolumeId, CapacityRange and RequiredBytes are required fields")
}
// Decode the JSON volume ID into a plain string map
volVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &volVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
}
volName := volVars["name"]
// Only the first result of GetConnectionParams is used; the other two are discarded
ctxVars, _, _ := GetConnectionParams(volVars)
// The image must exist (expectExist=true), so inodeCfg[0] is safe below
inodeCfg, err := invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
// Only grow: nothing is done when the image is already large enough.
// The `> 0` check additionally excludes negative RequiredBytes values.
if (req.CapacityRange.RequiredBytes > 0 && inodeCfg[0].Size < uint64(req.CapacityRange.RequiredBytes))
{
// Round the requested size up to the next multiple of 4096 bytes
sz := ((req.CapacityRange.RequiredBytes+4095)/4096)*4096
_, err := invokeCLI(ctxVars, []string{ "modify", "--inc_size", "1", "--resize", fmt.Sprintf("%d", sz), volName })
if (err != nil)
{
return nil, err
}
// Re-read the image so that the response reports the size after resizing
inodeCfg, err = invokeList(ctxVars, volName, true)
if (err != nil)
{
return nil, err
}
}
return &csi.ControllerExpandVolumeResponse{
CapacityBytes: int64(inodeCfg[0].Size),
NodeExpansionRequired: false,
}, nil
}
// ControllerGetVolume get volume info

View File

@@ -49,6 +49,13 @@ func (is *IdentityServer) GetPluginCapabilities(ctx context.Context, req *csi.Ge
},
},
},
{
Type: &csi.PluginCapability_VolumeExpansion_{
VolumeExpansion: &csi.PluginCapability_VolumeExpansion{
Type: csi.PluginCapability_VolumeExpansion_OFFLINE,
},
},
},
},
}, nil
}

View File

@@ -70,10 +70,10 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
isBlock := req.GetVolumeCapability().GetBlock() != nil
// Check that it's not already mounted
_, error := mount.IsNotMountPoint(ns.mounter, targetPath)
if (error != nil)
_, err := mount.IsNotMountPoint(ns.mounter, targetPath)
if (err != nil)
{
if (os.IsNotExist(error))
if (os.IsNotExist(err))
{
if (isBlock)
{
@@ -102,12 +102,12 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
}
else
{
return nil, status.Error(codes.Internal, error.Error())
return nil, status.Error(codes.Internal, err.Error())
}
}
ctxVars := make(map[string]string)
err := json.Unmarshal([]byte(req.VolumeId), &ctxVars)
err = json.Unmarshal([]byte(req.VolumeId), &ctxVars)
if (err != nil)
{
return nil, status.Error(codes.Internal, "volume ID not in JSON format")
@@ -147,70 +147,74 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
}
devicePath := strings.TrimSpace(stdoutStr)
// Check existing format
diskMounter := &mount.SafeFormatAndMount{Interface: ns.mounter, Exec: utilexec.New()}
existingFormat, err := diskMounter.GetDiskFormat(devicePath)
if (err != nil)
{
klog.Errorf("failed to get disk format for path %s, error: %v", err)
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, err
}
// Format the device (ext4 or xfs)
fsType := req.GetVolumeCapability().GetMount().GetFsType()
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
opt = append(opt, "_netdev")
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
!Contains(opt, "ro"))
{
opt = append(opt, "ro")
}
if (fsType == "xfs")
{
opt = append(opt, "nouuid")
}
readOnly := Contains(opt, "ro")
if (existingFormat == "" && !readOnly)
{
args := []string{}
switch fsType
{
case "ext4":
args = []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
case "xfs":
args = []string{"-K", devicePath}
}
if (len(args) > 0)
{
cmdOut, cmdErr := diskMounter.Exec.Command("mkfs."+fsType, args...).CombinedOutput()
if (cmdErr != nil)
{
klog.Errorf("failed to run mkfs error: %v, output: %v", cmdErr, string(cmdOut))
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, cmdErr.Error())
}
}
}
if (isBlock)
{
opt = append(opt, "bind")
err = diskMounter.Mount(devicePath, targetPath, fsType, opt)
err = diskMounter.Mount(devicePath, targetPath, "", []string{"bind"})
}
else
{
// Check existing format
existingFormat, err := diskMounter.GetDiskFormat(devicePath)
if (err != nil)
{
klog.Errorf("failed to get disk format for path %s, error: %v", err)
goto unmap
}
// Format the device (ext4 or xfs)
fsType := req.GetVolumeCapability().GetMount().GetFsType()
opt := req.GetVolumeCapability().GetMount().GetMountFlags()
opt = append(opt, "_netdev")
if ((req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY ||
req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY) &&
!Contains(opt, "ro"))
{
opt = append(opt, "ro")
}
if (fsType == "xfs")
{
opt = append(opt, "nouuid")
}
readOnly := Contains(opt, "ro")
if (existingFormat == "" && !readOnly)
{
var cmdOut []byte
switch fsType
{
case "ext4":
args := []string{"-m0", "-Enodiscard,lazy_itable_init=1,lazy_journal_init=1", devicePath}
cmdOut, err = diskMounter.Exec.Command("mkfs.ext4", args...).CombinedOutput()
case "xfs":
cmdOut, err = diskMounter.Exec.Command("mkfs.xfs", "-K", devicePath).CombinedOutput()
}
if (err != nil)
{
klog.Errorf("failed to run mkfs error: %v, output: %v", err, string(cmdOut))
goto unmap
}
}
err = diskMounter.FormatAndMount(devicePath, targetPath, fsType, opt)
// Try to run online resize on mount.
// FIXME: Implement online resize. It requires online resize support in vitastor-nbd.
if (err == nil && existingFormat != "" && !readOnly)
{
var cmdOut []byte
switch (fsType)
{
case "ext4":
cmdOut, err = diskMounter.Exec.Command("resize2fs", devicePath).CombinedOutput()
case "xfs":
cmdOut, err = diskMounter.Exec.Command("xfs_growfs", devicePath).CombinedOutput()
}
if (err != nil)
{
klog.Errorf("failed to run resizefs error: %v, output: %v", err, string(cmdOut))
goto unmap
}
}
}
if (err != nil)
{
@@ -218,15 +222,18 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
"failed to mount device path (%s) to path (%s) for volume (%s) error: %s",
devicePath, targetPath, volName, err,
)
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, err.Error())
goto unmap
}
return &csi.NodePublishVolumeResponse{}, nil
unmap:
// unmap NBD device
unmapOut, unmapErr := exec.Command("/usr/bin/vitastor-nbd", "unmap", devicePath).CombinedOutput()
if (unmapErr != nil)
{
klog.Errorf("failed to unmap NBD device %s: %s, error: %v", devicePath, unmapOut, unmapErr)
}
return nil, status.Error(codes.Internal, err.Error())
}
// NodeUnpublishVolume unmounts the volume from the target path

4
debian/changelog vendored
View File

@@ -1,10 +1,10 @@
vitastor (1.1.0-1) unstable; urgency=medium
vitastor (1.2.0-1) unstable; urgency=medium
* Bugfixes
-- Vitaliy Filippov <vitalif@yourcmc.ru> Fri, 03 Jun 2022 02:09:44 +0300
vitastor (1.1.0-1) unstable; urgency=medium
vitastor (1.2.0-1) unstable; urgency=medium
* Implement NFS proxy
* Add documentation

2
debian/control vendored
View File

@@ -2,7 +2,7 @@ Source: vitastor
Section: admin
Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8), linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev, libibverbs-dev, libisal-dev, cmake, pkg-config
Standards-Version: 4.5.0
Homepage: https://vitastor.io/
Rules-Requires-Root: no

View File

@@ -35,8 +35,8 @@ RUN set -e -x; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
cp -r /root/vitastor vitastor-1.1.0; \
cd vitastor-1.1.0; \
cp -r /root/vitastor vitastor-1.2.0; \
cd vitastor-1.2.0; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@@ -49,8 +49,8 @@ RUN set -e -x; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.1.0.orig.tar.xz vitastor-1.1.0; \
cd vitastor-1.1.0; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.2.0.orig.tar.xz vitastor-1.2.0; \
cd vitastor-1.2.0; \
V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \

View File

@@ -67,8 +67,8 @@
явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
буферизуются, это может приводить к потере данных. Поэтому в старых версиях
клиентских драйверов буферизация записи не включается вообще, в новых
версиях QEMU-драйвера включается только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опция `-direct=1`.
версиях QEMU-драйвера включается, только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,

View File

@@ -20,6 +20,7 @@ between clients, OSDs and etcd.
- [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [rdma_odp](#rdma_odp)
- [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
@@ -68,11 +69,14 @@ but they are not connected to the cluster.
- Type: string
RDMA device name to use for Vitastor OSD communications (for example,
"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
Versions up to Vitastor 1.2.0 required ODP which is only present in
Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
Run `ibv_devinfo -v` as root to list available RDMA devices and their
features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -147,6 +151,28 @@ less than `rdma_max_recv` so the receiving side doesn't run out of buffers.
Doesn't affect memory usage - additional memory isn't allocated for send
operations.
## rdma_odp
- Type: boolean
- Default: false
Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
ConnectX-4 and newer adapters. ODP lets the RDMA adapter use memory without
registering it explicitly, which in turn makes it possible to skip memory
copying during sending. One would think this should improve performance, but
**in reality** RDMA performance with ODP is **drastically** worse. For example,
a 3-node cluster with 8 NVMe in each node and a 2*25 GBit/s ConnectX-6 RDMA
network pushes 3950000 read iops without ODP, but only 239000 iops with ODP...
This happens because the Mellanox ODP implementation seems to be based on
message retransmissions when the adapter doesn't know about the buffer yet -
it likely uses standard "RNR retransmissions" (RNR = receiver not ready),
which are generally slow in RDMA/RoCE networks. Here's a presentation about
it from the ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
ODP support is retained in the code just in case a good ODP implementation
appears one day.
## peer_connect_interval
- Type: seconds

View File

@@ -20,6 +20,7 @@
- [rdma_max_msg](#rdma_max_msg)
- [rdma_max_recv](#rdma_max_recv)
- [rdma_max_send](#rdma_max_send)
- [rdma_odp](#rdma_odp)
- [peer_connect_interval](#peer_connect_interval)
- [peer_connect_timeout](#peer_connect_timeout)
- [osd_idle_timeout](#osd_idle_timeout)
@@ -71,12 +72,15 @@ RDMA может быть нужно только если у клиентов е
- Тип: строка
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
картами производства не Mellanox.
Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
список доступных RDMA-устройств, их параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -155,6 +159,29 @@ OSD в любом случае согласовывают реальное зн
Не влияет на потребление памяти - дополнительная память на операции отправки
не выделяется.
## rdma_odp
- Тип: булево (да/нет)
- Значение по умолчанию: false
Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
не регистрировать память для её использования RDMA-картой. Благодаря этому
можно не копировать данные при отправке их в сеть и, казалось бы, это должно
улучшать производительность - но **по факту** получается так, что
производительность только ухудшается, причём сильно. Пример - на 3-узловом
кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
основана на повторной передаче сообщений, когда карте не известен буфер -
вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
Возможность использования ODP сохранена в коде на случай, если вдруг в один
прекрасный день появится хорошая реализация ODP.
## peer_connect_interval
- Тип: секунды

View File

@@ -87,8 +87,8 @@
явно не разрешена клиентом, т.к. если клиент не знает, что запросы записи
буферизуются, это может приводить к потере данных. Поэтому в старых версиях
клиентских драйверов буферизация записи не включается вообще, в новых
версиях QEMU-драйвера включается только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опция `-direct=1`.
версиях QEMU-драйвера включается, только если разрешена опцией диска
`-blockdev cache.direct=false`, а в fio - только если нет опции `-direct=1`.
В NBD и NFS драйверах буферизация записи разрешена по умолчанию.
Можно обойти и это ограничение с помощью параметра `client_writeback_allowed`,

View File

@@ -48,11 +48,14 @@
type: string
info: |
RDMA device name to use for Vitastor OSD communications (for example,
"rocep5s0f0"). Please note that Vitastor RDMA requires Implicit On-Demand
Paging (Implicit ODP) and Scatter/Gather (SG) support from the RDMA device
to work. For example, Mellanox ConnectX-3 and older adapters don't have
Implicit ODP, so they're unsupported by Vitastor. Run `ibv_devinfo -v` as
root to list available RDMA devices and their features.
"rocep5s0f0"). Now Vitastor supports all adapters, even ones without
ODP support, like Mellanox ConnectX-3 and non-Mellanox cards.
Versions up to Vitastor 1.2.0 required ODP which is only present in
Mellanox ConnectX >= 4. See also [rdma_odp](#rdma_odp).
Run `ibv_devinfo -v` as root to list available RDMA devices and their
features.
Remember that you also have to configure your network switches if you use
RoCE/RoCEv2, otherwise you may experience unstable performance. Refer to
@@ -61,12 +64,15 @@
PFC (Priority Flow Control) and ECN (Explicit Congestion Notification).
info_ru: |
Название RDMA-устройства для связи с Vitastor OSD (например, "rocep5s0f0").
Имейте в виду, что поддержка RDMA в Vitastor требует функций устройства
Implicit On-Demand Paging (Implicit ODP) и Scatter/Gather (SG). Например,
адаптеры Mellanox ConnectX-3 и более старые не поддерживают Implicit ODP и
потому не поддерживаются в Vitastor. Запустите `ibv_devinfo -v` от имени
суперпользователя, чтобы посмотреть список доступных RDMA-устройств, их
параметры и возможности.
Сейчас Vitastor поддерживает все модели адаптеров, включая те, у которых
нет поддержки ODP, то есть вы можете использовать RDMA с ConnectX-3 и
картами производства не Mellanox.
Версии Vitastor до 1.2.0 включительно требовали ODP, который есть только
на Mellanox ConnectX 4 и более новых. См. также [rdma_odp](#rdma_odp).
Запустите `ibv_devinfo -v` от имени суперпользователя, чтобы посмотреть
список доступных RDMA-устройств, их параметры и возможности.
Обратите внимание, что если вы используете RoCE/RoCEv2, вам также необходимо
правильно настроить для него коммутаторы, иначе вы можете столкнуться с
@@ -160,6 +166,45 @@
у принимающей стороны в процессе работы не заканчивались буферы на приём.
Не влияет на потребление памяти - дополнительная память на операции отправки
не выделяется.
- name: rdma_odp
type: bool
default: false
online: false
info: |
Use RDMA with On-Demand Paging. ODP is currently only available on Mellanox
ConnectX-4 and newer adapters. ODP lets the RDMA adapter use memory without
registering it explicitly, which in turn makes it possible to skip memory
copying during sending. One would think this should improve performance, but
**in reality** RDMA performance with ODP is **drastically** worse. For example,
a 3-node cluster with 8 NVMe in each node and a 2*25 GBit/s ConnectX-6 RDMA
network pushes 3950000 read iops without ODP, but only 239000 iops with ODP...
This happens because the Mellanox ODP implementation seems to be based on
message retransmissions when the adapter doesn't know about the buffer yet -
it likely uses standard "RNR retransmissions" (RNR = receiver not ready),
which are generally slow in RDMA/RoCE networks. Here's a presentation about
it from the ISPASS-2021 conference: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
ODP support is retained in the code just in case a good ODP implementation
appears one day.
info_ru: |
Использовать RDMA с On-Demand Paging. ODP - функция, доступная пока что
исключительно на адаптерах Mellanox ConnectX-4 и более новых. ODP позволяет
не регистрировать память для её использования RDMA-картой. Благодаря этому
можно не копировать данные при отправке их в сеть и, казалось бы, это должно
улучшать производительность - но **по факту** получается так, что
производительность только ухудшается, причём сильно. Пример - на 3-узловом
кластере с 8 NVMe в каждом узле и сетью 2*25 Гбит/с на чтение с RDMA без ODP
удаётся снять 3950000 iops, а с ODP - всего 239000 iops...
Это происходит из-за того, что реализация ODP у Mellanox неоптимальная и
основана на повторной передаче сообщений, когда карте не известен буфер -
вероятно, на стандартных "RNR retransmission" (RNR = receiver not ready).
А данные повторные передачи в RDMA/RoCE - всегда очень медленная штука.
Презентация на эту тему с конференции ISPASS-2021: https://tkygtr6.github.io/pub/ISPASS21_slides.pdf
Возможность использования ODP сохранена в коде на случай, если вдруг в один
прекрасный день появится хорошая реализация ODP.
- name: peer_connect_interval
type: sec
min: 1

View File

@@ -17,4 +17,15 @@ and apply all `NNN-*.yaml` manifests to your Kubernetes installation:
for i in ./???-*.yaml; do kubectl apply -f $i; done
```
After that you'll be able to create PersistentVolumes. See example in [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
After that you'll be able to create PersistentVolumes.
## Features
Vitastor CSI supports:
- Kubernetes starting with 1.20 (or 1.17 for older vitastor-csi <= 1.1.0)
- Filesystem RWO (ReadWriteOnce) volumes. Example: [PVC](../../csi/deploy/example-pvc.yaml), [pod](../../csi/deploy/example-test-pod.yaml)
- Raw block RWX (ReadWriteMany) volumes. Example: [PVC](../../csi/deploy/example-pvc-block.yaml), [pod](../../csi/deploy/example-test-pod-block.yaml)
- Volume expansion
- Volume snapshots. Example: [snapshot class](../../csi/deploy/example-snapshot-class.yaml), [snapshot](../../csi/deploy/example-snapshot.yaml), [clone](../../csi/deploy/example-snapshot-clone.yaml)
Remember that to use snapshots with CSI you also have to install [Snapshot Controller and CRDs](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).

View File

@@ -17,4 +17,15 @@
for i in ./???-*.yaml; do kubectl apply -f $i; done
```
После этого вы сможете создавать PersistentVolume. Пример смотрите в файле [csi/deploy/example-pvc.yaml](../../csi/deploy/example-pvc.yaml).
После этого вы сможете создавать PersistentVolume.
## Возможности
CSI-плагин Vitastor поддерживает:
- Версии Kubernetes, начиная с 1.20 (или с 1.17 для более старых vitastor-csi <= 1.1.0)
- Файловые RWO (ReadWriteOnce) тома. Пример: [PVC](../../csi/deploy/example-pvc.yaml), [под](../../csi/deploy/example-test-pod.yaml)
- Сырые блочные RWX (ReadWriteMany) тома. Пример: [PVC](../../csi/deploy/example-pvc-block.yaml), [под](../../csi/deploy/example-test-pod-block.yaml)
- Расширение размера томов
- Снимки томов. Пример: [класс снимков](../../csi/deploy/example-snapshot-class.yaml), [снимок](../../csi/deploy/example-snapshot.yaml), [клон снимка](../../csi/deploy/example-snapshot-clone.yaml)
Не забывайте, что для использования снимков нужно сначала установить [контроллер снимков и CRD](https://kubernetes-csi.github.io/docs/snapshot-controller.html#deployment).

View File

@@ -51,13 +51,15 @@
The following features are planned for the future:
- File system
- Control plane optimisation
- Other administrative tools
- Web GUI
- OpenNebula plugin
- iSCSI proxy
- iSCSI and NVMeoF gateways
- Multi-threaded client
- Faster failover
- S3
- Tiered storage (SSD caching)
- NVDIMM support
- Compression (possibly)
- Read caching using system page cache (possibly)

View File

@@ -51,12 +51,15 @@
## Планы развития
- Файловая система
- Оптимизация слоя управления
- Другие инструменты администрирования
- Web-интерфейс
- Плагин для OpenNebula
- iSCSI-прокси
- iSCSI и NVMeoF прокси
- Многопоточный клиент
- Более быстрое переключение при отказах
- S3
- Поддержка SSD-кэширования (tiered storage)
- Поддержка NVDIMM
- Возможно, сжатие

View File

@@ -127,19 +127,46 @@ Linux kernel, starting with version 5.15, supports a new interface for attaching
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
exporting QEMU block devices over this protocol using qemu-storage-daemon.
VDUSE has the same problem as other FUSE-like interfaces in Linux: if a userspace process hangs,
for example, if it loses connectivity with Vitastor cluster - active processes doing I/O may
hang in the D state (uninterruptible sleep) and you won't be able to kill them even with kill -9.
In this case reboot will be the only way to remove VDUSE devices from system.
VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
- It doesn't have the NBD timeout problem - the device doesn't die if an operation executes for too long
- It doesn't have the hung device problem - if the userspace process dies, it can be restarted (!)
and the block device will continue operating
- It doesn't seem to have the device number limit
On the other hand, VDUSE is faster than [NBD](nbd.en.md), so you may prefer to use it if
performance is important for you. Approximate performance numbers:
direct fio benchmark - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
Example performance comparison:
| | direct fio | NBD | VDUSE |
|----------------------|-------------|-------------|-------------|
| linear write | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k random write Q1 | 9500 iops | 7620 iops | 7640 iops |
| linear read | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k random read Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k random read Q1 | 9600 iops | 7640 iops | 7780 iops |
To try VDUSE you need at least Linux 5.15, built with VDUSE support
(CONFIG_VIRTIO_VDPA=m and CONFIG_VDPA_USER=m). Debian Linux kernels have these options
disabled by now, so if you want to try it on Debian, use a kernel from Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) or Proxmox.
(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
Debian Linux kernels have these options disabled for now, so if you want to try it on Debian,
use a kernel from Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/), Proxmox,
or build modules for Debian kernel manually:
```
mkdir build
cd build
apt-get install linux-headers-`uname -r`
apt-get build-dep linux-image-`uname -r`-unsigned
apt-get source linux-image-`uname -r`-unsigned
cd linux*/drivers/vdpa
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
cd ../virtio
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
depmod -a
```
You also need `vdpa` tool from the `iproute2` package.
Commands to attach Vitastor image as a VDUSE device:
@@ -152,7 +179,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
vdpa dev add name test1 mgmtdev vduse
```
After running these commands /dev/vda device will appear in the system and you'll be able to
After running these commands, the `/dev/vda` device will appear in the system and you'll be able to
use it as a normal disk.
To remove the device:

View File

@@ -129,19 +129,47 @@ qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
VDUSE страдает общей проблемой FUSE-подобных интерфейсов в Linux: если пользовательский процесс
подвиснет, например, если будет потеряна связь с кластером Vitastor - читающие/пишущие в кластер
процессы могут "залипнуть" в состоянии D (непрерываемый сон) и их будет невозможно убить даже
через kill -9. В этом случае удалить из системы устройство можно только перезагрузившись.
VDUSE - на данный момент лучший интерфейс для подключения дисков Vitastor в виде блочных
устройств на уровне ядра, ибо:
- VDUSE не копирует данные и поэтому достигает значительно лучшей производительности, чем [NBD](nbd.ru.md)
- Также оно не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго
- Также оно не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
перезапустить (!) и блочное устройство продолжит работать
- По-видимому, у него нет предела числа подключаемых в систему устройств
С другой стороны, VDUSE быстрее по сравнению с [NBD](nbd.ru.md), поэтому его может
быть предпочтительно использовать там, где производительность важнее. Порядок показателей:
прямое тестирование через fio - 115000 iops, NBD - 60000 iops, VDUSE - 90000 iops.
Пример сравнения производительности:
Чтобы использовать VDUSE, вам нужно ядро Linux версии хотя бы 5.15, собранное с поддержкой
VDUSE (CONFIG_VIRTIO_VDPA=m и CONFIG_VDPA_USER=m). В ядрах в Debian Linux поддержка пока
отключена - если хотите попробовать эту функцию на Debian, поставьте ядро из Ubuntu
[kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/) или из Proxmox.
| | Прямой fio | NBD | VDUSE |
|--------------------------|-------------|-------------|-------------|
| линейная запись | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k случайная запись Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k случайная запись Q1 | 9500 iops | 7620 iops | 7640 iops |
| линейное чтение | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k случайное чтение Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k случайное чтение Q1 | 9600 iops | 7640 iops | 7780 iops |
Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
В ядрах в Debian Linux поддержка пока отключена по умолчанию, так что чтобы попробовать VDUSE
на Debian, поставьте ядро из Ubuntu [kernel-ppa/mainline](https://kernel.ubuntu.com/~kernel-ppa/mainline/),
из Proxmox или соберите модули для ядра Debian вручную:
```
mkdir build
cd build
apt-get install linux-headers-`uname -r`
apt-get build-dep linux-image-`uname -r`-unsigned
apt-get source linux-image-`uname -r`-unsigned
cd linux*/drivers/vdpa
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
cat Module.symvers >> /lib/modules/`uname -r`/build/Module.symvers
cd ../virtio
make -C /lib/modules/`uname -r`/build M=$PWD CONFIG_VDPA=m CONFIG_VDPA_USER=m CONFIG_VIRTIO_VDPA=m -j8 modules modules_install
depmod -a
```
Также вам понадобится консольная утилита `vdpa` из пакета `iproute2`.
Команды для подключения виртуального диска через VDUSE:
@@ -154,7 +182,7 @@ qemu-storage-daemon --daemonize --blockdev '{"node-name":"test1","driver":"vitas
vdpa dev add name test1 mgmtdev vduse
```
После этого в системе появится устройство /dev/vda, которое можно будет использовать как
После этого в системе появится устройство `/dev/vda`, которое можно будет использовать как
обычный диск.
Для удаления устройства из системы:

View File

@@ -3,5 +3,5 @@ SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de130599
IMPORT{program}="/usr/bin/vitastor-disk udev $devnode", \
SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="add", RUN{program}+="/usr/bin/systemctl enable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"
ENV{VITASTOR_OSD_NUM}!="", ACTION=="remove", RUN{program}+="/usr/bin/systemctl disable --now --no-block vitastor-osd@$env{VITASTOR_OSD_NUM}"

View File

@@ -397,12 +397,13 @@ class Mon
this.etcd_prefix = this.etcd_prefix.replace(/\/\/+/g, '/').replace(/^\/?(.*[^\/])\/?$/, '/$1');
this.etcd_start_timeout = (config.etcd_start_timeout || 5) * 1000;
this.state = JSON.parse(JSON.stringify(this.constructor.etcd_tree));
this.prev_stats = { osd_stats: {}, osd_diff: {} };
this.signals_set = false;
this.stat_time = Date.now();
this.ws = null;
this.ws_alive = false;
this.ws_keepalive_timer = null;
this.on_stop_cb = () => this.on_stop(0).catch(console.error);
this.recheck_pgs_active = false;
}
parse_etcd_addresses(addrs)
@@ -552,9 +553,9 @@ class Mon
const cur_addr = this.pick_next_etcd();
const base = 'ws'+cur_addr.substr(4);
let now = Date.now();
if (tried[base] && now-tried[base] < timeout)
if (tried[base] && now-tried[base] < this.etcd_start_timeout)
{
await new Promise(ok => setTimeout(ok, timeout-(now-tried[base])));
await new Promise(ok => setTimeout(ok, this.etcd_start_timeout-(now-tried[base])));
now = Date.now();
}
tried[base] = now;
@@ -692,8 +693,27 @@ class Mon
});
}
// Schedule save_last_clean() to run after a small timeout (to not spam etcd).
// The delay is this.config.mon_change_timeout milliseconds, or 1000 (1s) if that is unset/falsy.
// Does nothing when a timer is already pending, so calls made while one is armed are coalesced.
schedule_save_last_clean()
{
if (!this.save_last_clean_timer)
{
this.save_last_clean_timer = setTimeout(() =>
{
// Clear the handle before running so that a new run can be scheduled again
this.save_last_clean_timer = null;
// A rejected save_last_clean() promise is handed to this.die
this.save_last_clean().catch(this.die);
}, this.config.mon_change_timeout || 1000);
}
}
async save_last_clean()
{
if (this.save_last_clean_running)
{
this.schedule_save_last_clean();
return;
}
this.save_last_clean_running = true;
// last_clean_pgs is used to avoid extra data move when observing a series of changes in the cluster
const new_clean_pgs = { items: {} };
next_pool:
@@ -730,6 +750,7 @@ class Mon
value: b64(JSON.stringify(this.state.history.last_clean_pgs))
} } ],
}, this.etcd_start_timeout, 0);
this.save_last_clean_running = false;
}
get_mon_state()
@@ -1203,6 +1224,12 @@ class Mon
async recheck_pgs()
{
if (this.recheck_pgs_active)
{
this.schedule_recheck();
return;
}
this.recheck_pgs_active = true;
// Take configuration and state, check it against the stored configuration hash
// Recalculate PGs and save them to etcd if the configuration is changed
// FIXME: Do not change anything if the distribution is good and random enough and no PGs are degraded
@@ -1224,6 +1251,7 @@ class Mon
// Pool deleted. Delete all PGs, but first stop them.
if (!await this.stop_all_pgs(pool_id))
{
this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
@@ -1292,9 +1320,16 @@ class Mon
// PG count changed. Need to bring all PGs down.
if (!await this.stop_all_pgs(pool_id))
{
this.recheck_pgs_active = false;
this.schedule_recheck();
return;
}
}
if (prev_pgs.length != pool_cfg.pg_count)
{
// Scale PG count
// Do it even if old_pg_count is already equal to pool_cfg.pg_count,
// because last_clean_pgs may still contain the old number of PGs
const new_pg_history = [];
PGUtil.scale_pg_count(prev_pgs, real_prev_pgs, pg_history, new_pg_history, pool_cfg.pg_count);
pg_history = new_pg_history;
@@ -1396,6 +1431,7 @@ class Mon
await this.save_pg_config(new_config_pgs);
}
}
this.recheck_pgs_active = false;
}
async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
@@ -1445,7 +1481,6 @@ class Mon
}
// Schedule a recheck to run after a small timeout (1s)
// If already scheduled, cancel previous timer and schedule it again
// This is required for multiple change events to trigger at most 1 recheck in 1s
schedule_recheck()
{
@@ -1459,15 +1494,15 @@ class Mon
}
}
derive_osd_stats(st, prev)
derive_osd_stats(st, prev, prev_diff)
{
const zero_stats = { op: { bps: 0n, iops: 0n, lat: 0n }, subop: { iops: 0n, lat: 0n }, recovery: { bps: 0n, iops: 0n } };
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
if (!st || !st.time || prev && (prev.time || this.stat_time/1000) >= st.time)
const diff = { op_stats: {}, subop_stats: {}, recovery_stats: {}, inode_stats: {} };
if (!st || !st.time || !prev || !prev.time || prev.time >= st.time)
{
return diff;
return prev_diff || diff;
}
const timediff = BigInt(st.time*1000 - (prev && prev.time*1000 || this.stat_time));
const timediff = BigInt(st.time*1000 - prev.time*1000);
for (const op in st.op_stats||{})
{
const pr = prev && prev.op_stats && prev.op_stats[op];
@@ -1499,25 +1534,47 @@ class Mon
if (n > 0)
diff.recovery_stats[op] = { ...c, bps: b*1000n/timediff, iops: n*1000n/timediff };
}
for (const pool_id in st.inode_stats||{})
{
const pool_diff = diff.inode_stats[pool_id] = {};
for (const inode_num in st.inode_stats[pool_id])
{
const inode_diff = diff.inode_stats[pool_id][inode_num] = {};
for (const op of [ 'read', 'write', 'delete' ])
{
const c = st.inode_stats[pool_id][inode_num][op];
const pr = prev && prev.inode_stats && prev.inode_stats[pool_id] &&
prev.inode_stats[pool_id][inode_num] && prev.inode_stats[pool_id][inode_num][op];
const n = BigInt(c.count||0) - BigInt(pr && pr.count||0);
inode_diff[op] = {
bps: (BigInt(c.bytes||0) - BigInt(pr && pr.bytes||0))*1000n/timediff,
iops: n*1000n/timediff,
lat: (BigInt(c.usec||0) - BigInt(pr && pr.usec||0))/(n || 1n),
};
}
}
}
return diff;
}
sum_op_stats(timestamp, prev_stats)
sum_op_stats()
{
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
if (!prev_stats || prev_stats.timestamp >= timestamp)
for (const osd in this.state.osd.stats)
{
return sum_diff;
const cur = { ...this.state.osd.stats[osd], inode_stats: this.state.osd.inodestats[osd]||{} };
this.prev_stats.osd_diff[osd] = this.derive_osd_stats(
cur, this.prev_stats.osd_stats[osd], this.prev_stats.osd_diff[osd]
);
this.prev_stats.osd_stats[osd] = cur;
}
const tm = BigInt(timestamp - (prev_stats.timestamp || 0));
const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} };
// Sum derived values instead of deriving summed
for (const osd in this.state.osd.stats)
{
const derived = this.derive_osd_stats(this.state.osd.stats[osd],
this.prev_stats && this.prev_stats.osd_stats && this.prev_stats.osd_stats[osd]);
for (const type in derived)
const derived = this.prev_stats.osd_diff[osd];
for (const type in sum_diff)
{
for (const op in derived[type])
for (const op in derived[type]||{})
{
for (const k in derived[type][op])
{
@@ -1574,14 +1631,14 @@ class Mon
return { object_counts, object_bytes };
}
sum_inode_stats(prev_stats, timestamp, prev_timestamp)
sum_inode_stats()
{
const inode_stats = {};
const inode_stub = () => ({
raw_used: 0n,
read: { count: 0n, usec: 0n, bytes: 0n },
write: { count: 0n, usec: 0n, bytes: 0n },
delete: { count: 0n, usec: 0n, bytes: 0n },
read: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
write: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
delete: { count: 0n, usec: 0n, bytes: 0n, bps: 0n, iops: 0n, lat: 0n },
});
const seen_pools = {};
for (const pool_id in this.state.config.pools)
@@ -1633,11 +1690,25 @@ class Mon
}
}
}
if (prev_stats && prev_timestamp >= timestamp)
for (const osd in this.prev_stats.osd_diff)
{
prev_stats = null;
for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats)
{
for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id])
{
inode_stats[pool_id][inode_num] = inode_stats[pool_id][inode_num] || inode_stub();
for (const op of [ 'read', 'write', 'delete' ])
{
const op_diff = this.prev_stats.osd_diff[osd].inode_stats[pool_id][inode_num][op] || {};
const op_st = inode_stats[pool_id][inode_num][op];
op_st.bps += op_diff.bps;
op_st.iops += op_diff.iops;
op_st.lat += op_diff.lat;
op_st.n_osd = (op_st.n_osd || 0) + 1;
}
}
}
}
const tm = prev_stats ? BigInt(timestamp - prev_timestamp) : 0;
for (const pool_id in inode_stats)
{
for (const inode_num in inode_stats[pool_id])
@@ -1646,11 +1717,12 @@ class Mon
for (const op of [ 'read', 'write', 'delete' ])
{
const op_st = inode_stats[pool_id][inode_num][op];
const prev_st = prev_stats && prev_stats[pool_id] && prev_stats[pool_id][inode_num] && prev_stats[pool_id][inode_num][op];
op_st.bps = prev_st ? (op_st.bytes - prev_st.bytes) * 1000n / tm : 0;
op_st.iops = prev_st ? (op_st.count - prev_st.count) * 1000n / tm : 0;
op_st.lat = prev_st ? (op_st.usec - prev_st.usec) / ((op_st.count - prev_st.count) || 1n) : 0;
if (op_st.bps > 0 || op_st.iops > 0 || op_st.lat > 0)
if (op_st.n_osd)
{
op_st.lat /= BigInt(op_st.n_osd);
delete op_st.n_osd;
}
if (op_st.bps > 0 || op_st.iops > 0)
nonzero = true;
}
if (!nonzero && (!this.state.config.inode[pool_id] || !this.state.config.inode[pool_id][inode_num]))
@@ -1683,15 +1755,9 @@ class Mon
async update_total_stats()
{
const txn = [];
const timestamp = Date.now();
const { object_counts, object_bytes } = this.sum_object_counts();
let stats = this.sum_op_stats(timestamp, this.prev_stats);
let { inode_stats, seen_pools } = this.sum_inode_stats(
this.prev_stats ? this.prev_stats.inode_stats : null,
timestamp, this.prev_stats ? this.prev_stats.timestamp : null
);
this.prev_stats = { timestamp, inode_stats, osd_stats: { ...this.state.osd.stats } };
this.stat_time = Date.now();
let stats = this.sum_op_stats();
let { inode_stats, seen_pools } = this.sum_inode_stats();
stats.object_counts = object_counts;
stats.object_bytes = object_bytes;
stats = this.serialize_bigints(stats);

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "1.1.0",
"version": "1.2.0",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VERSION = '1.1.0'
VERSION = '1.2.0'
LOG = logging.getLogger(__name__)

View File

@@ -24,4 +24,4 @@ rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform 's#^#vitastor-1.1.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.1.0$(rpm --eval '%dist').tar.gz *
tar --transform 's#^#vitastor-1.2.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.2.0$(rpm --eval '%dist').tar.gz *

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.1.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.2.0.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.1.0
Version: 1.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.1.0.el7.tar.gz
Source0: vitastor-1.2.0.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -35,7 +35,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.1.0.el8.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.2.0.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.1.0
Version: 1.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.1.0.el8.tar.gz
Source0: vitastor-1.2.0.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -18,7 +18,7 @@ ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
cp /root/vitastor-1.1.0.el9.tar.gz ~/rpmbuild/SOURCES; \
cp /root/vitastor-1.2.0.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 1.1.0
Version: 1.2.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-1.1.0.el9.tar.gz
Source0: vitastor-1.2.0.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -16,10 +16,11 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVERSION="1.1.0")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -I ${CMAKE_SOURCE_DIR}/src)
add_definitions(-DVERSION="1.2.0")
add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})
add_definitions(-fsanitize=address -fno-omit-frame-pointer)
add_definitions(-fsanitize=address)
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
endif (${WITH_ASAN})
@@ -180,6 +181,25 @@ target_link_libraries(vitastor-nbd
vitastor_client
)
# vitastor-kv
# Built from kv_cli.cpp plus the kv_db sources, linked against vitastor_client
add_executable(vitastor-kv
kv_cli.cpp
kv_db.cpp
kv_db.h
)
target_link_libraries(vitastor-kv
vitastor_client
)
# vitastor-kv-stress
# Built from kv_stress.cpp; compiles the same kv_db sources as vitastor-kv
add_executable(vitastor-kv-stress
kv_stress.cpp
kv_db.cpp
kv_db.h
)
target_link_libraries(vitastor-kv-stress
vitastor_client
)
# vitastor-nfs
add_executable(vitastor-nfs
nfs_proxy.cpp

View File

@@ -1372,7 +1372,8 @@ bool journal_flusher_co::trim_journal(int wait_base)
? (uint32_t)JE_START_V1_SIZE : (uint32_t)JE_START_V2_SIZE),
.reserved = 0,
.journal_start = new_trim_pos,
.version = JOURNAL_VERSION_V2,
.version = (uint64_t)(!bs->dsk.data_csum_type && ((journal_entry_start*)flusher->journal_superblock)->version == JOURNAL_VERSION_V1
? JOURNAL_VERSION_V1 : JOURNAL_VERSION_V2),
.data_csum_type = bs->dsk.data_csum_type,
.csum_block_size = bs->dsk.csum_block_size,
};

View File

@@ -274,7 +274,7 @@ class blockstore_impl_t
blockstore_dirty_db_t dirty_db;
std::vector<blockstore_op_t*> submit_queue;
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
int unsynced_big_write_count = 0;
int unsynced_big_write_count = 0, unstable_unsynced = 0;
int unsynced_queued_ops = 0;
allocator *data_alloc = NULL;
uint8_t *zero_object;

View File

@@ -553,7 +553,7 @@ resume_1:
}
if (je_start->size == JE_START_V0_SIZE ||
(je_start->version != JOURNAL_VERSION_V1 || je_start->size != JE_START_V1_SIZE) &&
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE))
(je_start->version != JOURNAL_VERSION_V2 || je_start->size != JE_START_V2_SIZE && je_start->size != JE_START_V1_SIZE))
{
fprintf(
stderr, "The code only supports journal versions 2 and 1, but it is %lu on disk."
@@ -562,7 +562,8 @@ resume_1:
);
exit(1);
}
if (je_start->version == JOURNAL_VERSION_V1)
if (je_start->version == JOURNAL_VERSION_V1 ||
je_start->version == JOURNAL_VERSION_V2 && je_start->size == JE_START_V1_SIZE)
{
je_start->data_csum_type = 0;
je_start->csum_block_size = 0;

View File

@@ -145,6 +145,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
journal.sector_info[journal.cur_sector].offset = journal.next_free;
journal.in_sector_pos = 0;
journal.next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
assert(journal.next_free != journal.used_start);
memset(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);

View File

@@ -13,12 +13,6 @@
#define JOURNAL_BUFFER_SIZE 4*1024*1024
#define JOURNAL_ENTRY_HEADER_SIZE 16
// We reserve some extra space for future stabilize requests during writes
// FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
// writing more than can be stabilized afterwards
#define JOURNAL_STABILIZE_RESERVATION 65536
#define JOURNAL_INSTANT_RESERVATION 131072
// Journal entries
// Journal entries are linked to each other by their crc32 value
// The journal is almost a blockchain, because object versions constantly increase

View File

@@ -86,14 +86,15 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
auto & dirty_entry = dirty_db.at(sbw);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
left == 0 ? JOURNAL_STABILIZE_RESERVATION : 0))
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
}
}
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size,
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
@@ -184,6 +185,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
mark_stable(dirty_it->first);
}
else
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
{
@@ -214,6 +220,11 @@ void blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
mark_stable(*it);
}
else
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
}
}
op->retval = 0;

View File

@@ -21,7 +21,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
dyn = calloc_or_die(1, dyn_size+sizeof(int));
*((int*)dyn) = 1;
}
uint8_t *dyn_ptr = (uint8_t*)(alloc_dyn_data ? dyn+sizeof(int) : &dyn);
uint8_t *dyn_ptr = (alloc_dyn_data ? (uint8_t*)dyn+sizeof(int) : (uint8_t*)&dyn);
uint64_t version = 1;
if (dirty_db.size() > 0)
{
@@ -320,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, unsynced_big_write_count + 1,
sizeof(journal_entry_big_write) + dsk.clean_dyn_size,
(dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION))
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
@@ -386,6 +386,10 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
);
PRIV(op)->pending_ops = 1;
if (immediate_commit != IMMEDIATE_ALL && !(dirty_it->second.state & BS_ST_INSTANT))
{
unstable_unsynced++;
}
if (immediate_commit != IMMEDIATE_ALL)
{
// Increase the counter, but don't save into unsynced_writes yet (can't sync until the write is finished)
@@ -408,7 +412,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0)
|| !space_check.check_available(op, 1,
sizeof(journal_entry_small_write) + dyn_size,
op->len + ((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
@@ -499,6 +503,11 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
if (journal.next_free >= journal.len)
{
journal.next_free = dsk.journal_block_size;
assert(journal.next_free != journal.used_start);
}
if (immediate_commit == IMMEDIATE_NONE && !(dirty_it->second.state & BS_ST_INSTANT))
{
unstable_unsynced++;
}
if (!PRIV(op)->pending_ops)
{
@@ -538,7 +547,7 @@ resume_2:
uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len);
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size,
((dirty_it->second.state & BS_ST_INSTANT) ? JOURNAL_INSTANT_RESERVATION : JOURNAL_STABILIZE_RESERVATION)))
(unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}
@@ -582,14 +591,20 @@ resume_4:
#endif
bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT));
if (imm)
{
auto & unstab = unstable_writes[op->oid];
unstab = unstab < op->version ? op->version : unstab;
}
else if (!is_instant)
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
| (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
if (imm && is_instant)
{
// Deletions and 'instant' operations are treated as immediately stable
mark_stable(dirty_it->first);
@@ -735,7 +750,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
});
assert(dirty_it != dirty_db.end());
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), JOURNAL_INSTANT_RESERVATION))
if (!space_check.check_available(op, 1, sizeof(journal_entry_del), (unstable_writes.size()+unstable_unsynced)*journal.block_size))
{
return 0;
}

View File

@@ -17,7 +17,7 @@
static const char *exe_name = NULL;
static const char* help_text =
"Vitastor command-line tool\n"
"Vitastor command-line tool " VERSION "\n"
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"
@@ -331,7 +331,7 @@ static int run(cli_tool_t *p, json11::Json::object cfg)
{
// Create client
json11::Json cfg_j = cfg;
p->ringloop = new ring_loop_t(512);
p->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
p->epmgr = new epoll_manager_t(p->ringloop);
p->cli = new cluster_client_t(p->ringloop, p->epmgr->tfd, cfg_j);
// Smaller timeout by default for more interactiveness

View File

@@ -109,7 +109,7 @@ resume_1:
}
for (auto pg_per_pair: pg_per_osd)
{
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.pg_count / pg_per_pair.second;
uint64_t pg_free = osd_free[pg_per_pair.first] * pool_cfg.real_pg_count / pg_per_pair.second;
if (pool_avail > pg_free)
{
pool_avail = pg_free;
@@ -124,8 +124,10 @@ resume_1:
pool_avail *= (pool_cfg.pg_size - pool_cfg.parity_chunks);
}
pool_stats[pool_cfg.id] = json11::Json::object {
{ "id", (uint64_t)pool_cfg.id },
{ "name", pool_cfg.name },
{ "pg_count", pool_cfg.pg_count },
{ "real_pg_count", pool_cfg.real_pg_count },
{ "scheme", pool_cfg.scheme == POOL_SCHEME_REPLICATED ? "replicated" : "ec" },
{ "scheme_name", pool_cfg.scheme == POOL_SCHEME_REPLICATED
? std::to_string(pool_cfg.pg_size)+"/"+std::to_string(pool_cfg.pg_minsize)
@@ -176,7 +178,7 @@ resume_1:
{ "title", "SCHEME" },
});
cols.push_back(json11::Json::object{
{ "key", "pg_count" },
{ "key", "pg_count_fmt" },
{ "title", "PGS" },
});
cols.push_back(json11::Json::object{
@@ -205,6 +207,9 @@ resume_1:
double raw_to = kv.second["raw_to_usable"].number_value();
if (raw_to < 0.000001 && raw_to > -0.000001)
raw_to = 1;
kv.second["pg_count_fmt"] = kv.second["real_pg_count"] == kv.second["pg_count"]
? kv.second["real_pg_count"].as_string()
: kv.second["real_pg_count"].as_string()+"->"+kv.second["pg_count"].as_string();
kv.second["total_fmt"] = format_size(kv.second["total_raw"].uint64_value() / raw_to);
kv.second["used_fmt"] = format_size(kv.second["used_raw"].uint64_value() / raw_to);
kv.second["max_avail_fmt"] = format_size(kv.second["max_available"].uint64_value());

View File

@@ -158,12 +158,7 @@ resume_2:
for (auto & pool_pair: parent->cli->st_cli.pool_config)
{
auto & pool_cfg = pool_pair.second;
bool active = true;
if (pool_cfg.pg_config.size() != pool_cfg.pg_count)
{
active = false;
pgs_by_state["offline"] += pool_cfg.pg_count-pool_cfg.pg_config.size();
}
bool active = pool_cfg.real_pg_count > 0;
pool_count++;
for (auto pg_it = pool_cfg.pg_config.begin(); pg_it != pool_cfg.pg_config.end(); pg_it++)
{

View File

@@ -6,7 +6,7 @@
#include "cluster_client_impl.h"
#include "http_client.h" // json_is_true
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config)
cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config)
{
wb = new writeback_cache_t();
@@ -64,7 +64,7 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
cluster_client_t::~cluster_client_t()
{
msgr.repeer_pgs = [this](osd_num_t){};
msgr.repeer_pgs = [](osd_num_t){};
if (ringloop)
{
ringloop->unregister_consumer(&consumer);
@@ -454,7 +454,7 @@ bool cluster_client_t::flush()
wb->start_writebacks(this, 0);
cluster_op_t *sync = new cluster_op_t;
sync->opcode = OSD_OP_SYNC;
sync->callback = [this](cluster_op_t *sync)
sync->callback = [](cluster_op_t *sync)
{
delete sync;
};
@@ -465,7 +465,7 @@ bool cluster_client_t::flush()
bool sync_done = false;
cluster_op_t *sync = new cluster_op_t;
sync->opcode = OSD_OP_SYNC;
sync->callback = [this, &sync_done](cluster_op_t *sync)
sync->callback = [&sync_done](cluster_op_t *sync)
{
delete sync;
sync_done = true;
@@ -532,7 +532,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
return;
}
if (op->opcode == OSD_OP_WRITE && enable_writeback && !(op->flags & OP_FLUSH_BUFFER) &&
!op->version /* FIXME no CAS writeback */)
!op->version /* no CAS writeback */)
{
if (wb->writebacks_active >= client_max_writeback_iodepth)
{
@@ -553,7 +553,7 @@ void cluster_client_t::execute_internal(cluster_op_t *op)
}
if (op->opcode == OSD_OP_WRITE && !(op->flags & OP_IMMEDIATE_COMMIT))
{
if (!(op->flags & OP_FLUSH_BUFFER))
if (!(op->flags & OP_FLUSH_BUFFER) && !op->version /* no CAS write-repeat */)
{
wb->copy_write(op, CACHE_WRITTEN);
}
@@ -1152,7 +1152,7 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
osd_op_names[part->op.req.hdr.opcode], part->osd_num, part->op.reply.hdr.retval, expected
);
}
else
else if (log_level > 0)
{
fprintf(
stderr, "%s operation failed on OSD %lu: retval=%ld (expected %d)\n",

View File

@@ -121,7 +121,7 @@ public:
json11::Json::object cli_config, file_config, etcd_global_config;
json11::Json::object config;
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json config);
~cluster_client_t();
void execute(cluster_op_t *op);
void execute_raw(osd_num_t osd_num, osd_op_t *op);

View File

@@ -263,7 +263,7 @@ void writeback_cache_t::flush_buffers(cluster_client_t *cli, dirty_buf_it_t from
}
assert(calc_len == op->len);
writebacks_active++;
op->callback = [this, cli, flush_id](cluster_op_t* op)
op->callback = [this, flush_id](cluster_op_t* op)
{
// Buffer flushes should be always retried, regardless of the error,
// so they should never result in an error here
@@ -383,7 +383,7 @@ static void copy_to_op(cluster_op_t *op, uint64_t offset, uint8_t *buf, uint64_t
auto begin = (cur_offset < offset ? offset : cur_offset);
auto end = (cur_offset+v.iov_len > offset+len ? offset+len : cur_offset+v.iov_len);
memcpy(
v.iov_base + begin - cur_offset,
(uint8_t*)v.iov_base + begin - cur_offset,
buf + (cur_offset <= offset ? 0 : cur_offset-offset),
end - begin
);

View File

@@ -5,7 +5,7 @@
#include "str_util.h"
static const char *help_text =
"Vitastor disk management tool\n"
"Vitastor disk management tool " VERSION "\n"
"(c) Vitaliy Filippov, 2022+ (VNPL-1.1)\n"
"\n"
"COMMANDS:\n"
@@ -229,7 +229,7 @@ int main(int argc, char *argv[])
{
self.options["allow_data_loss"] = "1";
}
else if (argv[i][0] == '-' && argv[i][1] == '-')
else if (argv[i][0] == '-' && argv[i][1] == '-' && i < argc-1)
{
char *key = argv[i]+2;
self.options[key] = argv[++i];

View File

@@ -320,7 +320,7 @@ void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
if (journal_calc_data_pos != sw.data_offset)
{
printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
: " (mismatched, calculated = %lu)", journal_pos);
: " (mismatched, calculated = %08lx)", journal_pos);
}
uint32_t data_csum_size = (!je_start.csum_block_size
? 0

View File

@@ -245,7 +245,7 @@ int disk_tool_t::resize_copy_data()
{
iodepth = 32;
}
ringloop = new ring_loop_t(iodepth < 512 ? 512 : iodepth);
ringloop = new ring_loop_t(iodepth < RINGLOOP_DEFAULT_SIZE ? RINGLOOP_DEFAULT_SIZE : iodepth);
dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
if (dsk.data_fd < 0)
{

View File

@@ -130,7 +130,7 @@ static int bs_init(struct thread_data *td)
config[p.first] = p.second.dump();
}
}
bsd->ringloop = new ring_loop_t(512);
bsd->ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
bsd->epmgr = new epoll_manager_t(bsd->ringloop);
bsd->bs = new blockstore_t(config, bsd->ringloop, bsd->epmgr->tfd);
while (1)

401
src/kv_cli.cpp Normal file
View File

@@ -0,0 +1,401 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Vitastor shared key/value database test CLI
#define _XOPEN_SOURCE
#include <limits.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <unistd.h>
#include <fcntl.h>
//#include <signal.h>
#include "epoll_manager.h"
#include "str_util.h"
#include "kv_db.h"
const char *exe_name = NULL;
// Command-line front-end for the key/value database.
// Commands are read from stdin as newline-separated lines and executed
// strictly one at a time (see next_cmd() / handle_cmd()).
class kv_cli_t
{
public:
// Objects created in run() and destroyed at its end (or in the destructor)
kv_dbw_t *db = NULL;
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
// Set when stdin was successfully registered in epoll; enables the "> " prompt
bool interactive = false;
// Number of commands currently being executed (incremented before handle_cmd(),
// decremented in its completion callback)
int in_progress = 0;
// Buffer of not yet processed stdin bytes
char *cur_cmd = NULL;
// Used bytes in <cur_cmd> and its allocated capacity
int cur_cmd_size = 0, cur_cmd_alloc = 0;
// <finished> stops the main loop in run(); <eof> is set when read() returns 0
bool finished = false, eof = false;
// Configuration passed to db->open() and db->set_config() by the "open" and "config" commands
json11::Json::object cfg;
~kv_cli_t();
// Converts "--option value" command-line arguments into a JSON object
static json11::Json::object parse_args(int narg, const char *args[]);
// Creates the client, processes commands until <finished> is set, then destroys the client
void run(const json11::Json::object & cfg);
// Reads available stdin data into <cur_cmd> and calls next_cmd()
void read_cmd();
// Extracts the next line from <cur_cmd> and executes it unless a command is already running
void next_cmd();
// Executes a single command line; <cb> is called when the command completes
void handle_cmd(const std::string & cmd, std::function<void()> cb);
};
// Releases the command buffer and whatever run() left allocated.
kv_cli_t::~kv_cli_t()
{
    // free(NULL) is a no-op, so no explicit check is required
    free(cur_cmd);
    cur_cmd = NULL;
    cur_cmd_alloc = 0;
    // The database goes first because it works on top of the client
    delete db;
    if (cli != NULL)
    {
        // Flush the client before destroying it
        cli->flush();
        delete cli;
    }
    delete epmgr;
    delete ringloop;
}
// Builds a configuration object from command-line arguments.
// "-h" / "--help" prints the usage text and exits.
// "--name value" becomes {name: value}; "--json" and a trailing "--name"
// without a value become {name: "1"}. Arguments not starting with "--" are skipped.
json11::Json::object kv_cli_t::parse_args(int narg, const char *args[])
{
    json11::Json::object result;
    for (int idx = 1; idx < narg; idx++)
    {
        const char *arg = args[idx];
        if (strcmp(arg, "-h") == 0 || strcmp(arg, "--help") == 0)
        {
            printf(
                "Vitastor Key/Value CLI\n"
                "(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
                "\n"
                "USAGE: %s [--etcd_address ADDR] [OTHER OPTIONS]\n",
                exe_name
            );
            exit(0);
        }
        if (arg[0] != '-' || arg[1] != '-')
        {
            // Not a long option
            continue;
        }
        const char *opt = arg+2;
        if (!strcmp(opt, "json") || idx == narg-1)
        {
            // Flag without a value
            result[opt] = "1";
        }
        else
        {
            // The next argument is the value
            result[opt] = args[++idx];
        }
    }
    return result;
}
// Creates the cluster client and the database wrapper, then processes
// commands from stdin until <finished> is set, and finally destroys everything.
void kv_cli_t::run(const json11::Json::object & cfg)
{
    // Create client
    ringloop = new ring_loop_t(512);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    db = new kv_dbw_t(cli);
    // Run the event loop until the client reports readiness
    while (!cli->is_ready())
    {
        ringloop->loop();
        if (cli->is_ready())
            break;
        ringloop->wait();
    }
    // Switch stdin to non-blocking mode
    const int stdin_flags = fcntl(0, F_GETFL, 0);
    fcntl(0, F_SETFL, stdin_flags | O_NONBLOCK);
    try
    {
        auto on_stdin_event = [this](int fd, int events)
        {
            if (events & EPOLLIN)
            {
                read_cmd();
            }
            if (events & EPOLLRDHUP)
            {
                // Input closed: unregister the handler and stop
                epmgr->tfd->set_fd_handler(0, false, NULL);
                finished = true;
            }
        };
        epmgr->tfd->set_fd_handler(0, false, on_stdin_event);
        interactive = true;
        printf("> ");
    }
    catch (std::exception &)
    {
        // Can't add to epoll, STDIN is probably a file
        read_cmd();
    }
    // Main loop
    while (!finished)
    {
        ringloop->loop();
        if (!finished)
            ringloop->wait();
    }
    // Destroy the client
    delete db;
    db = NULL;
    cli->flush();
    delete cli;
    cli = NULL;
    delete epmgr;
    epmgr = NULL;
    delete ringloop;
    ringloop = NULL;
}
// Reads everything currently available on stdin into the command buffer
// (allocated on first use) and then tries to execute the next command.
void kv_cli_t::read_cmd()
{
    if (cur_cmd_alloc == 0)
    {
        cur_cmd_alloc = 65536;
        cur_cmd = (char*)malloc_or_die(cur_cmd_alloc);
    }
    // Stop when the buffer is full, on EOF, on EAGAIN or on a read error
    while (cur_cmd_size < cur_cmd_alloc)
    {
        const int got = read(0, cur_cmd+cur_cmd_size, cur_cmd_alloc-cur_cmd_size);
        if (got > 0)
        {
            cur_cmd_size += got;
            continue;
        }
        if (got == 0)
        {
            eof = true;
        }
        else if (errno != EAGAIN)
        {
            fprintf(stderr, "Error reading from stdin: %s\n", strerror(errno));
        }
        break;
    }
    next_cmd();
}
// Takes the next complete line out of the command buffer and executes it.
// Does nothing while another command is still running - commands are
// executed strictly one after another.
// When stdin has reached EOF, the remaining unterminated data (if any) is
// executed as the last command instead of being silently dropped.
// Sets <finished> when EOF was reached and no command is running.
void kv_cli_t::next_cmd()
{
    if (in_progress > 0)
    {
        return;
    }
    // Find the end of the first buffered line
    int pos = 0;
    while (pos < cur_cmd_size && cur_cmd[pos] != '\n' && cur_cmd[pos] != '\r')
    {
        pos++;
    }
    const bool have_line = pos < cur_cmd_size;
    if (have_line || (eof && cur_cmd_size > 0))
    {
        auto cmd = trim(std::string(cur_cmd, pos));
        if (have_line)
        {
            // Also consume the line terminator
            pos++;
        }
        memmove(cur_cmd, cur_cmd+pos, cur_cmd_size-pos);
        cur_cmd_size -= pos;
        in_progress++;
        handle_cmd(cmd, [this]()
        {
            in_progress--;
            if (interactive)
                printf("> ");
            // Run the next buffered command, then try to read more input
            next_cmd();
            if (!in_progress)
                read_cmd();
        });
    }
    if (eof && !in_progress)
    {
        finished = true;
    }
}
// Executes one command line.
// <cmd> is the command text (the caller passes it already trimmed),
// <cb> is called exactly once when the command completes, except for
// "quit" which just sets <finished>.
// Supported commands: open, config, get, set, del, list, close, quit (q).
void kv_cli_t::handle_cmd(const std::string & cmd, std::function<void()> cb)
{
    if (cmd == "")
    {
        cb();
        return;
    }
    // <name_end> is the end of the command name, <pos> is the position of the
    // last whitespace character before the arguments (so arguments start at pos+1).
    // The name must be cut at <name_end>, not at <pos>, otherwise repeated
    // whitespace after the name becomes part of it and the command is not recognized.
    const auto name_end = cmd.find_first_of(" \t");
    auto pos = name_end;
    if (pos != std::string::npos)
    {
        while (pos < cmd.size()-1 && (cmd[pos+1] == ' ' || cmd[pos+1] == '\t'))
            pos++;
    }
    auto opname = strtolower(name_end == std::string::npos ? cmd : cmd.substr(0, name_end));
    if (opname == "open")
    {
        uint64_t pool_id = 0;
        inode_t inode_id = 0;
        uint32_t kv_block_size = 0;
        // No arguments at all => scan an empty string and fall into the usage message
        const char *open_args = pos == std::string::npos ? "" : cmd.c_str() + pos+1;
        int scanned = sscanf(open_args, "%lu %lu %u", &pool_id, &inode_id, &kv_block_size);
        if (scanned == 2)
        {
            kv_block_size = 4096;
        }
        if (scanned < 2 || !pool_id || !inode_id || !kv_block_size || (kv_block_size & (kv_block_size-1)) != 0)
        {
            fprintf(stderr, "Usage: open <pool_id> <inode_id> [block_size]. Block size must be a power of 2. Default is 4096.\n");
            cb();
            return;
        }
        cfg["kv_block_size"] = (uint64_t)kv_block_size;
        db->open(INODE_WITH_POOL(pool_id, inode_id), cfg, [=](int res)
        {
            if (res < 0)
                fprintf(stderr, "Error opening index: %s (code %d)\n", strerror(-res), res);
            else
                printf("Index opened. Current size: %lu bytes\n", db->get_size());
            cb();
        });
    }
    else if (opname == "config")
    {
        auto pos2 = pos == std::string::npos ? std::string::npos : cmd.find_first_of(" \t", pos+1);
        if (pos2 == std::string::npos)
        {
            fprintf(stderr, "Usage: config <property> <value>\n");
            cb();
            return;
        }
        auto key = trim(cmd.substr(pos+1, pos2-pos-1));
        auto value = parse_size(trim(cmd.substr(pos2+1)));
        if (key != "kv_memory_limit" &&
            key != "kv_allocate_blocks" &&
            key != "kv_evict_max_misses" &&
            key != "kv_evict_attempts_per_level" &&
            key != "kv_evict_unused_age" &&
            key != "kv_log_level")
        {
            fprintf(
                stderr, "Allowed properties: kv_memory_limit, kv_allocate_blocks,"
                " kv_evict_max_misses, kv_evict_attempts_per_level, kv_evict_unused_age, kv_log_level\n"
            );
        }
        else
        {
            cfg[key] = value;
            db->set_config(cfg);
        }
        cb();
    }
    else if (opname == "get" || opname == "set" || opname == "del")
    {
        if (opname == "get" || opname == "del")
        {
            if (pos == std::string::npos)
            {
                fprintf(stderr, "Usage: %s <key>\n", opname.c_str());
                cb();
                return;
            }
            auto key = trim(cmd.substr(pos+1));
            if (opname == "get")
            {
                db->get(key, [cb](int res, const std::string & value)
                {
                    if (res < 0)
                        fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                    else
                    {
                        // Values are written raw because they may contain arbitrary bytes
                        write(1, value.c_str(), value.size());
                        write(1, "\n", 1);
                    }
                    cb();
                });
            }
            else
            {
                db->del(key, [cb](int res)
                {
                    if (res < 0)
                        fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                    else
                        printf("OK\n");
                    cb();
                });
            }
        }
        else
        {
            auto pos2 = pos == std::string::npos ? std::string::npos : cmd.find_first_of(" \t", pos+1);
            if (pos2 == std::string::npos)
            {
                fprintf(stderr, "Usage: set <key> <value>\n");
                cb();
                return;
            }
            auto key = trim(cmd.substr(pos+1, pos2-pos-1));
            auto value = trim(cmd.substr(pos2+1));
            db->set(key, value, [cb](int res)
            {
                if (res < 0)
                    fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                else
                    printf("OK\n");
                cb();
            });
        }
    }
    else if (opname == "list")
    {
        // NOTE(review): <end> is parsed but is not used to bound the listing
        std::string start, end;
        if (pos != std::string::npos)
        {
            auto pos2 = cmd.find_first_of(" \t", pos+1);
            if (pos2 != std::string::npos)
            {
                start = trim(cmd.substr(pos+1, pos2-pos-1));
                end = trim(cmd.substr(pos2+1));
            }
            else
            {
                start = trim(cmd.substr(pos+1));
            }
        }
        void *handle = db->list_start(start);
        db->list_next(handle, [=](int res, const std::string & key, const std::string & value)
        {
            if (res < 0)
            {
                // -ENOENT is the normal end of the listing, everything else is an error
                if (res != -ENOENT)
                {
                    fprintf(stderr, "Error: %s (code %d)\n", strerror(-res), res);
                }
                db->list_close(handle);
                cb();
            }
            else
            {
                printf("%s = %s\n", key.c_str(), value.c_str());
                // Request the next key
                db->list_next(handle, NULL);
            }
        });
    }
    else if (opname == "close")
    {
        db->close([=]()
        {
            printf("Index closed\n");
            cb();
        });
    }
    else if (opname == "quit" || opname == "q")
    {
        // <cb> is intentionally not called: no more commands should be processed
        ::close(0);
        finished = true;
    }
    else
    {
        fprintf(
            stderr, "Unknown operation: %s. Supported operations:\n"
            "open <pool_id> <inode_id> [block_size]\n"
            "config <property> <value>\n"
            "get <key>\nset <key> <value>\ndel <key>\nlist [<start> [end]]\n"
            "close\nquit\n", opname.c_str()
        );
        cb();
    }
}
int main(int narg, const char *args[])
{
setvbuf(stdout, NULL, _IONBF, 0);
setvbuf(stderr, NULL, _IONBF, 0);
exe_name = args[0];
kv_cli_t *p = new kv_cli_t();
p->run(kv_cli_t::parse_args(narg, args));
delete p;
return 0;
}

2064
src/kv_db.cpp Normal file

File diff suppressed because it is too large Load Diff

36
src/kv_db.h Normal file
View File

@@ -0,0 +1,36 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Vitastor shared key/value database
// Parallel optimistic B-Tree O:-)
#pragma once
#include "cluster_client.h"
struct kv_db_t;
// Public wrapper around the key/value database implementation (kv_db_t),
// which is only forward-declared here.
// All operations report completion through callbacks. Callers in this
// code base treat a negative <res> as a negated errno value (they print it
// with strerror(-res)) and -ENOENT as "not found" / "end of listing".
struct kv_dbw_t
{
// <cli> is the cluster client used for I/O
// NOTE(review): presumably not owned by the wrapper - callers delete it themselves
kv_dbw_t(cluster_client_t *cli);
~kv_dbw_t();
// Opens the index stored in <inode_id> with configuration <cfg>; <cb> receives the result code
void open(inode_t inode_id, json11::Json cfg, std::function<void(int)> cb);
// Applies configuration parameters
void set_config(json11::Json cfg);
// Closes the index; <cb> is called on completion
void close(std::function<void()> cb);
// Returns the index size (printed as "bytes" by the CLI)
uint64_t get_size();
// Retrieves the value of <key>
// NOTE(review): <allow_old_cached> presumably permits returning possibly stale cached data - confirm
void get(const std::string & key, std::function<void(int res, const std::string & value)> cb,
bool allow_old_cached = false);
// Sets <key> to <value>
// NOTE(review): <cas_compare> looks like an optional compare-and-set predicate over the current value - confirm
void set(const std::string & key, const std::string & value, std::function<void(int res)> cb,
std::function<bool(int res, const std::string & value)> cas_compare = NULL);
// Deletes <key>; <cas_compare> has the same form as in set()
void del(const std::string & key, std::function<void(int res)> cb,
std::function<bool(int res, const std::string & value)> cas_compare = NULL);
// Starts a listing from key <start> and returns an opaque handle
void* list_start(const std::string & start);
// Requests the next key of the listing. Callers pass the callback on the first
// call and NULL afterwards to continue with the same callback
void list_next(void *handle, std::function<void(int res, const std::string & key, const std::string & value)> cb);
// Finishes the listing identified by <handle>
void list_close(void *handle);
// Implementation object
kv_db_t *db;
};

697
src/kv_stress.cpp Normal file
View File

@@ -0,0 +1,697 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
//
// Vitastor shared key/value database stress tester / benchmark
#define _XOPEN_SOURCE
#include <limits.h>
#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <unistd.h>
#include <fcntl.h>
//#include <signal.h>
#include "epoll_manager.h"
#include "str_util.h"
#include "kv_db.h"
const char *exe_name = NULL;
// State of one listing operation being verified by the stress tester
struct kv_test_listing_t
{
    // Number of keys to check (0 = no limit)
    uint64_t count = 0;
    // Number of keys checked so far
    uint64_t done = 0;
    // Listing handle returned by the database
    void *handle = NULL;
    // Last key matched against the reference data
    std::string next_after;
    // Keys modified while the listing is running; they are excluded from verification
    std::set<std::string> inflights;
    // Listing start time
    timespec tv_begin;
    // Set when a mismatch or an error was detected
    bool error = false;
};
// Accumulated latency statistics of one operation type
struct kv_test_lat_t
{
    // Operation name used in the output
    const char *name = NULL;
    // Total time spent, in microseconds
    uint64_t usec = 0;
    // Number of accounted operations
    uint64_t count = 0;
};
// Statistics for all operation types
struct kv_test_stat_t
{
    kv_test_lat_t get;
    kv_test_lat_t add;
    kv_test_lat_t update;
    kv_test_lat_t del;
    kv_test_lat_t list;
    // Total number of keys returned by listings
    uint64_t list_keys = 0;
};
// Stress tester / benchmark for the key/value database.
// Runs random get/add/update/delete/list/reopen operations in parallel and
// verifies the results against an in-memory copy of the data (<values>).
class kv_test_t
{
public:
// Config
// Parameters passed to the database on open
json11::Json::object kv_cfg;
// Prefix and suffix of generated keys
std::string key_prefix, key_suffix;
// Inode of the index, including the pool ID
uint64_t inode_id = 0;
// Total number of operations to run
uint64_t op_count = 1000000;
// Time limit in seconds read from the "runtime" option
uint64_t runtime_sec = 0;
// Maximum number of operations running in parallel
uint64_t parallelism = 4;
// Relative weights of operation types; an operation is selected by
// a random number in the range [0, total_prob)
uint64_t reopen_prob = 1;
uint64_t get_prob = 30000;
uint64_t add_prob = 20000;
uint64_t update_prob = 20000;
uint64_t del_prob = 5000;
uint64_t list_prob = 300;
// Limits for random key and value lengths
uint64_t min_key_len = 10;
uint64_t max_key_len = 70;
uint64_t min_value_len = 50;
uint64_t max_value_len = 300;
// Limits for the number of keys checked by one listing
uint64_t min_list_count = 10;
uint64_t max_list_count = 1000;
// Statistics printing interval in seconds (0 = disabled)
uint64_t print_stats_interval = 1;
bool json_output = false;
uint64_t log_level = 1;
// Print every operation; enabled when log_level >= 10
bool trace = false;
// Exit on the first detected error
bool stop_on_error = false;
// FIXME: Multiple clients
// Current statistics and their snapshot at the previous print
kv_test_stat_t stat, prev_stat;
// Time of the previous statistics print and of the test start
timespec prev_stat_time, start_stat_time;
// State
kv_dbw_t *db = NULL;
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
// Event loop consumer which calls loop()
ring_consumer_t consumer;
bool finished = false;
// Sum of all *_prob weights
uint64_t total_prob = 0;
uint64_t ops_sent = 0, ops_done = 0;
int stat_timer_id = -1;
// Number of operations currently running
int in_progress = 0;
// Set while the index is being (re)opened; no operations are started meanwhile
bool reopening = false;
// Listings currently running
std::set<kv_test_listing_t*> listings;
// Keys being modified by running set/del operations
std::set<std::string> changing_keys;
// Reference copy of the data expected to be stored in the database
std::map<std::string, std::string> values;
~kv_test_t();
static json11::Json::object parse_args(int narg, const char *args[]);
void parse_config(json11::Json cfg);
void run(json11::Json cfg);
// Starts new operations up to the parallelism limit
void loop();
// Prints statistics accumulated since <prev_stat> / <prev_stat_time> and updates both
void print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time);
void print_total_stats();
// Mark <key> as being modified / no longer being modified
void start_change(const std::string & key);
void stop_change(const std::string & key);
// Accounts one operation started at <tv_begin> in <stat>
void add_stat(kv_test_lat_t & stat, timespec tv_begin);
};
// Destroys whatever run() left allocated.
kv_test_t::~kv_test_t()
{
    // The database goes first because it works on top of the client
    delete db;
    if (cli != NULL)
    {
        // Flush the client before destroying it
        cli->flush();
        delete cli;
    }
    delete epmgr;
    delete ringloop;
}
// Builds a configuration object from command-line arguments.
// "-h" / "--help" prints the usage text and exits.
// "--name value" becomes {name: value}; "--json" and a trailing "--name"
// without a value become {name: "1"}. Arguments not starting with "--" are skipped.
json11::Json::object kv_test_t::parse_args(int narg, const char *args[])
{
    json11::Json::object cfg;
    for (int i = 1; i < narg; i++)
    {
        if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
        {
            // The defaults listed here must match the member initializers of kv_test_t
            printf(
                "Vitastor Key/Value DB stress tester / benchmark\n"
                "(c) Vitaliy Filippov, 2023+ (VNPL-1.1)\n"
                "\n"
                "USAGE: %s --pool_id POOL_ID --inode_id INODE_ID [OPTIONS]\n"
                "  --op_count 1000000\n"
                "    Total operations to run during test. 0 means unlimited\n"
                "  --key_prefix \"\"\n"
                "    Prefix for all keys read or written (to avoid collisions)\n"
                "  --key_suffix \"\"\n"
                "    Suffix for all keys read or written (to avoid collisions, but scan all DB)\n"
                "  --runtime 0\n"
                "    Run for this number of seconds. 0 means unlimited\n"
                "  --parallelism 4\n"
                "    Run this number of operations in parallel\n"
                "  --get_prob 30000\n"
                "    Fraction of key retrieve operations\n"
                "  --add_prob 20000\n"
                "    Fraction of key addition operations\n"
                "  --update_prob 20000\n"
                "    Fraction of key update operations\n"
                "  --del_prob 5000\n"
                "    Fraction of key delete operations\n"
                "  --list_prob 300\n"
                "    Fraction of listing operations\n"
                "  --min_key_len 10\n"
                "    Minimum key size in bytes\n"
                "  --max_key_len 70\n"
                "    Maximum key size in bytes\n"
                "  --min_value_len 50\n"
                "    Minimum value size in bytes\n"
                "  --max_value_len 300\n"
                "    Maximum value size in bytes\n"
                "  --min_list_count 10\n"
                "    Minimum number of keys read in listing (0 = all keys)\n"
                "  --max_list_count 1000\n"
                "    Maximum number of keys read in listing\n"
                "  --print_stats 1\n"
                "    Print operation statistics every this number of seconds\n"
                "  --json\n"
                "    JSON output\n"
                "  --stop_on_error 0\n"
                "    Stop on first execution error, mismatch, lost key or extra key during listing\n"
                "  --kv_memory_limit 128M\n"
                "    Maximum memory to use for vitastor-kv index cache\n"
                "  --kv_allocate_blocks 4\n"
                "    Number of PG blocks used for new tree block allocation in parallel\n"
                "  --kv_evict_max_misses 10\n"
                "    Eviction algorithm parameter: retry eviction from another random spot\n"
                "    if this number of keys is used currently or was used recently\n"
                "  --kv_evict_attempts_per_level 3\n"
                "    Retry eviction at most this number of times per tree level, starting\n"
                "    with bottom-most levels\n"
                "  --kv_evict_unused_age 1000\n"
                "    Evict only keys unused during this number of last operations\n"
                "  --kv_log_level 1\n"
                "    Log level. 0 = errors, 1 = warnings, 10 = trace operations\n",
                exe_name
            );
            exit(0);
        }
        else if (args[i][0] == '-' && args[i][1] == '-')
        {
            const char *opt = args[i]+2;
            // "--json" and a trailing option without a value are treated as flags
            cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
        }
    }
    return cfg;
}
// Fills test parameters from <cfg>. Options which are absent (or, for some
// of them, zero) keep their defaults. Database parameters (kv_*) are copied
// into <kv_cfg> unchanged.
void kv_test_t::parse_config(json11::Json cfg)
{
    // Overrides <target> when the option is present
    auto set_if_present = [&cfg](const char *name, uint64_t & target)
    {
        const auto & opt = cfg[name];
        if (!opt.is_null())
            target = opt.uint64_value();
    };
    // Overrides <target> only when the option has a positive value
    auto set_if_positive = [&cfg](const char *name, uint64_t & target)
    {
        const uint64_t val = cfg[name].uint64_value();
        if (val > 0)
            target = val;
    };
    inode_id = INODE_WITH_POOL(cfg["pool_id"].uint64_value(), cfg["inode_id"].uint64_value());
    set_if_positive("op_count", op_count);
    key_prefix = cfg["key_prefix"].string_value();
    key_suffix = cfg["key_suffix"].string_value();
    set_if_positive("runtime", runtime_sec);
    set_if_positive("parallelism", parallelism);
    set_if_present("reopen_prob", reopen_prob);
    set_if_present("get_prob", get_prob);
    set_if_present("add_prob", add_prob);
    set_if_present("update_prob", update_prob);
    set_if_present("del_prob", del_prob);
    set_if_present("list_prob", list_prob);
    set_if_present("min_key_len", min_key_len);
    set_if_positive("max_key_len", max_key_len);
    set_if_present("min_value_len", min_value_len);
    set_if_positive("max_value_len", max_value_len);
    set_if_present("min_list_count", min_list_count);
    set_if_present("max_list_count", max_list_count);
    set_if_present("print_stats", print_stats_interval);
    if (!cfg["json"].is_null())
        json_output = true;
    if (!cfg["stop_on_error"].is_null())
        stop_on_error = cfg["stop_on_error"].bool_value();
    // Database parameters are passed through as is
    static const char *const passthrough[] = {
        "kv_memory_limit",
        "kv_allocate_blocks",
        "kv_evict_max_misses",
        "kv_evict_attempts_per_level",
        "kv_evict_unused_age",
    };
    for (const char *name: passthrough)
    {
        if (!cfg[name].is_null())
            kv_cfg[name] = cfg[name];
    }
    if (!cfg["kv_log_level"].is_null())
    {
        // The log level is used both by the tester and by the database
        log_level = cfg["kv_log_level"].uint64_value();
        trace = log_level >= 10;
        kv_cfg["kv_log_level"] = cfg["kv_log_level"];
    }
    total_prob = reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob;
    stat.get.name = "get";
    stat.add.name = "add";
    stat.update.name = "update";
    stat.del.name = "del";
    stat.list.name = "list";
}
// Runs the whole test: creates the client, opens the index, executes
// operations until <finished> is set, prints total statistics and
// destroys the client.
void kv_test_t::run(json11::Json cfg)
{
    srand48(time(NULL));
    parse_config(cfg);
    // Create client
    ringloop = new ring_loop_t(512);
    epmgr = new epoll_manager_t(ringloop);
    cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
    db = new kv_dbw_t(cli);
    // Run the event loop until the client reports readiness
    while (!cli->is_ready())
    {
        ringloop->loop();
        if (cli->is_ready())
            break;
        ringloop->wait();
    }
    // Open the index; loop() does nothing until <reopening> is cleared
    reopening = true;
    auto on_open = [this](int res)
    {
        reopening = false;
        if (res < 0)
        {
            fprintf(stderr, "ERROR: Open index: %d (%s)\n", res, strerror(-res));
            exit(1);
        }
        if (trace)
            printf("Index opened\n");
        ringloop->wakeup();
    };
    db->open(inode_id, kv_cfg, on_open);
    consumer.loop = [this]() { loop(); };
    ringloop->register_consumer(&consumer);
    if (print_stats_interval)
    {
        // Periodic statistics output
        stat_timer_id = epmgr->tfd->set_timer(print_stats_interval*1000, true, [this](int)
        {
            print_stats(prev_stat, prev_stat_time);
        });
    }
    clock_gettime(CLOCK_REALTIME, &start_stat_time);
    prev_stat_time = start_stat_time;
    // Main loop
    while (!finished)
    {
        ringloop->loop();
        if (!finished)
            ringloop->wait();
    }
    if (stat_timer_id >= 0)
        epmgr->tfd->clear_timer(stat_timer_id);
    ringloop->unregister_consumer(&consumer);
    // Print total stats
    print_total_stats();
    // Destroy the client
    delete db;
    db = NULL;
    cli->flush();
    delete cli;
    cli = NULL;
    delete epmgr;
    epmgr = NULL;
    delete ringloop;
    ringloop = NULL;
}
// Alphabet for random strings; only the first 64 characters are ever selected
static const char *base64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@+/";
// Returns a string of <len> random characters taken from base64_chars.
// Uses lrand48(), so the sequence depends on the srand48() seed.
std::string random_str(int len)
{
    std::string result;
    result.resize(len);
    for (char & ch: result)
    {
        ch = base64_chars[lrand48() % 64];
    }
    return result;
}
void kv_test_t::loop()
{
if (reopening)
{
return;
}
if (ops_done >= op_count)
{
finished = true;
}
while (!finished && ops_sent < op_count && in_progress < parallelism)
{
uint64_t dice = (lrand48() % total_prob);
if (dice < reopen_prob)
{
reopening = true;
db->close([this]()
{
if (trace)
printf("Index closed\n");
db->open(inode_id, kv_cfg, [this](int res)
{
reopening = false;
if (res < 0)
{
fprintf(stderr, "ERROR: Reopen index: %d (%s)\n", res, strerror(-res));
finished = true;
return;
}
if (trace)
printf("Index reopened\n");
ringloop->wakeup();
});
});
return;
}
else if (dice < reopen_prob+get_prob)
{
// get existing
auto key = random_str(max_key_len);
auto k_it = values.lower_bound(key);
if (k_it == values.end())
continue;
key = k_it->first;
if (changing_keys.find(key) != changing_keys.end())
continue;
in_progress++;
ops_sent++;
if (trace)
printf("get %s\n", key.c_str());
timespec tv_begin;
clock_gettime(CLOCK_REALTIME, &tv_begin);
db->get(key, [this, key, tv_begin](int res, const std::string & value)
{
add_stat(stat.get, tv_begin);
ops_done++;
in_progress--;
auto it = values.find(key);
if (res != (it == values.end() ? -ENOENT : 0))
{
fprintf(stderr, "ERROR: get %s: %d (%s)\n", key.c_str(), res, strerror(-res));
if (stop_on_error)
exit(1);
}
else if (it != values.end() && value != it->second)
{
fprintf(stderr, "ERROR: get %s: mismatch: %s vs %s\n", key.c_str(), value.c_str(), it->second.c_str());
if (stop_on_error)
exit(1);
}
ringloop->wakeup();
});
}
else if (dice < reopen_prob+get_prob+add_prob+update_prob)
{
bool is_add = false;
std::string key;
if (dice < reopen_prob+get_prob+add_prob)
{
// add
is_add = true;
uint64_t key_len = min_key_len + (max_key_len > min_key_len ? lrand48() % (max_key_len-min_key_len) : 0);
key = key_prefix + random_str(key_len) + key_suffix;
}
else
{
// update
key = random_str(max_key_len);
auto k_it = values.lower_bound(key);
if (k_it == values.end())
continue;
key = k_it->first;
}
if (changing_keys.find(key) != changing_keys.end())
continue;
uint64_t value_len = min_value_len + (max_value_len > min_value_len ? lrand48() % (max_value_len-min_value_len) : 0);
auto value = random_str(value_len);
start_change(key);
ops_sent++;
in_progress++;
if (trace)
printf("set %s = %s\n", key.c_str(), value.c_str());
timespec tv_begin;
clock_gettime(CLOCK_REALTIME, &tv_begin);
db->set(key, value, [this, key, value, tv_begin, is_add](int res)
{
add_stat(is_add ? stat.add : stat.update, tv_begin);
stop_change(key);
ops_done++;
in_progress--;
if (res != 0)
{
fprintf(stderr, "ERROR: set %s = %s: %d (%s)\n", key.c_str(), value.c_str(), res, strerror(-res));
if (stop_on_error)
exit(1);
}
else
{
values[key] = value;
}
ringloop->wakeup();
}, NULL);
}
else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob)
{
// delete
auto key = random_str(max_key_len);
auto k_it = values.lower_bound(key);
if (k_it == values.end())
continue;
key = k_it->first;
if (changing_keys.find(key) != changing_keys.end())
continue;
start_change(key);
ops_sent++;
in_progress++;
if (trace)
printf("del %s\n", key.c_str());
timespec tv_begin;
clock_gettime(CLOCK_REALTIME, &tv_begin);
db->del(key, [this, key, tv_begin](int res)
{
add_stat(stat.del, tv_begin);
stop_change(key);
ops_done++;
in_progress--;
if (res != 0)
{
fprintf(stderr, "ERROR: del %s: %d (%s)\n", key.c_str(), res, strerror(-res));
if (stop_on_error)
exit(1);
}
else
{
values.erase(key);
}
ringloop->wakeup();
}, NULL);
}
else if (dice < reopen_prob+get_prob+add_prob+update_prob+del_prob+list_prob)
{
// list
ops_sent++;
in_progress++;
auto key = random_str(max_key_len);
auto lst = new kv_test_listing_t;
auto k_it = values.lower_bound(key);
lst->count = min_list_count + (max_list_count > min_list_count ? lrand48() % (max_list_count-min_list_count) : 0);
lst->handle = db->list_start(k_it == values.begin() ? key_prefix : key);
lst->next_after = k_it == values.begin() ? key_prefix : key;
lst->inflights = changing_keys;
listings.insert(lst);
if (trace)
printf("list from %s\n", key.c_str());
clock_gettime(CLOCK_REALTIME, &lst->tv_begin);
db->list_next(lst->handle, [this, lst](int res, const std::string & key, const std::string & value)
{
if (log_level >= 11)
printf("list: %s = %s\n", key.c_str(), value.c_str());
if (res >= 0 && key_prefix.size() && (key.size() < key_prefix.size() ||
key.substr(0, key_prefix.size()) != key_prefix))
{
// stop at this key
res = -ENOENT;
}
if (res < 0 || (lst->count > 0 && lst->done >= lst->count))
{
add_stat(stat.list, lst->tv_begin);
if (res == 0)
{
// ok (done >= count)
}
else if (res != -ENOENT)
{
fprintf(stderr, "ERROR: list: %d (%s)\n", res, strerror(-res));
lst->error = true;
}
else
{
auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
while (k_it != values.end())
{
while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
k_it++;
if (k_it != values.end())
{
fprintf(stderr, "ERROR: list: missing key %s\n", (k_it++)->first.c_str());
lst->error = true;
}
}
}
if (lst->error && stop_on_error)
exit(1);
ops_done++;
in_progress--;
db->list_close(lst->handle);
delete lst;
listings.erase(lst);
ringloop->wakeup();
}
else
{
stat.list_keys++;
// Do not check modified keys in listing
// Listing may return their old or new state
if ((!key_suffix.size() || key.size() >= key_suffix.size() &&
key.substr(key.size()-key_suffix.size()) == key_suffix) &&
lst->inflights.find(key) == lst->inflights.end())
{
lst->done++;
auto k_it = lst->next_after == "" ? values.begin() : values.upper_bound(lst->next_after);
while (true)
{
while (k_it != values.end() && lst->inflights.find(k_it->first) != lst->inflights.end())
{
k_it++;
}
if (k_it == values.end() || k_it->first > key)
{
fprintf(stderr, "ERROR: list: extra key %s\n", key.c_str());
lst->error = true;
break;
}
else if (k_it->first < key)
{
fprintf(stderr, "ERROR: list: missing key %s\n", k_it->first.c_str());
lst->error = true;
lst->next_after = k_it->first;
k_it++;
}
else
{
if (k_it->second != value)
{
fprintf(stderr, "ERROR: list: mismatch: %s = %s but should be %s\n",
key.c_str(), value.c_str(), k_it->second.c_str());
lst->error = true;
}
lst->next_after = k_it->first;
break;
}
}
}
db->list_next(lst->handle, NULL);
}
});
}
}
}
// Accounts one completed operation started at <tv_begin> in <stat>.
// Operations with a non-positive measured duration are not counted.
void kv_test_t::add_stat(kv_test_lat_t & stat, timespec tv_begin)
{
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    const int64_t elapsed_us = (now.tv_sec - tv_begin.tv_sec)*1000000 +
        (now.tv_nsec - tv_begin.tv_nsec)/1000;
    if (elapsed_us <= 0)
    {
        return;
    }
    stat.usec += elapsed_us;
    stat.count++;
}
// Prints per-operation rates and average latencies for the period since
// <prev_stat_time>, relative to the counters in <prev_stat>, either as
// a text line or as a JSON object. Afterwards stores the current counters
// and time into <prev_stat> and <prev_stat_time>.
void kv_test_t::print_stats(kv_test_stat_t & prev_stat, timespec & prev_stat_time)
{
    timespec cur_stat_time;
    clock_gettime(CLOCK_REALTIME, &cur_stat_time);
    const int64_t usec = (cur_stat_time.tv_sec - prev_stat_time.tv_sec)*1000000 +
        (cur_stat_time.tv_nsec - prev_stat_time.tv_nsec)/1000;
    if (usec > 0)
    {
        kv_test_lat_t *lats[] = { &stat.get, &stat.add, &stat.update, &stat.del, &stat.list };
        kv_test_lat_t *prev[] = { &prev_stat.get, &prev_stat.add, &prev_stat.update, &prev_stat.del, &prev_stat.list };
        const size_t lat_count = sizeof(lats)/sizeof(lats[0]);
        if (json_output)
        {
            const int64_t runtime = (cur_stat_time.tv_sec - start_stat_time.tv_sec)*1000000 +
                (cur_stat_time.tv_nsec - start_stat_time.tv_nsec)/1000;
            printf("{\"runtime\":%.1f", (double)runtime/1000000.0);
            for (size_t i = 0; i < lat_count; i++)
            {
                // Operation types without new operations are omitted
                if (lats[i]->count <= prev[i]->count)
                    continue;
                const uint64_t ops = lats[i]->count - prev[i]->count;
                const uint64_t spent = lats[i]->usec - prev[i]->usec;
                printf(
                    ",\"%s\":{\"avg\":{\"iops\":%.1f,\"usec\":%lu},\"total\":{\"count\":%lu,\"usec\":%lu}}",
                    lats[i]->name, ops*1000000.0/usec, spent/ops,
                    lats[i]->count, lats[i]->usec
                );
            }
            printf("}\n");
        }
        else
        {
            char buf[128] = { 0 };
            for (size_t i = 0; i < lat_count; i++)
            {
                const uint64_t ops = lats[i]->count - prev[i]->count;
                const uint64_t spent = lats[i]->usec - prev[i]->usec;
                snprintf(buf, sizeof(buf)-1, "%.1f %s/s (%lu us)", ops*1000000.0/usec,
                    lats[i]->name, spent/(ops > 0 ? ops : 1));
                // Pad every column with spaces to the width of (name length + 21)
                printf("%-*s", (int)(strlen(lats[i]->name)+21), buf);
            }
            printf("\n");
        }
    }
    prev_stat = stat;
    prev_stat_time = cur_stat_time;
}
void kv_test_t::print_total_stats()
{
if (!json_output)
printf("Total:\n");
kv_test_stat_t start_stats;
timespec start_stat_time = this->start_stat_time;
print_stats(start_stats, start_stat_time);
}
void kv_test_t::start_change(const std::string & key)
{
changing_keys.insert(key);
for (auto lst: listings)
{
lst->inflights.insert(key);
}
}
void kv_test_t::stop_change(const std::string & key)
{
changing_keys.erase(key);
}
int main(int narg, const char *args[])
{
setvbuf(stdout, NULL, _IONBF, 0);
setvbuf(stderr, NULL, _IONBF, 0);
exe_name = args[0];
kv_test_t *p = new kv_test_t();
p->run(kv_test_t::parse_args(narg, args));
delete p;
return 0;
}

View File

@@ -22,7 +22,7 @@ void osd_messenger_t::init()
{
rdma_context = msgr_rdma_context_t::create(
rdma_device != "" ? rdma_device.c_str() : NULL,
rdma_port_num, rdma_gid_index, rdma_mtu, log_level
rdma_port_num, rdma_gid_index, rdma_mtu, rdma_odp, log_level
);
if (!rdma_context)
{
@@ -167,6 +167,7 @@ void osd_messenger_t::parse_config(const json11::Json & config)
this->rdma_max_msg = config["rdma_max_msg"].uint64_value();
if (!this->rdma_max_msg || this->rdma_max_msg > 128*1024*1024)
this->rdma_max_msg = 129*1024;
this->rdma_odp = config["rdma_odp"].bool_value();
#endif
this->receive_buffer_size = (uint32_t)config["tcp_header_buffer_size"].uint64_value();
if (!this->receive_buffer_size || this->receive_buffer_size > 1024*1024*1024)
@@ -490,7 +491,14 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
fprintf(stderr, "Connected to OSD %lu using RDMA\n", cl->osd_num);
}
cl->peer_state = PEER_RDMA;
tfd->set_fd_handler(cl->peer_fd, false, NULL);
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
{
handle_peer_epoll(peer_fd, epoll_events);
}
});
// Add the initial receive request
try_recv_rdma(cl);
}

View File

@@ -131,6 +131,7 @@ protected:
msgr_rdma_context_t *rdma_context = NULL;
uint64_t rdma_max_sge = 0, rdma_max_send = 0, rdma_max_recv = 0;
uint64_t rdma_max_msg = 0;
bool rdma_odp = false;
#endif
std::vector<int> read_ready_clients;
@@ -197,7 +198,9 @@ protected:
void handle_reply_ready(osd_op_t *op);
#ifdef WITH_RDMA
bool try_send_rdma(osd_client_t *cl);
void try_send_rdma(osd_client_t *cl);
void try_send_rdma_odp(osd_client_t *cl);
void try_send_rdma_nodp(osd_client_t *cl);
bool try_recv_rdma(osd_client_t *cl);
void handle_rdma_events();
#endif

View File

@@ -47,11 +47,29 @@ msgr_rdma_connection_t::~msgr_rdma_connection_t()
if (qp)
ibv_destroy_qp(qp);
if (recv_buffers.size())
{
for (auto b: recv_buffers)
free(b);
{
if (b.mr)
ibv_dereg_mr(b.mr);
free(b.buf);
}
recv_buffers.clear();
}
if (send_out.mr)
{
ibv_dereg_mr(send_out.mr);
send_out.mr = NULL;
}
if (send_out.buf)
{
free(send_out.buf);
send_out.buf = NULL;
}
send_out_size = 0;
}
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level)
msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level)
{
int res;
ibv_device **dev_list = NULL;
@@ -136,21 +154,27 @@ msgr_rdma_context_t *msgr_rdma_context_t::create(const char *ib_devname, uint8_t
fprintf(stderr, "Couldn't query RDMA device for its features\n");
goto cleanup;
}
if (!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
ctx->odp = odp;
if (ctx->odp &&
(!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
!(ctx->attrx.odp_caps.general_caps & IBV_ODP_SUPPORT_IMPLICIT) ||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_SEND) ||
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV))
!(ctx->attrx.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_RECV)))
{
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable or does not support RC send and receive with ODP\n");
goto cleanup;
ctx->odp = false;
if (log_level > 0)
fprintf(stderr, "The RDMA device isn't implicit ODP (On-Demand Paging) capable, disabling it\n");
}
}
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
if (ctx->odp)
{
fprintf(stderr, "Couldn't register RDMA memory region\n");
goto cleanup;
ctx->mr = ibv_reg_mr(ctx->pd, NULL, SIZE_MAX, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
if (!ctx->mr)
{
fprintf(stderr, "Couldn't register RDMA memory region\n");
goto cleanup;
}
}
ctx->channel = ibv_create_comp_channel(ctx->context);
@@ -365,12 +389,34 @@ static void try_send_rdma_wr(osd_client_t *cl, ibv_sge *sge, int op_sge)
cl->rdma_conn->cur_send++;
}
bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
// Copy up to <dst_len> bytes of the client's pending outgoing data into <dst>.
// Data is taken from cl->send_list, starting at the connection's current position
// (rc->send_pos is the iovec index, rc->send_buf_pos is the offset inside it),
// and that position is advanced past the copied bytes.
// Returns the number of bytes copied, which is 0 when <dst_len> is not positive
// or when there is nothing left in the send list.
static int try_send_rdma_copy(osd_client_t *cl, uint8_t *dst, int dst_len)
{
    auto rc = cl->rdma_conn;
    int total_dst_len = dst_len;
    while (dst_len > 0 && (size_t)rc->send_pos < cl->send_list.size())
    {
        iovec & iov = cl->send_list[rc->send_pos];
        // Bytes of the current iovec which are not copied yet
        size_t iov_left = iov.iov_len - rc->send_buf_pos;
        uint32_t len = (uint32_t)(iov_left < (size_t)dst_len ? iov_left : (size_t)dst_len);
        // iov_base is a void*: cast it to a byte pointer before adding the offset,
        // because arithmetic on void* is a non-standard compiler extension
        memcpy(dst, (uint8_t*)iov.iov_base + rc->send_buf_pos, len);
        dst += len;
        dst_len -= len;
        rc->send_buf_pos += len;
        if ((size_t)rc->send_buf_pos >= iov.iov_len)
        {
            // The current iovec is fully consumed, move on to the next one
            rc->send_pos++;
            rc->send_buf_pos = 0;
        }
    }
    return total_dst_len-dst_len;
}
void osd_messenger_t::try_send_rdma_odp(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (!cl->send_list.size() || rc->cur_send >= rc->max_send)
{
return true;
return;
}
uint64_t op_size = 0, op_sge = 0;
ibv_sge sge[rc->max_sge];
@@ -408,15 +454,70 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
rc->send_sizes.push_back(op_size);
try_send_rdma_wr(cl, sge, op_sge);
}
return true;
}
static void try_recv_rdma_wr(osd_client_t *cl, void *buf)
// Send pending client data over RDMA through a per-connection send buffer:
// data from the client's send list is first copied into rc->send_out.buf
// (by try_send_rdma_copy) and then posted from that buffer, one scatter-gather
// entry per work request.
// The buffer is allocated on the first call. The process exits if the memory
// region for it can't be registered.
void osd_messenger_t::try_send_rdma_nodp(osd_client_t *cl)
{
auto rc = cl->rdma_conn;
if (!rc->send_out_size)
{
// Allocate send ring buffer, if not yet
rc->send_out_size = rc->max_msg*rdma_max_send;
rc->send_out.buf = malloc_or_die(rc->send_out_size);
if (!rdma_context->odp)
{
// Without ODP the buffer gets its own memory region (access flags are 0)
rc->send_out.mr = ibv_reg_mr(rdma_context->pd, rc->send_out.buf, rc->send_out_size, 0);
if (!rc->send_out.mr)
{
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
exit(1);
}
}
}
// Copy data into the buffer and send it
uint8_t *dst = NULL;
int dst_len = 0;
int copied = 1;
// Stop when the buffer is marked full, when the last copy returned no data,
// or when the number of posted sends reaches max_send
while (!rc->send_out_full && copied > 0 && rc->cur_send < rc->max_send)
{
dst = (uint8_t*)rc->send_out.buf + rc->send_out_pos;
// NOTE(review): send_out_pos < send_out_size looks always true here (see the
// assert below), so the send_done_pos-send_out_pos branch seems unreachable.
// Presumably the intended check was against send_done_pos - confirm
dst_len = (rc->send_out_pos < rc->send_out_size ? rc->send_out_size-rc->send_out_pos : rc->send_done_pos-rc->send_out_pos);
// A single send never exceeds max_msg bytes
if (dst_len > rc->max_msg)
dst_len = rc->max_msg;
copied = try_send_rdma_copy(cl, dst, dst_len);
if (copied > 0)
{
rc->send_out_pos += copied;
// Wrap the write position around at the end of the buffer
if (rc->send_out_pos == rc->send_out_size)
rc->send_out_pos = 0;
assert(rc->send_out_pos < rc->send_out_size);
// The buffer is marked full as soon as the write position is at or after send_done_pos
if (rc->send_out_pos >= rc->send_done_pos)
rc->send_out_full = true;
ibv_sge sge = {
.addr = (uintptr_t)dst,
.length = (uint32_t)copied,
// Use the context-wide memory region with ODP, or the buffer's own region without it
.lkey = rdma_context->odp ? rdma_context->mr->lkey : rc->send_out.mr->lkey,
};
try_send_rdma_wr(cl, &sge, 1);
// Remember the size of the posted send
rc->send_sizes.push_back(copied);
}
}
}
// Send pending data to the client over RDMA, choosing the implementation
// by the ODP flag of the RDMA context.
void osd_messenger_t::try_send_rdma(osd_client_t *cl)
{
    if (!rdma_context->odp)
    {
        // ODP is off
        try_send_rdma_nodp(cl);
        return;
    }
    try_send_rdma_odp(cl);
}
static void try_recv_rdma_wr(osd_client_t *cl, msgr_rdma_buf_t b)
{
ibv_sge sge = {
.addr = (uintptr_t)buf,
.addr = (uintptr_t)b.buf,
.length = (uint32_t)cl->rdma_conn->max_msg,
.lkey = cl->rdma_conn->ctx->mr->lkey,
.lkey = cl->rdma_conn->ctx->odp ? cl->rdma_conn->ctx->mr->lkey : b.mr->lkey,
};
ibv_recv_wr *bad_wr = NULL;
ibv_recv_wr wr = {
@@ -438,9 +539,19 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
auto rc = cl->rdma_conn;
while (rc->cur_recv < rc->max_recv)
{
void *buf = malloc_or_die(rc->max_msg);
rc->recv_buffers.push_back(buf);
try_recv_rdma_wr(cl, buf);
msgr_rdma_buf_t b;
b.buf = malloc_or_die(rc->max_msg);
if (!rdma_context->odp)
{
b.mr = ibv_reg_mr(rdma_context->pd, b.buf, rc->max_msg, IBV_ACCESS_LOCAL_WRITE);
if (!b.mr)
{
fprintf(stderr, "Failed to register RDMA memory region: %s\n", strerror(errno));
exit(1);
}
}
rc->recv_buffers.push_back(b);
try_recv_rdma_wr(cl, b);
}
return true;
}
@@ -492,7 +603,7 @@ void osd_messenger_t::handle_rdma_events()
if (!is_send)
{
rc->cur_recv--;
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf], wc[i].byte_len))
if (!handle_read_buffer(cl, rc->recv_buffers[rc->next_recv_buf].buf, wc[i].byte_len))
{
// handle_read_buffer may stop the client
continue;
@@ -505,6 +616,14 @@ void osd_messenger_t::handle_rdma_events()
rc->cur_send--;
uint64_t sent_size = rc->send_sizes.at(0);
rc->send_sizes.erase(rc->send_sizes.begin(), rc->send_sizes.begin()+1);
if (!rdma_context->odp)
{
rc->send_done_pos += sent_size;
rc->send_out_full = false;
if (rc->send_done_pos == rc->send_out_size)
rc->send_done_pos = 0;
assert(rc->send_done_pos < rc->send_out_size);
}
int send_pos = 0, send_buf_pos = 0;
while (sent_size > 0)
{

View File

@@ -23,6 +23,7 @@ struct msgr_rdma_context_t
ibv_device *dev = NULL;
ibv_device_attr_ex attrx;
ibv_pd *pd = NULL;
bool odp = false;
ibv_mr *mr = NULL;
ibv_comp_channel *channel = NULL;
ibv_cq *cq = NULL;
@@ -35,10 +36,16 @@ struct msgr_rdma_context_t
int max_cqe = 0;
int used_max_cqe = 0;
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, int log_level);
static msgr_rdma_context_t *create(const char *ib_devname, uint8_t ib_port, uint8_t gid_index, uint32_t mtu, bool odp, int log_level);
~msgr_rdma_context_t();
};
// A memory buffer used for RDMA transfers, paired with its memory region
struct msgr_rdma_buf_t
{
    // Start of the buffer memory
    void *buf = nullptr;
    // Memory region registered for <buf>; stays null when none is registered for it
    ibv_mr *mr = nullptr;
};
struct msgr_rdma_connection_t
{
msgr_rdma_context_t *ctx = NULL;
@@ -50,8 +57,11 @@ struct msgr_rdma_connection_t
int send_pos = 0, send_buf_pos = 0;
int next_recv_buf = 0;
std::vector<void*> recv_buffers;
std::vector<msgr_rdma_buf_t> recv_buffers;
std::vector<uint64_t> send_sizes;
msgr_rdma_buf_t send_out;
int send_out_pos = 0, send_done_pos = 0, send_out_size = 0;
bool send_out_full = false;
~msgr_rdma_connection_t();
static msgr_rdma_connection_t *create(msgr_rdma_context_t *ctx, uint32_t max_send, uint32_t max_recv, uint32_t max_sge, uint32_t max_msg);

View File

@@ -3,6 +3,7 @@
#define _XOPEN_SOURCE
#include <limits.h>
#include <sys/epoll.h>
#include "messenger.h"
@@ -119,9 +120,9 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
try_send(cl);
}
}
else if (cl->write_msg.msg_iovlen > 0 || !try_send(cl))
else
{
if (cl->write_state == 0)
if ((cl->write_msg.msg_iovlen > 0 || !try_send(cl)) && (cl->write_state == 0))
{
cl->write_state = CL_WRITE_READY;
write_ready_clients.push_back(cur_op->peer_fd);
@@ -283,7 +284,14 @@ void osd_messenger_t::handle_send(int result, osd_client_t *cl)
fprintf(stderr, "Successfully connected with client %d using RDMA\n", cl->peer_fd);
}
cl->peer_state = PEER_RDMA;
tfd->set_fd_handler(cl->peer_fd, false, NULL);
tfd->set_fd_handler(cl->peer_fd, false, [this](int peer_fd, int epoll_events)
{
// Do not miss the disconnection!
if (epoll_events & EPOLLRDHUP)
{
handle_peer_epoll(peer_fd, epoll_events);
}
});
// Add the initial receive request
try_recv_rdma(cl);
}

View File

@@ -225,7 +225,7 @@ public:
cfg = obj;
}
// Create client
ringloop = new ring_loop_t(512);
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
if (!inode)

View File

@@ -124,7 +124,7 @@ void nfs_proxy_t::run(json11::Json cfg)
cfg = obj;
}
// Create client
ringloop = new ring_loop_t(512);
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
cmd = new cli_tool_t();

View File

@@ -541,11 +541,15 @@ void osd_t::print_slow()
}
else if (op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
{
for (uint64_t i = 0; i < op->req.sec_stab.len; i += sizeof(obj_ver_id))
for (uint64_t i = 0; i < op->req.sec_stab.len && i < sizeof(obj_ver_id)*12; i += sizeof(obj_ver_id))
{
obj_ver_id *ov = (obj_ver_id*)((uint8_t*)op->buf + i);
bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
}
if (op->req.sec_stab.len > sizeof(obj_ver_id)*12)
{
bufprintf(", ... (%lu items)", op->req.sec_stab.len/sizeof(obj_ver_id));
}
}
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
{

View File

@@ -19,6 +19,14 @@ static void handle_sigint(int sig)
exit(0);
}
// Usage text of the OSD binary. VERSION is a string macro which is
// concatenated into the first line at compile time.
static const char* help_text =
"Vitastor OSD (block object storage daemon) " VERSION "\n"
"(c) Vitaliy Filippov, 2019+ (VNPL-1.1)\n"
"\n"
"OSDs are usually started by vitastor-disk.\n"
"Manual usage: vitastor-osd [--option value] ...\n"
;
int main(int narg, char *args[])
{
setvbuf(stdout, NULL, _IONBF, 0);
@@ -37,10 +45,20 @@ int main(int narg, char *args[])
char *opt = args[i]+2;
config[std::string(opt)] = std::string(args[++i]);
}
else if (!strcmp(args[i], "--help"))
{
printf("%s", help_text);
return 0;
}
}
if (!config.size())
{
printf("%s", help_text);
return 1;
}
signal(SIGINT, handle_sigint);
signal(SIGTERM, handle_sigint);
ring_loop_t *ringloop = new ring_loop_t(512);
ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
osd = new osd_t(config, ringloop);
while (1)
{

View File

@@ -239,8 +239,9 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
{
int edd = 0;
int erased[pg_size];
// We should distinguish stripes which are not read at all (2) from missing stripes (1)
for (int i = 0; i < pg_size; i++)
erased[i] = (stripes[i].read_end == 0 || stripes[i].missing ? 1 : 0);
erased[i] = (stripes[i].read_end == 0 ? 2 : (stripes[i].missing ? 1 : 0));
for (int i = 0; i < pg_minsize; i++)
if (stripes[i].read_end != 0 && stripes[i].missing)
edd++;
@@ -253,7 +254,7 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
#ifdef WITH_ISAL
int smrow = 0;
uint8_t *submatrix = (uint8_t*)malloc_or_die(pg_minsize*pg_minsize*2);
for (int i = 0; i < pg_size; i++)
for (int i = 0; i < pg_size && smrow < pg_minsize; i++)
{
if (!erased[i])
{
@@ -279,7 +280,7 @@ static void* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size
smrow = 0;
for (int i = 0; i < pg_minsize; i++)
{
if (erased[i])
if (erased[i] == 1)
{
memcpy(submatrix + pg_minsize*smrow, submatrix + (pg_minsize+i)*pg_minsize, pg_minsize);
smrow++;

View File

@@ -29,6 +29,7 @@ void test15(bool second);
void test16();
void test_recover_22_d2();
void test_ec43_error_bruteforce();
void test_recover_53_d5();
int main(int narg, char *args[])
{
@@ -67,6 +68,8 @@ int main(int narg, char *args[])
test_recover_22_d2();
// Error bruteforce
test_ec43_error_bruteforce();
// Test 19
test_recover_53_d5();
// End
printf("all ok\n");
return 0;
@@ -1112,7 +1115,7 @@ void test_recover_22_d2()
/***
EC 4+2 error location bruteforce
18. EC 4+3 error location bruteforce
***/
@@ -1178,3 +1181,66 @@ void test_ec43_error_bruteforce()
free(write_buf);
use_ec(7, 4, false);
}
/***
19. EC 5+3 recover 5th data block but not 4th
***/
// EC 5+3 (8 chunks, 5 of them data): read the 5th data chunk (stripes[4]) while
// positions 3 and 4 of the OSD set are 0, and check that it is reconstructed from
// the chunks which are actually read, with the 4th data chunk left unread.
void test_recover_53_d5()
{
    // Bitmap size in bytes for one 128K chunk (one bit per 4K)
    const int bmp = 128*1024 / 4096 / 8;
    use_ec(8, 5, true);
    // presumably 0 means "no OSD" for that position - verify against extend_missing_stripes()
    osd_num_t osd_set[8] = { 1, 2, 3, 0, 0, 6, 7, 8 };
    osd_rmw_stripe_t stripes[8] = {};
    unsigned bitmaps[8] = { 0 };
    // Read 512+128K: the request falls entirely into stripes[4]
    split_stripes(5, 128*1024, 512*1024, 128*1024, stripes);
    assert(stripes[0].req_start == 0 && stripes[0].req_end == 0);
    assert(stripes[1].req_start == 0 && stripes[1].req_end == 0);
    assert(stripes[2].req_start == 0 && stripes[2].req_end == 0);
    assert(stripes[3].req_start == 0 && stripes[3].req_end == 0);
    assert(stripes[4].req_start == 0 && stripes[4].req_end == 128*1024);
    uint8_t *data_buf = (uint8_t*)malloc_or_die(128*1024*8);
    for (int i = 0; i < 8; i++)
    {
        stripes[i].read_start = stripes[i].req_start;
        stripes[i].read_end = stripes[i].req_end;
        stripes[i].read_buf = data_buf + i*128*1024;
        stripes[i].bmp_buf = bitmaps + i;
    }
    // Read using parity.
    // The call is made outside of assert() so that it still happens when asserts
    // are compiled out with NDEBUG: the reconstruction below depends on it.
    auto extend_res = extend_missing_stripes(stripes, osd_set, 5, 8);
    assert(extend_res == 0);
    (void)extend_res;
    assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024);
    assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024);
    assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024);
    // The 4th data chunk must not be read at all
    assert(stripes[3].read_start == 0 && stripes[3].read_end == 0);
    assert(stripes[4].read_start == 0 && stripes[4].read_end == 128*1024);
    assert(stripes[5].read_start == 0 && stripes[5].read_end == 128*1024);
    assert(stripes[6].read_start == 0 && stripes[6].read_end == 128*1024);
    assert(stripes[7].read_start == 0 && stripes[7].read_end == 0);
    bitmaps[0] = 0xffffffff;
    bitmaps[1] = 0xffffffff;
    bitmaps[2] = 0xffffffff;
    bitmaps[3] = 0;
    bitmaps[4] = 0;
    bitmaps[5] = 0xffffffff;
    bitmaps[6] = 0x64646464;
    bitmaps[7] = 0;
    set_pattern(stripes[0].read_buf, 128*1024, 0x70a549add9a2280a);
    set_pattern(stripes[1].read_buf, 128*1024, 0xa70a549add9a2280);
    set_pattern(stripes[2].read_buf, 128*1024, 0x0a70a549add9a228);
    set_pattern(stripes[3].read_buf, 128*1024, 0); // 0x80a70a549add9a22
    set_pattern(stripes[4].read_buf, 128*1024, 0); // 0x280a70a549add9a2
    set_pattern(stripes[5].read_buf, 128*1024, 0x7572c28f7a91eb22); // xor
    set_pattern(stripes[6].read_buf, 128*1024, 0xb4542b32a560fe26); // 2nd EC chunk
    set_pattern(stripes[7].read_buf, 128*1024, 0);
    // Reconstruct
    reconstruct_stripes_ec(stripes, 8, 5, bmp);
    // The 5th data chunk and its bitmap must be restored
    check_pattern(stripes[4].read_buf, 128*1024, 0x280a70a549add9a2);
    assert(bitmaps[4] == 0xFFFFFFFF);
    free(data_buf);
    // Done
    use_ec(8, 5, false);
}

View File

@@ -17,7 +17,7 @@ ring_loop_t::ring_loop_t(int qd)
{
throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret));
}
free_ring_data_ptr = *ring.cq.kring_entries;
free_ring_data_ptr = *ring.sq.kring_entries;
ring_datas = (struct ring_data_t*)calloc(free_ring_data_ptr, sizeof(ring_data_t));
free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr);
if (!ring_datas || !free_ring_data)

View File

@@ -15,6 +15,8 @@
#include <functional>
#include <vector>
#define RINGLOOP_DEFAULT_SIZE 1024
static inline void my_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, const void *addr, unsigned len, off_t offset)
{
// Prepare a read/write operation without clearing user_data
@@ -139,11 +141,9 @@ public:
if (free_ring_data_ptr == 0)
return NULL;
struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
if (sqe)
{
*sqe = { 0 };
io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
}
assert(sqe);
*sqe = { 0 };
io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
return sqe;
}
inline void set_immediate(const std::function<void()> cb)

View File

@@ -30,7 +30,7 @@ void stub_exec_op(osd_messenger_t *msgr, osd_op_t *op);
int main(int narg, char *args[])
{
ring_consumer_t looper;
ring_loop_t *ringloop = new ring_loop_t(512);
ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
osd_messenger_t *msgr = new osd_messenger_t();
msgr->osd_num = 1351;

View File

@@ -11,7 +11,7 @@ int main(int narg, char *args[])
config["meta_device"] = "./test_meta.bin";
config["journal_device"] = "./test_journal.bin";
config["data_device"] = "./test_data.bin";
ring_loop_t *ringloop = new ring_loop_t(512);
ring_loop_t *ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
blockstore_t *bs = new blockstore_t(config, ringloop, epmgr->tfd);

View File

@@ -68,7 +68,7 @@ int main(int narg, char *args[])
| cfg["inode_id"].uint64_value();
uint64_t base_ver = 0;
// Create client
auto ringloop = new ring_loop_t(512);
auto ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
auto epmgr = new epoll_manager_t(ringloop);
auto cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
cli->on_ready([&]()

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 1.1.0
Version: 1.2.0
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -114,7 +114,7 @@ vitastor_c *vitastor_c_create_qemu_uring(QEMUSetFDHandler *aio_set_fd_handler, v
ring_loop_t *ringloop = NULL;
try
{
ringloop = new ring_loop_t(512);
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
}
catch (std::exception & e)
{
@@ -136,7 +136,7 @@ vitastor_c *vitastor_c_create_uring(const char *config_path, const char *etcd_ho
ring_loop_t *ringloop = NULL;
try
{
ringloop = new ring_loop_t(512);
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
}
catch (std::exception & e)
{
@@ -167,7 +167,7 @@ vitastor_c *vitastor_c_create_uring_json(const char **options, int options_len)
ring_loop_t *ringloop = NULL;
try
{
ringloop = new ring_loop_t(512);
ringloop = new ring_loop_t(RINGLOOP_DEFAULT_SIZE);
}
catch (std::exception & e)
{

View File

@@ -29,7 +29,7 @@ start_osd_on()
{
local i=$1
local dev=$2
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL \
build/src/vitastor-osd --osd_num $i --bind_address $ETCD_IP $NO_SAME $OSD_ARGS --etcd_address $ETCD_URL \
$(build/src/vitastor-disk simple-offsets --format options $OFFSET_ARGS $dev $OFFSET_ARGS 2>/dev/null) \
>>./testdata/osd$i.log 2>&1 &
eval OSD${i}_PID=$!