Compare commits

...

190 Commits

Author SHA1 Message Date
88cb2df946 WIP dump/load heap
Some checks failed
Test / test_write (push) Successful in 35s
Test / test_write_no_same (push) Successful in 10s
Test / test_write_xor (push) Successful in 38s
Test / test_write_iothreads (push) Successful in 39s
Test / test_heal_pg_size_2 (push) Successful in 2m19s
Test / test_heal_local_read (push) Successful in 2m21s
Test / test_heal_ec (push) Successful in 2m20s
Test / test_reweight_half (push) Successful in 11s
Test / test_heal_antietcd (push) Successful in 2m20s
Test / test_heal_csum_32k_dmj (push) Failing after 2m34s
Test / test_heal_csum_4k_dmj (push) Successful in 2m31s
Test / test_heal_csum_32k (push) Failing after 2m48s
Test / test_resize (push) Failing after 13s
Test / test_resize_auto (push) Successful in 10s
Test / test_snapshot_pool2 (push) Successful in 18s
Test / test_osd_tags (push) Successful in 10s
Test / test_enospc (push) Successful in 13s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 11s
Test / test_enospc_imm_xor (push) Successful in 14s
Test / test_scrub (push) Successful in 16s
Test / test_heal_csum_4k (push) Successful in 2m26s
Test / test_heal_csum_4k_dj (push) Successful in 2m33s
Test / test_scrub_zero_osd_2 (push) Successful in 17s
Test / test_scrub_xor (push) Successful in 14s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 18s
Test / test_nfs (push) Successful in 14s
Test / test_scrub_ec (push) Successful in 17s
Test / test_heal_csum_32k_dj (push) Failing after 10m13s
2025-09-07 12:12:53 +03:00
7be9ed93d5 Add missing list_stable_limit support 2025-09-06 11:10:32 +03:00
b019988e2b Fix checksum validation in !inmemory_journal mode 2025-09-06 11:10:32 +03:00
0db5400cb7 Fix checksum padding during read 2025-09-06 11:10:32 +03:00
5a13db107b Remove unused bs->buffer_area + buffer calc 2025-09-06 11:10:32 +03:00
1288cfb0af Add missing memset zero_object to 0 2025-09-06 11:10:32 +03:00
2eba37db8a Mark in-memory data reads as SKIP_CSUM 2025-09-06 11:10:32 +03:00
f9e0b0db27 Add missing calc_crc32c after updating block checksums 2025-09-06 11:10:32 +03:00
f6acd5e79c Fix checksum padding during flush 2025-09-06 11:10:32 +03:00
6fbeb5c668 Fix free_read_buffers 2025-09-06 11:10:32 +03:00
8e55869b71 Fix assert(region_marker) at the end of the block in find_block_run() 2025-09-06 11:10:32 +03:00
7bb9004435 Fix assert(is_buffer_area_free) with size=0 2025-09-06 11:10:32 +03:00
d9d484e8bb Add a copy of wyhash 2025-09-06 11:10:32 +03:00
0c0ab64155 Fix test build with isa-l 2025-09-06 11:10:32 +03:00
84fca8abca Remove assert !region_marker & FREE 2025-09-06 11:10:32 +03:00
eaf0fe66a1 Use wyhash 2025-09-06 11:10:32 +03:00
5ca25c0e7d Change emhashes 2025-09-06 11:10:32 +03:00
1b7f2eac8e unordered_map mvcc 2025-09-06 11:10:32 +03:00
ddd16e8613 Fix bad resharding due to the lack of iteration order in a hashmap 2025-09-06 11:10:32 +03:00
046a9f7a67 Use emhash::try_get 2025-09-06 11:10:32 +03:00
bd55b24827 Use emhash hashmap (2x faster) 2025-09-06 11:10:32 +03:00
09d69f7968 Use unordered_maps for object-block index 2025-09-06 11:10:32 +03:00
cc4d170ef0 B-tree is slower... 2025-09-06 11:10:32 +03:00
335b73a3d5 Unordered_map for pool settings 2025-09-06 11:10:32 +03:00
fc6c5a853e Fix read fio bench 2025-09-06 11:10:32 +03:00
445393dfc4 Fix vitastor-disk prepare and param validation 2025-09-06 11:10:32 +03:00
021762193b Fix skipping of corrupted objects, fix use_buffer_area with zero size 2025-09-06 11:10:32 +03:00
a6dee28f4e Support heap format in simple-offsets 2025-09-06 11:10:32 +03:00
6e0ad777e3 Return all unstable versions in listing 2025-09-06 11:10:32 +03:00
13d069cf5f Actually fsync after stabilize 2025-09-06 11:10:32 +03:00
504246b1db Move test & build_test to top-level cmakelists 2025-09-06 11:10:32 +03:00
fd647021eb Add missing request_trim 2025-09-06 11:10:32 +03:00
049a7b260e Use multilist_index_t instead of multiple bitmap allocators 2025-09-06 11:10:32 +03:00
17d0da1a74 Implement another multilist-style allocator for metadata blocks 2025-09-06 11:10:32 +03:00
fe277127bb Fix a bug with unstable_big over unstable_small 2025-09-06 11:10:32 +03:00
79f3147d0c Limit the number of unstable versions per object 2025-09-06 11:10:32 +03:00
5126e67c3f Integrate moving objects 2025-09-06 11:10:32 +03:00
90b1bdee43 Support moving objects between blocks 2025-09-06 11:10:32 +03:00
5d501d0d43 Fix zero-length writes 2025-09-06 11:10:32 +03:00
157a62628a Fix op_stable slowdowns 2025-09-06 11:10:32 +03:00
d4da42bb05 Fix a bs_read bug 2025-09-06 11:10:32 +03:00
2883df733a Fix test dependencies 2025-09-06 11:10:32 +03:00
77e3870f8f Add read_blocks() API 2025-09-06 11:10:32 +03:00
b82be8136a Fix block checksum calculation in write_journal for the old blockstore version 2025-09-06 11:10:32 +03:00
6c4f407575 Remove block_order parameter 2025-09-06 11:10:32 +03:00
c0489d237b Extract (flags & BS_HEAP_TYPE) into a function 2025-09-06 11:10:32 +03:00
7c250f165c Make dump-journal --format data default 2025-09-06 11:10:32 +03:00
b46dbbbefb Add fio options 2025-09-06 11:10:32 +03:00
1810fbe622 Fix loading for out-of-order lsns 2025-09-06 11:10:32 +03:00
4d6c9bc294 Add include for older gcc 2025-09-06 11:10:32 +03:00
e7df9683f1 Fix vitastor-disk build (with old metadata and journal formats) 2025-09-06 11:10:32 +03:00
a5dd943fcc Collapse intent_writes on other write types too 2025-09-06 11:10:32 +03:00
c9b527f2e2 Fix buffer overflow in test_heap 2025-09-06 11:10:32 +03:00
1551a49454 Add a test for parallel reads with block checksums 2025-09-06 11:10:32 +03:00
d46feccd03 Remove cancel_all_writes
Not needed because a) parallel writes to the same object are forbidden
b) subsequent writes don't depend on previous ones anyway.
2025-09-06 11:10:32 +03:00
959e2e2df9 Add a test for "perfect_csum_update" mode 2025-09-06 11:10:32 +03:00
b7bc3d652d Add 2 tests for intent writes 2025-09-06 11:10:32 +03:00
fc2762d60a Add a test with fsync 2025-09-06 11:10:32 +03:00
85b3c691e9 Implement buffered disk_mock_t mode, extract ringloop_mock.cpp 2025-09-06 11:10:32 +03:00
0ac4645a9e unaligned_intent does not need special handling anymore 2025-09-06 11:10:32 +03:00
87922bc660 Add an option for global coverage 2025-09-06 11:10:32 +03:00
77b97b0613 Fix some blockstore bugs discovered by the mocked test! 2025-09-06 11:10:32 +03:00
0713120315 Add a basic mocked blockstore test 2025-09-06 11:10:32 +03:00
bce082a444 Add mocks for blockstore integration tests: timerfd, ring_loop_mock_t and disk_mock_t 2025-09-06 11:10:32 +03:00
383305da88 Use only space_left, not sqes_left 2025-09-06 11:10:32 +03:00
639c809827 Test calc_checksums 2025-09-06 11:10:31 +03:00
71d78c1409 Check 2/3 blocks in test_recheck 2025-09-06 11:10:31 +03:00
3e2e2f9846 Fix intent writes with padded checksums 2025-09-06 11:10:31 +03:00
cb48c70083 Fix reads from intent writes 2025-09-06 11:10:31 +03:00
989571bb74 Disable punching block checksums and allow to enable it with a parameter 2025-09-06 11:10:31 +03:00
0e1d069ad7 Process big_writes as intents to avoid fsyncs 2025-09-06 11:10:31 +03:00
a36b4e5933 Fix collapsing intent-over-intent checksums 2025-09-06 11:10:31 +03:00
c809d86846 Allow 1 intent_write over big_write in fsync mode 2025-09-06 11:10:31 +03:00
0bd5eb1f20 Block lists by previous writes 2025-09-06 11:10:31 +03:00
2c8ddc5431 Allow multiple writes with the same version 2025-09-06 11:10:31 +03:00
e1f3829bb1 Add tests for intent writes 2025-09-06 11:10:31 +03:00
a980d65f78 Call finish_load after async recheck 2025-09-06 11:10:31 +03:00
bcc93e548e More tests for incorrect data cases 2025-09-06 11:10:31 +03:00
68e9f71723 Do not block writes on previous writes 2025-09-06 11:10:31 +03:00
1ae4b9a799 Add atomic_write_size parameter 2025-09-06 11:10:31 +03:00
79d4b57f0e Use ui32 for block sizes 2025-09-06 11:10:31 +03:00
cc73e42488 Remove extra unneeded read_entry-s 2025-09-06 11:10:31 +03:00
6cb0fdb571 Do not recheck data location on intent-write 2025-09-06 11:10:31 +03:00
8e1ea15f58 Remove FIXMEs 2025-09-06 11:10:31 +03:00
d7f1b3a2dd Fsync after stabilizing 2025-09-06 11:10:31 +03:00
0049a6ed4a Fsync & update metadata when block checksums are enabled 2025-09-06 11:10:31 +03:00
05ed9a27a4 Return new_lsn from erase and rollback 2025-09-06 11:10:31 +03:00
300a149513 Allow to cancel compaction for unfinished writes 2025-09-06 11:10:31 +03:00
0223016ce6 Fix space allocation & compaction on start 2025-09-06 11:10:31 +03:00
9368cc7d9b Use inflight_lsn iterators 2025-09-06 11:10:31 +03:00
74743ccd3f Free space on overwrites correctly 2025-09-06 11:10:31 +03:00
ec1c7e6be4 Use fsynced_lsn in flusher 2025-09-06 11:10:31 +03:00
bc643b24cf Correctly track fsynced_lsn when fsyncs are enabled 2025-09-06 11:10:31 +03:00
2a66cc3f11 Extract multilist_alloc_t 2025-09-06 11:10:31 +03:00
bd74ce4b70 Check for overlaps during blockstore loading 2025-09-06 11:10:31 +03:00
7c07303d12 Fix multilist_alloc_t bug, move verify and print to lib 2025-09-06 11:10:31 +03:00
ac6bacc46e Fix pending_ops 2025-09-06 11:10:31 +03:00
f7fbfb8174 Fix repeating cur_oid 2025-09-06 11:10:31 +03:00
ebf85a7515 Mark overwritten heap_writes as immediately compacted 2025-09-06 11:10:31 +03:00
36413d89c3 Move "ack write" debug message 2025-09-06 11:10:31 +03:00
fb9505d5db Fsync data on trim_lsn, not when writing compacted data 2025-09-06 11:10:31 +03:00
77e1badfad Batch big_write data fsyncs 2025-09-06 11:10:31 +03:00
b83359fdbc Use the same "inflight" queue to track compaction 2025-09-06 11:10:31 +03:00
2f4f46d7eb Use new LSNs on stabilize 2025-09-06 11:10:31 +03:00
65a4aecb8c Assign new LSN on stabilize 2025-09-06 11:10:31 +03:00
d08a2fb8ee WIP Only save MVCC copy when overwriting an object 2025-09-06 11:10:31 +03:00
5739d52600 Do not use wr_offset 2025-09-06 11:10:31 +03:00
d44dba43b3 Prevent compaction of incomplete object writes 2025-09-06 11:10:31 +03:00
8984556689 Experimental INTENT_WRITE write mode with WA=2 instead of 3 2025-09-06 11:10:31 +03:00
457b47d311 Add test_compact_block 2025-09-06 11:10:31 +03:00
af62f6374b Remove alloc_buffer_area 2025-09-06 11:10:31 +03:00
1c7f3710be Two more unordered_maps 2025-09-06 11:10:31 +03:00
a67a09da7c Remove sync_to_repeat map and use simpler repeating 2025-09-06 11:10:31 +03:00
4ed29c19c4 Use a sequence of bitmap_allocs for metadata instead of std::sets... 2025-09-06 11:10:31 +03:00
dd3f64cf62 It seems tcmalloc is actually slower, disable it 2025-09-06 11:10:31 +03:00
e1f4fcb76a Use single add_used_space call instead of unmark+mark allocated_block 2025-09-06 11:10:31 +03:00
2d24b4d70d Remove compact_queue_lsn map 2025-09-06 11:10:31 +03:00
d9e2705db7 Implement a really crazy "multi-linked-list" allocator for buffered data 2025-09-06 11:10:31 +03:00
5935762730 Use linked list heap in blockstore code
WIP, still slower than the old version :-E
2025-09-06 11:10:31 +03:00
a945a46d56 Use linked lists in heap to avoid excessive memory copying 2025-09-06 11:10:31 +03:00
82ac6416d3 Integrate "heap" metadata storage into blockstore 2025-09-06 11:10:31 +03:00
df4661230e "Heap" metadata storage scheme 2025-09-06 11:10:31 +03:00
72a29b7031 Remove no_inode_stats from blockstore interface 2025-09-06 11:10:31 +03:00
2d42f29385 Add vitastor-disk prepare --dry-run option
All checks were successful
Test / test_write (push) Successful in 37s
Test / test_write_xor (push) Successful in 39s
Test / test_write_no_same (push) Successful in 9s
Test / test_write_iothreads (push) Successful in 37s
Test / test_heal_pg_size_2 (push) Successful in 2m20s
Test / test_heal_local_read (push) Successful in 2m23s
Test / test_heal_ec (push) Successful in 2m24s
Test / test_reweight_half (push) Successful in 10s
Test / test_heal_antietcd (push) Successful in 2m24s
Test / test_heal_csum_32k_dmj (push) Successful in 2m25s
Test / test_heal_csum_32k_dj (push) Successful in 2m26s
Test / test_heal_csum_32k (push) Successful in 2m25s
Test / test_heal_csum_4k_dmj (push) Successful in 2m25s
Test / test_resize_auto (push) Successful in 9s
Test / test_resize (push) Successful in 14s
Test / test_osd_tags (push) Successful in 8s
Test / test_snapshot_pool2 (push) Successful in 15s
Test / test_enospc (push) Successful in 12s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 13s
Test / test_enospc_imm_xor (push) Successful in 15s
Test / test_scrub (push) Successful in 14s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 19s
Test / test_scrub_pg_size_3 (push) Successful in 17s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 18s
Test / test_scrub_ec (push) Successful in 16s
Test / test_nfs (push) Successful in 12s
Test / test_heal_csum_4k_dj (push) Successful in 2m28s
Test / test_heal_csum_4k (push) Successful in 2m24s
2025-09-05 23:34:11 +00:00
17240c6144 Add librdmacm-dev to build-deps 2025-09-06 02:09:44 +03:00
9e627a4414 Log has_invalid objects
All checks were successful
Test / test_write (push) Successful in 38s
Test / test_write_xor (push) Successful in 41s
Test / test_write_no_same (push) Successful in 12s
Test / test_write_iothreads (push) Successful in 42s
Test / test_heal_pg_size_2 (push) Successful in 2m22s
Test / test_heal_local_read (push) Successful in 2m26s
Test / test_heal_ec (push) Successful in 2m25s
Test / test_reweight_half (push) Successful in 12s
Test / test_heal_antietcd (push) Successful in 2m27s
Test / test_heal_csum_32k_dj (push) Successful in 2m26s
Test / test_heal_csum_32k_dmj (push) Successful in 2m35s
Test / test_heal_csum_32k (push) Successful in 2m35s
Test / test_heal_csum_4k_dmj (push) Successful in 2m30s
Test / test_resize_auto (push) Successful in 13s
Test / test_snapshot_pool2 (push) Successful in 19s
Test / test_osd_tags (push) Successful in 10s
Test / test_enospc (push) Successful in 13s
Test / test_enospc_xor (push) Successful in 20s
Test / test_enospc_imm (push) Successful in 13s
Test / test_enospc_imm_xor (push) Successful in 17s
Test / test_scrub (push) Successful in 13s
Test / test_heal_csum_4k_dj (push) Successful in 2m30s
Test / test_scrub_zero_osd_2 (push) Successful in 19s
Test / test_heal_csum_4k (push) Successful in 2m33s
Test / test_scrub_xor (push) Successful in 14s
Test / test_scrub_pg_size_3 (push) Successful in 16s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 18s
Test / test_scrub_ec (push) Successful in 17s
Test / test_nfs (push) Successful in 13s
Test / test_resize (push) Successful in 14s
2025-09-05 02:14:03 +03:00
90b1019636 Do not warn on incomplete+has_invalid PG states as unexpected
Some checks failed
Test / test_write (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_write_iothreads (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_local_read (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_antietcd (push) Has been cancelled
Test / test_reweight_half (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_resize (push) Has been cancelled
Test / test_resize_auto (push) Has been cancelled
Test / test_snapshot_pool2 (push) Has been cancelled
Test / test_osd_tags (push) Has been cancelled
Test / test_enospc (push) Has been cancelled
Test / test_enospc_xor (push) Has been cancelled
Test / test_enospc_imm (push) Has been cancelled
Test / test_enospc_imm_xor (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled
Test / test_scrub_ec (push) Has been cancelled
Test / test_nfs (push) Has been cancelled
2025-09-05 02:10:57 +03:00
df604afbd5 Fix OSD reweight values between 0 and 1 not working
Some checks failed
Test / test_write (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_write_iothreads (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_local_read (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_antietcd (push) Has been cancelled
Test / test_reweight_half (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_resize (push) Has been cancelled
Test / test_resize_auto (push) Has been cancelled
Test / test_snapshot_pool2 (push) Has been cancelled
Test / test_osd_tags (push) Has been cancelled
Test / test_enospc (push) Has been cancelled
Test / test_enospc_xor (push) Has been cancelled
Test / test_enospc_imm (push) Has been cancelled
Test / test_enospc_imm_xor (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled
Test / test_scrub_ec (push) Has been cancelled
Test / test_nfs (push) Has been cancelled
2025-09-05 02:05:01 +03:00
47c7aa62de Remove unused SUBMIT_SCRUB_READ
All checks were successful
Test / test_switch_primary (push) Successful in 31s
Test / test_write (push) Successful in 29s
Test / test_write_no_same (push) Successful in 8s
Test / test_write_xor (push) Successful in 34s
Test / test_write_iothreads (push) Successful in 35s
Test / test_heal_pg_size_2 (push) Successful in 2m17s
Test / test_heal_local_read (push) Successful in 2m20s
Test / test_heal_ec (push) Successful in 2m18s
Test / test_heal_antietcd (push) Successful in 2m16s
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s
Test / test_heal_csum_32k_dj (push) Successful in 2m22s
Test / test_heal_csum_32k (push) Successful in 2m20s
Test / test_heal_csum_4k_dmj (push) Successful in 2m20s
Test / test_resize (push) Successful in 15s
Test / test_resize_auto (push) Successful in 9s
Test / test_osd_tags (push) Successful in 7s
Test / test_snapshot_pool2 (push) Successful in 13s
Test / test_enospc (push) Successful in 11s
Test / test_enospc_xor (push) Successful in 12s
Test / test_enospc_imm (push) Successful in 9s
Test / test_enospc_imm_xor (push) Successful in 13s
Test / test_scrub (push) Successful in 13s
Test / test_scrub_zero_osd_2 (push) Successful in 15s
Test / test_scrub_xor (push) Successful in 15s
Test / test_scrub_pg_size_3 (push) Successful in 15s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s
Test / test_scrub_ec (push) Successful in 13s
Test / test_nfs (push) Successful in 9s
Test / test_heal_csum_4k_dj (push) Successful in 2m19s
Test / test_heal_csum_4k (push) Successful in 2m19s
2025-08-30 16:30:52 +03:00
9f2dc48d0f Fix OSD assertion failed: n_subops != sent when all object copies are corrupted 2025-08-30 16:30:47 +03:00
6d951b21fb Install ibverbs-providers in vitastor Docker builds 2025-08-30 02:28:26 +03:00
552f28cb3e Fix #86 - base64_decode on arm64 O_o
All checks were successful
Test / test_switch_primary (push) Successful in 32s
Test / test_write (push) Successful in 33s
Test / test_write_xor (push) Successful in 34s
Test / test_write_no_same (push) Successful in 8s
Test / test_write_iothreads (push) Successful in 38s
Test / test_heal_pg_size_2 (push) Successful in 2m18s
Test / test_heal_ec (push) Successful in 2m19s
Test / test_heal_antietcd (push) Successful in 2m21s
Test / test_heal_csum_32k_dmj (push) Successful in 2m27s
Test / test_heal_csum_32k_dj (push) Successful in 2m30s
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s
Test / test_heal_csum_32k (push) Successful in 2m30s
Test / test_resize_auto (push) Successful in 8s
Test / test_resize (push) Successful in 14s
Test / test_osd_tags (push) Successful in 8s
Test / test_snapshot_pool2 (push) Successful in 15s
Test / test_enospc (push) Successful in 10s
Test / test_enospc_xor (push) Successful in 12s
Test / test_enospc_imm (push) Successful in 12s
Test / test_enospc_imm_xor (push) Successful in 16s
Test / test_scrub (push) Successful in 13s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 15s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 18s
Test / test_scrub_ec (push) Successful in 14s
Test / test_nfs (push) Successful in 11s
Test / test_heal_csum_4k_dj (push) Successful in 2m18s
Test / test_heal_csum_4k (push) Successful in 2m18s
Test / test_heal_local_read (push) Successful in 2m17s
2025-08-30 02:27:05 +03:00
e87b6e26f7 Fix Proxmox 9.0 support - oops :) 2025-08-26 20:24:12 +03:00
0c89886374 Release 2.3.0
All checks were successful
Test / test_switch_primary (push) Successful in 32s
Test / test_write (push) Successful in 37s
Test / test_write_no_same (push) Successful in 11s
Test / test_write_xor (push) Successful in 39s
Test / test_write_iothreads (push) Successful in 41s
Test / test_heal_pg_size_2 (push) Successful in 2m20s
Test / test_heal_local_read (push) Successful in 2m21s
Test / test_heal_ec (push) Successful in 2m22s
Test / test_heal_csum_32k_dmj (push) Successful in 2m21s
Test / test_heal_csum_32k_dj (push) Successful in 2m20s
Test / test_heal_csum_32k (push) Successful in 2m19s
Test / test_resize (push) Successful in 14s
Test / test_heal_csum_4k_dmj (push) Successful in 2m25s
Test / test_resize_auto (push) Successful in 9s
Test / test_osd_tags (push) Successful in 8s
Test / test_snapshot_pool2 (push) Successful in 15s
Test / test_enospc (push) Successful in 19s
Test / test_enospc_xor (push) Successful in 20s
Test / test_enospc_imm (push) Successful in 12s
Test / test_enospc_imm_xor (push) Successful in 16s
Test / test_scrub (push) Successful in 17s
Test / test_scrub_zero_osd_2 (push) Successful in 19s
Test / test_scrub_xor (push) Successful in 18s
Test / test_scrub_pg_size_3 (push) Successful in 16s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 18s
Test / test_scrub_ec (push) Successful in 16s
Test / test_nfs (push) Successful in 12s
Test / test_heal_csum_4k_dj (push) Successful in 2m20s
Test / test_heal_csum_4k (push) Successful in 2m22s
Test / test_heal_antietcd (push) Successful in 2m21s
New features:

- Add a new kernel device mounting method: [ublk](https://vitastor.io/docs/usage/ublk.html).
  It's the fastest method for random IOPS, and it's on par with VDUSE for linear MB/s.
- Disable io_uring waits being reported as iowait on kernels which support it (6.15+)
- Allow to enforce permissions at the VitastorFS NFS server side
- Add qemu_file_mirror_path option to the config to allow to trick Veeam and make it work
- Speed up CRC32C calculation in OSD by fixing a bug and enabling AVX512 version
- Support QEMU 10
- Support Debian 13 Trixie and Proxmox 9.0
- Remove the dependency on system liburing in package builds (build it statically)

Bug fixes:

- Fix checksums NEVER BEING ENABLED in vitastor-disk prepare, even when explicitly requested :-D
- Use default uid and gid from NFS AUTH_SYS when creating files
- Fix object bitmaps supposedly & possibly being corrupted in some rare cases with EC N+2+
- Avoid multiple inflight overwrites to meta blocks - fixes possible data corruption with
  one specific SSD model: Memblaze PBlaze5 910 (github #79)
- Fix a bug in antietcd which was leading to leases sometimes not expiring correctly
- Fix a bug in NFS where ".." entry had its `cookie` equal to 0 instead of 1 (github #78)
- Fix snapshots not being deleted during VM deletion in Proxmox plugin (github #85)
- Fix monitor not filtering OSDs by block size correctly
2025-08-25 21:39:19 +03:00
e79bef8751 Fix pve-qemu-10.0 patch
Some checks reported warnings
Test / buildenv (push) Has been cancelled
Test / test_write (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_write_iothreads (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_local_read (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_antietcd (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_resize (push) Has been cancelled
Test / test_resize_auto (push) Has been cancelled
Test / test_snapshot_pool2 (push) Has been cancelled
Test / test_osd_tags (push) Has been cancelled
Test / test_enospc (push) Has been cancelled
Test / test_enospc_xor (push) Has been cancelled
Test / test_enospc_imm (push) Has been cancelled
Test / test_enospc_imm_xor (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled
Test / test_scrub_ec (push) Has been cancelled
Test / test_nfs (push) Has been cancelled
2025-08-25 21:38:18 +03:00
ad76f84e1c Fix compat.h for even older kernel headers 2025-08-25 21:38:04 +03:00
db827cb34c Add --enforce 1 to docs 2025-08-25 16:05:40 +03:00
e5c6d85ea1 Add Debian trixie to docs 2025-08-25 00:47:23 +03:00
6cc44c1f54 Add qemu_file_mirror_path docs 2025-08-25 00:45:52 +03:00
c20450c1f1 Add build.sh for Debian Trixie 2025-08-24 22:15:55 +03:00
db63e58b3d Add UBLK docs
All checks were successful
Test / test_switch_primary (push) Successful in 32s
Test / test_write (push) Successful in 32s
Test / test_write_xor (push) Successful in 36s
Test / test_write_no_same (push) Successful in 10s
Test / test_write_iothreads (push) Successful in 38s
Test / test_heal_pg_size_2 (push) Successful in 2m18s
Test / test_heal_local_read (push) Successful in 2m18s
Test / test_heal_ec (push) Successful in 2m17s
Test / test_heal_antietcd (push) Successful in 2m18s
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s
Test / test_heal_csum_32k_dj (push) Successful in 2m19s
Test / test_heal_csum_32k (push) Successful in 2m18s
Test / test_heal_csum_4k_dmj (push) Successful in 2m19s
Test / test_resize (push) Successful in 15s
Test / test_resize_auto (push) Successful in 9s
Test / test_snapshot_pool2 (push) Successful in 14s
Test / test_osd_tags (push) Successful in 8s
Test / test_enospc (push) Successful in 12s
Test / test_enospc_xor (push) Successful in 14s
Test / test_enospc_imm (push) Successful in 11s
Test / test_enospc_imm_xor (push) Successful in 12s
Test / test_scrub (push) Successful in 13s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 13s
Test / test_scrub_pg_size_3 (push) Successful in 13s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s
Test / test_scrub_ec (push) Successful in 15s
Test / test_nfs (push) Successful in 13s
Test / test_heal_csum_4k_dj (push) Successful in 2m16s
Test / test_heal_csum_4k (push) Successful in 2m12s
2025-08-24 16:55:58 +03:00
31b7021330 Implement Vitastor ublk server 2025-08-24 16:55:58 +03:00
2ebe3a468c Mark all symbols hidden by default, export only required ones 2025-08-24 16:55:58 +03:00
9892fccfb0 Disable io_uring waits being reported as iowait 2025-08-24 16:55:58 +03:00
0be86a306d Remove old liburing version support as it's now included 2025-08-24 16:55:58 +03:00
d77a775948 Fix liburing include/compat for older kernels 2025-08-24 16:55:58 +03:00
8cc82bab39 Include liburing in static build 2025-08-24 16:55:58 +03:00
f9d5e33ddd Fix filtering OSDs by block size in monitor
All checks were successful
Test / test_switch_primary (push) Successful in 33s
Test / test_write (push) Successful in 33s
Test / test_write_no_same (push) Successful in 10s
Test / test_write_xor (push) Successful in 37s
Test / test_write_iothreads (push) Successful in 37s
Test / test_heal_pg_size_2 (push) Successful in 2m17s
Test / test_heal_local_read (push) Successful in 2m18s
Test / test_heal_ec (push) Successful in 2m19s
Test / test_heal_antietcd (push) Successful in 2m18s
Test / test_heal_csum_32k_dmj (push) Successful in 2m18s
Test / test_heal_csum_32k_dj (push) Successful in 2m23s
Test / test_heal_csum_32k (push) Successful in 2m18s
Test / test_heal_csum_4k_dmj (push) Successful in 2m19s
Test / test_resize (push) Successful in 13s
Test / test_resize_auto (push) Successful in 8s
Test / test_osd_tags (push) Successful in 7s
Test / test_snapshot_pool2 (push) Successful in 15s
Test / test_enospc (push) Successful in 10s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 12s
Test / test_enospc_imm_xor (push) Successful in 13s
Test / test_scrub (push) Successful in 14s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 15s
Test / test_scrub_pg_size_3 (push) Successful in 13s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 18s
Test / test_scrub_ec (push) Successful in 16s
Test / test_nfs (push) Successful in 12s
Test / test_heal_csum_4k_dj (push) Successful in 2m13s
Test / test_heal_csum_4k (push) Successful in 2m19s
2025-08-24 16:46:36 +03:00
1563932024
f83418d93e Fix snapshots not being deleted during VM deletion in Proxmox plugin (#85)
Co-authored-by: zhu.chengzhen <zhu.chengzhen@jingjiamicro.com>

I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md
2025-08-24 16:28:12 +03:00
fbf14fb0cb Allow to enforce permissions at the server side
All checks were successful
Test / test_switch_primary (push) Successful in 34s
Test / test_write (push) Successful in 31s
Test / test_write_xor (push) Successful in 35s
Test / test_write_no_same (push) Successful in 9s
Test / test_write_iothreads (push) Successful in 38s
Test / test_heal_pg_size_2 (push) Successful in 2m17s
Test / test_heal_local_read (push) Successful in 2m17s
Test / test_heal_ec (push) Successful in 2m16s
Test / test_heal_antietcd (push) Successful in 2m18s
Test / test_heal_csum_32k_dmj (push) Successful in 2m22s
Test / test_heal_csum_32k (push) Successful in 2m19s
Test / test_heal_csum_32k_dj (push) Successful in 2m27s
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s
Test / test_resize (push) Successful in 13s
Test / test_resize_auto (push) Successful in 9s
Test / test_osd_tags (push) Successful in 8s
Test / test_snapshot_pool2 (push) Successful in 14s
Test / test_enospc (push) Successful in 11s
Test / test_enospc_xor (push) Successful in 12s
Test / test_enospc_imm (push) Successful in 12s
Test / test_enospc_imm_xor (push) Successful in 14s
Test / test_scrub (push) Successful in 14s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 15s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s
Test / test_scrub_ec (push) Successful in 17s
Test / test_nfs (push) Successful in 12s
Test / test_heal_csum_4k_dj (push) Successful in 2m17s
Test / test_heal_csum_4k (push) Successful in 2m19s
2025-08-24 16:20:19 +03:00
fb1c3e00f4 Use default uid and gid from NFS AUTH_SYS when creating files 2025-08-24 16:03:40 +03:00
ston3lu
d8332171e9 According to the NFSv3 protocol: Directory traversal should start with cookie '0' - "." entry as the first entry, cookie should be '0' - '.." entry as the second entry, cookie should be '1' 2025-08-24 14:44:48 +03:00
c24cc9bf0b Bump antietcd version to 1.1.3
All checks were successful
Test / test_rebalance_verify_imm (push) Successful in 1m47s
Test / test_write_no_same (push) Successful in 8s
Test / test_rebalance_verify_ec_imm (push) Successful in 1m54s
Test / test_write_xor (push) Successful in 36s
Test / test_write_iothreads (push) Successful in 36s
Test / test_heal_pg_size_2 (push) Successful in 2m15s
Test / test_heal_local_read (push) Successful in 2m17s
Test / test_heal_ec (push) Successful in 2m17s
Test / test_heal_antietcd (push) Successful in 2m18s
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s
Test / test_heal_csum_32k_dj (push) Successful in 2m25s
Test / test_heal_csum_32k (push) Successful in 2m20s
Test / test_heal_csum_4k_dmj (push) Successful in 2m19s
Test / test_resize_auto (push) Successful in 7s
Test / test_resize (push) Successful in 13s
Test / test_osd_tags (push) Successful in 9s
Test / test_snapshot_pool2 (push) Successful in 15s
Test / test_enospc (push) Successful in 10s
Test / test_enospc_xor (push) Successful in 14s
Test / test_enospc_imm (push) Successful in 12s
Test / test_enospc_imm_xor (push) Successful in 13s
Test / test_scrub (push) Successful in 15s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 12s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s
Test / test_scrub_ec (push) Successful in 15s
Test / test_nfs (push) Successful in 11s
Test / test_heal_csum_4k_dj (push) Successful in 2m19s
Test / test_heal_csum_4k (push) Successful in 2m21s
2025-08-23 17:45:56 +03:00
9f57c75acf unordered_map flush_versions 2025-08-23 17:45:56 +03:00
53b12641d1 Fix the fix :) 2025-08-23 17:45:56 +03:00
flynn.yang
5c5c8825dc Avoid multiple inflight overwrites to meta blocks
https://github.com/vitalif/vitastor/pull/83

By https://github.com/Flynn049

By submitting this pull request, I accept Vitastor CLA
2025-08-23 17:45:03 +03:00
3a261ac3fc Support QEMU 10 2025-08-18 10:51:04 +03:00
04514435de Fix checking config in qemu driver 2025-08-12 01:16:57 +03:00
07303020fc Fix enabling checksums in blockstore-disk O_o
Some checks reported warnings
Test / test_heal_pg_size_2 (push) Blocked by required conditions
Test / test_heal_local_read (push) Blocked by required conditions
Test / test_heal_ec (push) Blocked by required conditions
Test / test_heal_antietcd (push) Blocked by required conditions
Test / test_heal_csum_32k_dmj (push) Blocked by required conditions
Test / test_heal_csum_32k_dj (push) Blocked by required conditions
Test / test_heal_csum_32k (push) Blocked by required conditions
Test / test_heal_csum_4k_dmj (push) Blocked by required conditions
Test / test_heal_csum_4k_dj (push) Blocked by required conditions
Test / test_heal_csum_4k (push) Blocked by required conditions
Test / test_resize (push) Blocked by required conditions
Test / test_resize_auto (push) Blocked by required conditions
Test / test_snapshot_pool2 (push) Blocked by required conditions
Test / test_osd_tags (push) Blocked by required conditions
Test / test_enospc (push) Blocked by required conditions
Test / test_enospc_xor (push) Blocked by required conditions
Test / test_enospc_imm (push) Blocked by required conditions
Test / test_enospc_imm_xor (push) Blocked by required conditions
Test / test_scrub (push) Blocked by required conditions
Test / test_scrub_zero_osd_2 (push) Blocked by required conditions
Test / test_scrub_xor (push) Blocked by required conditions
Test / test_scrub_pg_size_3 (push) Blocked by required conditions
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Blocked by required conditions
Test / test_scrub_ec (push) Blocked by required conditions
Test / test_nfs (push) Blocked by required conditions
Test / buildenv (push) Has been cancelled
Test / build (push) Has been cancelled
Test / make_test (push) Has been cancelled
Test / npm_lint (push) Has been cancelled
Test / test_add_osd (push) Has been cancelled
2025-08-10 18:04:52 +03:00
feaf7a15cf Use CRC32C implementation from ISA-L when available - enables VPCLMULQDQ version with AVX512 which is ~2x faster
Some checks reported warnings
Test / test_heal_csum_4k_dmj (push) Blocked by required conditions
Test / test_heal_csum_4k_dj (push) Blocked by required conditions
Test / test_heal_csum_4k (push) Blocked by required conditions
Test / test_resize (push) Blocked by required conditions
Test / test_resize_auto (push) Blocked by required conditions
Test / test_snapshot_pool2 (push) Blocked by required conditions
Test / test_osd_tags (push) Blocked by required conditions
Test / test_enospc (push) Blocked by required conditions
Test / test_enospc_xor (push) Blocked by required conditions
Test / test_enospc_imm (push) Blocked by required conditions
Test / test_enospc_imm_xor (push) Blocked by required conditions
Test / test_scrub (push) Blocked by required conditions
Test / test_scrub_zero_osd_2 (push) Blocked by required conditions
Test / test_scrub_xor (push) Blocked by required conditions
Test / test_scrub_pg_size_3 (push) Blocked by required conditions
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Blocked by required conditions
Test / test_scrub_ec (push) Blocked by required conditions
Test / test_nfs (push) Blocked by required conditions
Test / build (push) Has been cancelled
Test / make_test (push) Has been cancelled
Test / npm_lint (push) Has been cancelled
Test / test_add_osd (push) Has been cancelled
Test / test_cas (push) Has been cancelled
Test / test_change_pg_count (push) Has been cancelled
Test / test_change_pg_count_ec (push) Has been cancelled
Test / test_change_pg_count_online (push) Has been cancelled
Test / test_change_pg_size (push) Has been cancelled
Test / test_create_nomaxid (push) Has been cancelled
Test / test_etcd_fail (push) Has been cancelled
Test / buildenv (push) Has been cancelled
2025-08-10 18:03:47 +03:00
29dda5066f Stop doing cpuid repeatedly in runtime 2025-08-10 18:03:22 +03:00
1de53ef7e6 Move crc32c_pad to util/crc32c.c
Some checks reported warnings
Test / test_failure_domain (push) Has been cancelled
Test / test_snapshot (push) Has been cancelled
Test / test_snapshot_ec (push) Has been cancelled
Test / test_minsize_1 (push) Has been cancelled
Test / test_move_reappear (push) Has been cancelled
Test / test_rm (push) Has been cancelled
Test / test_rm_degraded (push) Has been cancelled
Test / test_snapshot_chain (push) Has been cancelled
Test / test_snapshot_chain_ec (push) Has been cancelled
Test / test_snapshot_down (push) Has been cancelled
Test / test_snapshot_down_ec (push) Has been cancelled
Test / test_splitbrain (push) Has been cancelled
Test / test_rebalance_verify (push) Has been cancelled
Test / test_rebalance_verify_imm (push) Has been cancelled
Test / test_rebalance_verify_ec (push) Has been cancelled
Test / test_rebalance_verify_ec_imm (push) Has been cancelled
Test / test_dd (push) Has been cancelled
Test / test_root_node (push) Has been cancelled
Test / test_switch_primary (push) Has been cancelled
Test / test_write (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_write_iothreads (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_local_read (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_antietcd (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
2025-08-10 18:03:17 +03:00
4793dbe9c3 Use set_immediate() in osd_flush to prevent stack overflows on repeated errors
Some checks reported warnings
Test / test_dd (push) Has been cancelled
Test / test_root_node (push) Has been cancelled
Test / test_switch_primary (push) Has been cancelled
Test / test_write (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_write_iothreads (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_local_read (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_antietcd (push) Has been cancelled
Test / make_test (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_resize (push) Has been cancelled
Test / test_resize_auto (push) Has been cancelled
Test / test_snapshot_pool2 (push) Has been cancelled
Test / test_osd_tags (push) Has been cancelled
Test / test_enospc (push) Has been cancelled
Test / test_enospc_xor (push) Has been cancelled
Test / test_enospc_imm (push) Has been cancelled
Test / test_enospc_imm_xor (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
2025-08-10 17:58:01 +03:00
918ea34af2 Remove "Only allow to overwrite part of the bitmap" blockstore API feature
Some checks reported warnings
Test / test_switch_primary (push) Successful in 32s
Test / test_write (push) Successful in 31s
Test / test_write_no_same (push) Successful in 8s
Test / test_write_xor (push) Successful in 35s
Test / test_heal_ec (push) Has started running
Test / test_heal_antietcd (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_resize (push) Has been cancelled
Test / test_resize_auto (push) Has been cancelled
Test / test_snapshot_pool2 (push) Has been cancelled
Test / test_osd_tags (push) Has been cancelled
Test / test_enospc (push) Has been cancelled
Test / test_enospc_xor (push) Has been cancelled
Test / test_enospc_imm (push) Has been cancelled
Test / test_enospc_imm_xor (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled
Test / test_scrub_ec (push) Has been cancelled
Test / test_nfs (push) Has been cancelled
Test / test_write_iothreads (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_local_read (push) Has been cancelled
2025-08-10 17:57:04 +03:00
2db8184cd8 Fix bitmap calculation for EC N+1 & the new store and EC N+2+ for the old store 2025-08-10 17:57:04 +03:00
0e964b3c8c Fix renaming from ddeb 2025-08-09 16:11:22 +03:00
1b9296ff6c Add qemu_file_mirror_path option to the config to allow to trick Veeam
All checks were successful
Test / test_switch_primary (push) Successful in 32s
Test / test_write (push) Successful in 31s
Test / test_write_no_same (push) Successful in 11s
Test / test_write_xor (push) Successful in 34s
Test / test_write_iothreads (push) Successful in 37s
Test / test_heal_pg_size_2 (push) Successful in 2m19s
Test / test_heal_local_read (push) Successful in 2m19s
Test / test_heal_ec (push) Successful in 2m19s
Test / test_heal_antietcd (push) Successful in 2m21s
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s
Test / test_heal_csum_4k_dmj (push) Successful in 2m20s
Test / test_heal_csum_32k_dj (push) Successful in 2m28s
Test / test_heal_csum_32k (push) Successful in 2m26s
Test / test_resize_auto (push) Successful in 9s
Test / test_resize (push) Successful in 14s
Test / test_osd_tags (push) Successful in 9s
Test / test_snapshot_pool2 (push) Successful in 17s
Test / test_enospc (push) Successful in 11s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 10s
Test / test_enospc_imm_xor (push) Successful in 17s
Test / test_scrub (push) Successful in 15s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 15s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s
Test / test_scrub_ec (push) Successful in 16s
Test / test_nfs (push) Successful in 12s
Test / test_heal_csum_4k_dj (push) Successful in 2m22s
Test / test_heal_csum_4k (push) Successful in 2m17s
When qemu_file_mirror_path is set to "/dir/" in /etc/vitastor/vitastor.conf, QEMU driver
returns "/dir/image_name" as the filename in qemu-img info and in QAPI to trick software
like Veeam which expects file-based access. After that, vitastor-nfs can be mounted to that
directory to make backups work :-)
2025-08-06 01:15:54 +03:00
6bf136c199 Add a note about /etc/apt/preferences to docs 2025-08-05 11:25:06 +03:00
b529f77264 Release 2.2.3
- Support Ubuntu 24.04 Noble
- Fix clients hanging on online PG count change with in-flight operations
- Fix volume_size_info in PVE VitastorPlugin
- Implement bdrv_refresh_filename for the QEMU driver
- Fix RDMA-CM broken in 2.2.0
- Fix possible "invalid %N$ use detected" OSD crashes
- Fix vitastor-nfs build without RDMA
- Fix docker build by adding the forgotten apt/preferences
- Fix libfio_blockstore.so
- Fix "trigger event loop automatically" API version check in the QEMU driver
- Add Content-Type header for prometheus metrics
- Add a patch for libvirt 11.5
2025-07-30 10:50:02 +03:00
bf9519dcdc docker -i 2025-07-30 10:50:02 +03:00
4ba687738b Use archive for debian buster 2025-07-30 10:50:02 +03:00
8427f6fe46 Use the new build method for RPM packages too 2025-07-30 02:38:52 +03:00
efa6bc3e70 Fix docker pull commands in docs 2025-07-30 02:38:52 +03:00
da33e9b12d Add noble repository 2025-07-30 02:38:52 +03:00
Vitaliy Filippov
265127c1a7 Fix vitastor-nfs build without RDMA 2025-07-30 02:38:52 +03:00
2b30acfc1d Do not use podman and do not use build command to build packages (we are not building a docker image) 2025-07-30 02:38:52 +03:00
7fbc38ef29 Add a build file for ubuntu 24.04 2025-07-29 02:51:59 +03:00
e5070e991a Add forgotten apt/preferences 2025-07-29 01:18:55 +03:00
625552c441 Fix RDMA-CM broken in 2.2.0
All checks were successful
Test / test_write (push) Successful in 34s
Test / test_write_no_same (push) Successful in 9s
Test / test_write_xor (push) Successful in 34s
Test / test_write_iothreads (push) Successful in 36s
Test / test_heal_pg_size_2 (push) Successful in 2m17s
Test / test_heal_local_read (push) Successful in 2m19s
Test / test_heal_ec (push) Successful in 2m18s
Test / test_heal_antietcd (push) Successful in 2m17s
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s
Test / test_heal_csum_32k_dj (push) Successful in 2m22s
Test / test_heal_csum_32k (push) Successful in 2m24s
Test / test_heal_csum_4k_dmj (push) Successful in 2m19s
Test / test_resize_auto (push) Successful in 7s
Test / test_resize (push) Successful in 12s
Test / test_osd_tags (push) Successful in 7s
Test / test_snapshot_pool2 (push) Successful in 14s
Test / test_enospc (push) Successful in 10s
Test / test_enospc_xor (push) Successful in 12s
Test / test_enospc_imm (push) Successful in 10s
Test / test_enospc_imm_xor (push) Successful in 14s
Test / test_scrub (push) Successful in 15s
Test / test_scrub_zero_osd_2 (push) Successful in 15s
Test / test_scrub_xor (push) Successful in 14s
Test / test_scrub_pg_size_3 (push) Successful in 13s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s
Test / test_scrub_ec (push) Successful in 14s
Test / test_nfs (push) Successful in 11s
Test / test_heal_csum_4k_dj (push) Successful in 2m18s
Test / test_heal_csum_4k (push) Successful in 2m18s
Test / test_rebalance_verify (push) Successful in 2m4s
2025-07-29 00:14:05 +03:00
78c95c94f6 Use uint8_t* for BS buffers in OSD
All checks were successful
Test / test_switch_primary (push) Successful in 32s
Test / test_write (push) Successful in 30s
Test / test_write_xor (push) Successful in 34s
Test / test_write_no_same (push) Successful in 10s
Test / test_write_iothreads (push) Successful in 35s
Test / test_heal_pg_size_2 (push) Successful in 2m18s
Test / test_heal_local_read (push) Successful in 2m18s
Test / test_heal_ec (push) Successful in 2m17s
Test / test_heal_antietcd (push) Successful in 2m21s
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s
Test / test_heal_csum_32k_dj (push) Successful in 2m32s
Test / test_heal_csum_32k (push) Successful in 2m30s
Test / test_heal_csum_4k_dmj (push) Successful in 2m26s
Test / test_resize (push) Successful in 13s
Test / test_resize_auto (push) Successful in 9s
Test / test_osd_tags (push) Successful in 8s
Test / test_snapshot_pool2 (push) Successful in 13s
Test / test_enospc (push) Successful in 12s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 10s
Test / test_enospc_imm_xor (push) Successful in 14s
Test / test_scrub (push) Successful in 12s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 14s
Test / test_scrub_pg_size_3 (push) Successful in 15s
Test / test_scrub_ec (push) Successful in 15s
Test / test_nfs (push) Successful in 10s
Test / test_heal_csum_4k_dj (push) Successful in 2m19s
Test / test_heal_csum_4k (push) Successful in 2m20s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s
2025-07-26 14:13:12 +03:00
488e20bf55 Remove BS_OP_SYNC_STAB_ALL from OSD
Some checks reported warnings
Test / test_rebalance_verify_ec (push) Has been cancelled
Test / test_rebalance_verify_ec_imm (push) Has been cancelled
Test / test_dd (push) Has been cancelled
Test / test_root_node (push) Has been cancelled
Test / test_switch_primary (push) Has been cancelled
Test / test_write (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_write_iothreads (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_local_read (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_antietcd (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / buildenv (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_resize (push) Has been cancelled
Test / test_resize_auto (push) Has been cancelled
Test / test_snapshot_pool2 (push) Has been cancelled
Test / test_osd_tags (push) Has been cancelled
Test / test_enospc (push) Has been cancelled
Test / test_enospc_xor (push) Has been cancelled
Test / test_enospc_imm (push) Has been cancelled
Test / test_enospc_imm_xor (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
2025-07-26 14:13:12 +03:00
25d6281b3e Remove BS_OP_SYNC_STAB_ALL 2025-07-26 14:13:11 +03:00
1676e50b3a Fix debug trace for fio_bs 2025-07-26 14:12:48 +03:00
8049e3c14a Use OP_WRITE_STABLE in bs_fio 2025-07-26 14:12:48 +03:00
93a30efd86 Do not use numbered printf args
Some checks reported warnings
Test / test_enospc_imm_xor (push) Blocked by required conditions
Test / test_scrub (push) Blocked by required conditions
Test / test_scrub_zero_osd_2 (push) Blocked by required conditions
Test / test_scrub_xor (push) Blocked by required conditions
Test / test_scrub_pg_size_3 (push) Blocked by required conditions
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Blocked by required conditions
Test / test_scrub_ec (push) Blocked by required conditions
Test / test_nfs (push) Blocked by required conditions
Test / buildenv (push) Successful in 9s
Test / build (push) Successful in 3m50s
Test / npm_lint (push) Successful in 10s
Test / test_cas (push) Successful in 7s
Test / make_test (push) Successful in 33s
Test / test_change_pg_count (push) Successful in 30s
Test / test_change_pg_count_ec (push) Successful in 31s
Test / test_change_pg_size (push) Successful in 8s
Test / test_create_nomaxid (push) Successful in 7s
Test / test_add_osd (push) Successful in 1m14s
Test / test_etcd_fail_antietcd (push) Successful in 38s
Test / test_etcd_fail (push) Successful in 41s
Test / test_change_pg_count_online (push) Successful in 1m8s
Test / test_interrupted_rebalance (push) Successful in 47s
Test / test_create_halfhost (push) Successful in 6s
Test / test_interrupted_rebalance_imm (push) Successful in 45s
Test / test_interrupted_rebalance_ec (push) Successful in 47s
Test / test_failure_domain (push) Successful in 9s
Test / test_minsize_1 (push) Has been cancelled
Test / test_snapshot_ec (push) Has been cancelled
Test / test_interrupted_rebalance_ec_imm (push) Has been cancelled
Test / test_snapshot (push) Has been cancelled
2025-07-26 14:11:15 +03:00
83fb121f36 Fix "Trigger event loop automatically" API version check
All checks were successful
Test / test_switch_primary (push) Successful in 31s
Test / test_write (push) Successful in 32s
Test / test_write_xor (push) Successful in 33s
Test / test_write_no_same (push) Successful in 8s
Test / test_write_iothreads (push) Successful in 37s
Test / test_heal_pg_size_2 (push) Successful in 2m16s
Test / test_heal_local_read (push) Successful in 2m19s
Test / test_heal_ec (push) Successful in 2m18s
Test / test_heal_antietcd (push) Successful in 2m16s
Test / test_heal_csum_32k_dmj (push) Successful in 2m20s
Test / test_heal_csum_32k_dj (push) Successful in 2m19s
Test / test_heal_csum_32k (push) Successful in 2m25s
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s
Test / test_resize (push) Successful in 14s
Test / test_resize_auto (push) Successful in 8s
Test / test_osd_tags (push) Successful in 9s
Test / test_snapshot_pool2 (push) Successful in 16s
Test / test_enospc (push) Successful in 11s
Test / test_enospc_xor (push) Successful in 12s
Test / test_enospc_imm (push) Successful in 10s
Test / test_enospc_imm_xor (push) Successful in 14s
Test / test_scrub (push) Successful in 13s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 13s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s
Test / test_scrub_ec (push) Successful in 17s
Test / test_nfs (push) Successful in 12s
Test / test_heal_csum_4k (push) Successful in 2m19s
Test / test_heal_csum_4k_dj (push) Successful in 2m30s
2025-07-20 17:14:22 +03:00
afc97b757b Implement bdrv_refresh_filename
Some checks failed
Test / test_rebalance_verify_ec_imm (push) Successful in 1m36s
Test / test_write (push) Successful in 34s
Test / test_write_no_same (push) Successful in 7s
Test / test_write_xor (push) Successful in 35s
Test / test_write_iothreads (push) Successful in 35s
Test / test_heal_local_read (push) Failing after 2m16s
Test / test_heal_pg_size_2 (push) Successful in 2m20s
Test / test_heal_ec (push) Successful in 2m18s
Test / test_heal_antietcd (push) Successful in 2m18s
Test / test_heal_csum_32k_dmj (push) Successful in 2m21s
Test / test_heal_csum_32k_dj (push) Successful in 2m19s
Test / test_heal_csum_32k (push) Successful in 2m29s
Test / test_heal_csum_4k_dmj (push) Successful in 2m35s
Test / test_resize (push) Successful in 13s
Test / test_resize_auto (push) Successful in 8s
Test / test_osd_tags (push) Successful in 7s
Test / test_snapshot_pool2 (push) Successful in 15s
Test / test_enospc (push) Successful in 10s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 10s
Test / test_scrub (push) Successful in 19s
Test / test_enospc_imm_xor (push) Successful in 24s
Test / test_scrub_zero_osd_2 (push) Successful in 23s
Test / test_scrub_xor (push) Successful in 21s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s
Test / test_nfs (push) Successful in 10s
Test / test_scrub_ec (push) Successful in 15s
Test / test_heal_csum_4k (push) Successful in 2m24s
Test / test_heal_csum_4k_dj (push) Successful in 2m41s
2025-07-20 16:30:01 +03:00
68905cbf41 Fix online PG count change, add a test for it
All checks were successful
Test / test_switch_primary (push) Successful in 32s
Test / test_write (push) Successful in 32s
Test / test_write_no_same (push) Successful in 8s
Test / test_write_xor (push) Successful in 34s
Test / test_write_iothreads (push) Successful in 37s
Test / test_heal_pg_size_2 (push) Successful in 2m15s
Test / test_heal_local_read (push) Successful in 2m19s
Test / test_heal_ec (push) Successful in 2m18s
Test / test_heal_antietcd (push) Successful in 2m16s
Test / test_heal_csum_32k_dmj (push) Successful in 2m23s
Test / test_heal_csum_32k_dj (push) Successful in 2m27s
Test / test_heal_csum_32k (push) Successful in 2m26s
Test / test_heal_csum_4k_dmj (push) Successful in 2m20s
Test / test_resize_auto (push) Successful in 8s
Test / test_resize (push) Successful in 12s
Test / test_osd_tags (push) Successful in 7s
Test / test_snapshot_pool2 (push) Successful in 13s
Test / test_enospc (push) Successful in 11s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 11s
Test / test_enospc_imm_xor (push) Successful in 15s
Test / test_scrub (push) Successful in 12s
Test / test_scrub_zero_osd_2 (push) Successful in 15s
Test / test_scrub_xor (push) Successful in 12s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 18s
Test / test_scrub_ec (push) Successful in 14s
Test / test_nfs (push) Successful in 13s
Test / test_heal_csum_4k_dj (push) Successful in 2m21s
Test / test_heal_csum_4k (push) Successful in 2m21s
2025-07-15 02:33:28 +03:00
3fff667f13 Add Content-Type header for prometheus metrics
Some checks failed
Test / test_rebalance_verify_ec_imm (push) Successful in 1m40s
Test / test_write_no_same (push) Successful in 7s
Test / test_write (push) Successful in 33s
Test / test_write_xor (push) Successful in 32s
Test / test_write_iothreads (push) Successful in 35s
Test / test_heal_pg_size_2 (push) Successful in 2m20s
Test / test_heal_local_read (push) Successful in 2m26s
Test / test_heal_ec (push) Successful in 2m27s
Test / test_heal_antietcd (push) Successful in 2m41s
Test / test_heal_csum_32k_dmj (push) Successful in 2m25s
Test / test_heal_csum_32k_dj (push) Successful in 2m24s
Test / test_heal_csum_32k (push) Successful in 2m22s
Test / test_resize (push) Successful in 12s
Test / test_resize_auto (push) Failing after 5s
Test / test_heal_csum_4k_dmj (push) Successful in 2m26s
Test / test_osd_tags (push) Successful in 8s
Test / test_snapshot_pool2 (push) Successful in 15s
Test / test_enospc (push) Successful in 11s
Test / test_enospc_xor (push) Successful in 12s
Test / test_enospc_imm (push) Successful in 10s
Test / test_enospc_imm_xor (push) Successful in 12s
Test / test_scrub (push) Successful in 13s
Test / test_scrub_zero_osd_2 (push) Successful in 16s
Test / test_scrub_xor (push) Successful in 16s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s
Test / test_scrub_ec (push) Successful in 14s
Test / test_nfs (push) Successful in 9s
Test / test_heal_csum_4k_dj (push) Successful in 2m21s
Test / test_heal_csum_4k (push) Successful in 2m19s
2025-07-14 19:51:35 +03:00
980aec1d9b Add patch for libvirt 11.5 2025-07-04 23:15:18 +03:00
227 changed files with 18884 additions and 7446 deletions

View File

@@ -20,7 +20,7 @@ RUN echo 'deb http://deb.debian.org/debian bullseye-backports main' >> /etc/apt/
RUN apt-get update
RUN apt-get -y install etcd qemu-system-x86 qemu-block-extra qemu-utils fio libasan5 \
liburing1 liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
libgoogle-perftools-dev devscripts libjerasure-dev cmake libibverbs-dev libisal-dev
RUN apt-get -y build-dep fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`
RUN apt-get update && apt-get -y install jq lp-solve sudo nfs-common fdisk parted
RUN apt-get --download-only source fio qemu=`dpkg -s qemu-system-x86|grep ^Version:|awk '{print $2}'`

View File

@@ -144,6 +144,24 @@ jobs:
echo ""
done
test_change_pg_count_online:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_change_pg_count_online.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_change_pg_size:
runs-on: ubuntu-latest
needs: build
@@ -792,6 +810,24 @@ jobs:
echo ""
done
test_reweight_half:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: /root/vitastor/tests/test_reweight_half.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_heal_csum_32k_dmj:
runs-on: ubuntu-latest
needs: build

3
.gitmodules vendored
View File

@@ -4,3 +4,6 @@
[submodule "json11"]
path = json11
url = ../json11.git
[submodule "emhash"]
path = emhash
url = ../emhash.git

View File

@@ -2,6 +2,19 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VITASTOR_VERSION "2.2.2")
set(VITASTOR_VERSION "2.3.0")
include(CTest)
add_custom_target(build_tests)
add_custom_target(test
COMMAND
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
)
# make -j16 -C ../../build test_heap && ../../build/src/test/test_heap
# make -j16 -C ../../build test_heap && rm -f $(find ../../build -name '*.gcda') && ctest -V -T test -T coverage -R heap --test-dir ../../build && (cd ../../build; gcovr -f ../src --html --html-nested -o coverage/index.html; cd ../src/test)
# make -j16 -C ../../build test_blockstore && rm -f $(find ../../build -name '*.gcda') && ctest -V -T test -T coverage -R blockstore --test-dir ../../build && (cd ../../build; gcovr -f ../src --html --html-nested -o coverage/index.html; cd ../src/test)
# kcov --include-path=../../../src ../../kcov ./test_blockstore
add_dependencies(test build_tests)
add_subdirectory(src)

View File

@@ -19,7 +19,7 @@ Vitastor нацелен в первую очередь на SSD и SSD+HDD кл
TCP и RDMA и на хорошем железе может достигать задержки 4 КБ чтения и записи на уровне ~0.1 мс,
что примерно в 10 раз быстрее, чем Ceph и другие популярные программные СХД.
Vitastor поддерживает QEMU-драйвер, протоколы NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes.
Vitastor поддерживает QEMU-драйвер, протоколы UBLK, NBD и NFS, драйверы OpenStack, OpenNebula, Proxmox, Kubernetes.
Другие драйверы могут также быть легко реализованы.
Подробности смотрите в документации по ссылкам. Можете начать отсюда: [Быстрый старт](docs/intro/quickstart.ru.md).
@@ -64,8 +64,9 @@ Vitastor поддерживает QEMU-драйвер, протоколы NBD и
- [vitastor-cli](docs/usage/cli.ru.md) (консольный интерфейс)
- [vitastor-disk](docs/usage/disk.ru.md) (управление дисками)
- [fio](docs/usage/fio.ru.md) для тестов производительности
- [NBD](docs/usage/nbd.ru.md) для монтирования ядром
- [QEMU и qemu-img](docs/usage/qemu.ru.md)
- [UBLK](docs/usage/ublk.ru.md) для монтирования ядром
- [NBD](docs/usage/nbd.ru.md) - старый интерфейс для монтирования ядром
- [QEMU, qemu-img и VDUSE](docs/usage/qemu.ru.md)
- [NFS](docs/usage/nfs.ru.md) кластерная файловая система и псевдо-ФС прокси
- [Администрирование](docs/usage/admin.ru.md)
- Производительность

View File

@@ -19,7 +19,7 @@ supports TCP and RDMA and may achieve 4 KB read and write latency as low as ~0.1
with proper hardware which is ~10 times faster than other popular SDS's like Ceph
or internal systems of public clouds.
Vitastor supports QEMU, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers.
Vitastor supports QEMU, UBLK, NBD, NFS protocols, OpenStack, OpenNebula, Proxmox, Kubernetes drivers.
More drivers may be created easily.
Read more details in the documentation. You can start from here: [Quick Start](docs/intro/quickstart.en.md).
@@ -64,8 +64,9 @@ Read more details in the documentation. You can start from here: [Quick Start](d
- [vitastor-cli](docs/usage/cli.en.md) (command-line interface)
- [vitastor-disk](docs/usage/disk.en.md) (disk management tool)
- [fio](docs/usage/fio.en.md) for benchmarks
- [NBD](docs/usage/nbd.en.md) for kernel mounts
- [QEMU and qemu-img](docs/usage/qemu.en.md)
- [UBLK](docs/usage/ublk.en.md) for kernel mounts
- [NBD](docs/usage/nbd.en.md) - old interface for kernel mounts
- [QEMU, qemu-img and VDUSE](docs/usage/qemu.en.md)
- [NFS](docs/usage/nfs.en.md) clustered file system and pseudo-FS proxy
- [Administration](docs/usage/admin.en.md)
- Performance

View File

@@ -36,7 +36,7 @@ RUN (echo deb http://vitastor.io/debian bookworm main > /etc/apt/sources.list.d/
((echo 'Package: *'; echo 'Pin: origin "vitastor.io"'; echo 'Pin-Priority: 1000') > /etc/apt/preferences.d/vitastor.pref) && \
wget -q -O /etc/apt/trusted.gpg.d/vitastor.gpg https://vitastor.io/debian/pubkey.gpg && \
apt-get update && \
apt-get install -y vitastor-client && \
apt-get install -y vitastor-client ibverbs-providers && \
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-utils_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
wget https://vitastor.io/archive/qemu/qemu-bookworm-9.2.2%2Bds-1%2Bvitastor4/qemu-block-extra_9.2.2%2Bds-1%2Bvitastor4_amd64.deb && \
dpkg -x qemu-utils*.deb tmp1 && \

View File

@@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v2.2.2
VITASTOR_VERSION ?= v2.3.0
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v2.2.2
image: vitalif/vitastor-csi:v2.3.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v2.2.2
image: vitalif/vitastor-csi:v2.3.0
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "2.2.2"
vitastorCSIDriverVersion = "2.3.0"
)
// Config struct fills the parameters of request or user input

View File

@@ -1,7 +1,4 @@
#!/bin/bash
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg DISTRO=debian --build-arg REL=bookworm -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile
docker build --build-arg DISTRO=debian --build-arg REL=bookworm -t vitastor-buildenv:bookworm -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=bookworm -v `dirname $0`/../:/root/vitastor vitastor-buildenv:bookworm /root/vitastor/debian/vitastor-build.sh

View File

@@ -1,7 +1,4 @@
#!/bin/bash
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile
docker build --build-arg DISTRO=debian --build-arg REL=bullseye -t vitastor-buildenv:bullseye -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=bullseye -v `dirname $0`/../:/root/vitastor vitastor-buildenv:bullseye /root/vitastor/debian/vitastor-build.sh

View File

@@ -1,7 +1,4 @@
#!/bin/bash
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg DISTRO=debian --build-arg REL=buster -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile
docker build --build-arg DISTRO=debian --build-arg REL=buster -t vitastor-buildenv:buster -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=buster -v `dirname $0`/../:/root/vitastor vitastor-buildenv:buster /root/vitastor/debian/vitastor-build.sh

4
debian/build-vitastor-trixie.sh vendored Executable file
View File

@@ -0,0 +1,4 @@
#!/bin/bash
docker build --build-arg DISTRO=debian --build-arg REL=trixie -t vitastor-buildenv:trixie -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=trixie -v `dirname $0`/../:/root/vitastor vitastor-buildenv:trixie /root/vitastor/debian/vitastor-build.sh

View File

@@ -1,7 +1,5 @@
#!/bin/bash
# Ubuntu 22.04 Jammy Jellyfish
cat < vitastor.Dockerfile > ../Dockerfile
cd ..
mkdir -p packages
sudo podman build --build-arg DISTRO=ubuntu --build-arg REL=jammy -v `pwd`/packages:/root/packages -f Dockerfile .
rm Dockerfile
docker build --build-arg DISTRO=ubuntu --build-arg REL=jammy -t vitastor-buildenv:jammy -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=jammy -v `dirname $0`/../:/root/vitastor vitastor-buildenv:jammy /root/vitastor/debian/vitastor-build.sh

5
debian/build-vitastor-ubuntu-noble.sh vendored Executable file
View File

@@ -0,0 +1,5 @@
#!/bin/bash
# 24.04 Noble Numbat
docker build --build-arg DISTRO=ubuntu --build-arg REL=noble -t vitastor-buildenv:noble -f vitastor-buildenv.Dockerfile .
docker run -i --rm -e REL=noble -v `dirname $0`/../:/root/vitastor vitastor-buildenv:noble /root/vitastor/debian/vitastor-build.sh

2
debian/changelog vendored
View File

@@ -1,4 +1,4 @@
vitastor (2.2.2-1) unstable; urgency=medium
vitastor (2.3.0-1) unstable; urgency=medium
* Bugfixes

4
debian/control vendored
View File

@@ -2,9 +2,9 @@ Source: vitastor
Section: admin
Priority: optional
Maintainer: Vitaliy Filippov <vitalif@yourcmc.ru>
Build-Depends: debhelper, liburing-dev (>= 0.6), g++ (>= 8), libstdc++6 (>= 8),
Build-Depends: debhelper, g++ (>= 8), libstdc++6 (>= 8),
linux-libc-dev, libgoogle-perftools-dev, libjerasure-dev, libgf-complete-dev,
libibverbs-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
libibverbs-dev, librdmacm-dev, libisal-dev, cmake, pkg-config, libnl-3-dev, libnl-genl-3-dev,
node-bindings <!nocheck>, node-gyp, node-nan
Standards-Version: 4.5.0
Homepage: https://vitastor.io/

View File

@@ -26,7 +26,7 @@ RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" -o "$REL" = "bookworm" ]; then
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update
RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts
RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y install fio libgoogle-perftools-dev devscripts
RUN DEBIAN_FRONTEND=noninteractive TZ=Europe/Moscow apt-get -y build-dep qemu
# To build a custom version
#RUN cp /root/packages/qemu-orig/* /root

60
debian/vitastor-build.sh vendored Executable file
View File

@@ -0,0 +1,60 @@
#!/bin/bash
# To be ran inside buildenv docker
set -e -x
[ -e /usr/lib/x86_64-linux-gnu/pkgconfig/libisal.pc ] || cp /root/vitastor/debian/libisal.pc /usr/lib/x86_64-linux-gnu/pkgconfig
mkdir -p /root/fio-build/
cd /root/fio-build/
rm -rf /root/fio-build/*
dpkg-source -x /root/fio*.dsc
FULLVER=`head -n1 /root/vitastor/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'`
VER=${FULLVER%%-*}
rm -rf /root/vitastor-$VER
mkdir /root/vitastor-$VER
cd /root/vitastor
cp -a $(ls | grep -v packages) /root/vitastor-$VER
rm -rf /root/vitastor/packages/vitastor-$REL
mkdir -p /root/vitastor/packages/vitastor-$REL
mv /root/vitastor-$VER /root/vitastor/packages/vitastor-$REL/
cd /root/vitastor/packages/vitastor-$REL/vitastor-$VER
rm -rf fio
ln -s /root/fio-build/fio-*/ ./fio
FIO=`head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'`
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h
sh copy-fio-includes.sh
rm fio
mkdir -p a b debian/patches
mv fio-copy b/fio
diff -NaurpbB a b > debian/patches/fio-headers.patch || true
echo fio-headers.patch >> debian/patches/series
rm -rf a b
echo "dep:fio=$FIO" > debian/fio_version
cd /root/vitastor/packages/vitastor-$REL/vitastor-$VER
mkdir mon/node_modules
cd mon/node_modules
curl -s https://git.yourcmc.ru/vitalif/antietcd/archive/master.tar.gz | tar -zx
curl -s https://git.yourcmc.ru/vitalif/tinyraft/archive/master.tar.gz | tar -zx
cd /root/vitastor/packages/vitastor-$REL
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_$VER.orig.tar.xz vitastor-$VER
cd vitastor-$VER
DEBEMAIL="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$FULLVER""$REL" "Rebuild for $REL"
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa
rm -rf /root/vitastor/packages/vitastor-$REL/vitastor-*/
# Why does ubuntu rename debug packages to *.ddeb?
cd /root/vitastor/packages/vitastor-$REL
if ls *.ddeb >/dev/null; then
perl -i -pe 's/\.ddeb/.deb/' *.buildinfo *.changes
for i in *.ddeb; do
mv $i ${i%%.ddeb}.deb
done
fi

31
debian/vitastor-buildenv.Dockerfile vendored Normal file
View File

@@ -0,0 +1,31 @@
# Build environment for building Vitastor packages for Debian inside a container
# cd ..
# docker build --build-arg DISTRO=debian --build-arg REL=bullseye -f debian/vitastor.Dockerfile -t vitastor-buildenv:bullseye .
# docker run --rm -e REL=bullseye -v ./:/root/vitastor /root/vitastor/debian/vitastor-build.sh
ARG DISTRO=debian
ARG REL=
FROM $DISTRO:$REL
ARG DISTRO=debian
ARG REL=
WORKDIR /root
RUN set -e -x; \
if [ "$REL" = "buster" ]; then \
perl -i -pe 's/deb.debian.org/archive.debian.org/' /etc/apt/sources.list; \
apt-get update; \
apt-get -y install wget; \
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg; \
echo "deb https://vitastor.io/debian $REL main" >> /etc/apt/sources.list; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/*.sources || true; \
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update && \
apt-get -y install fio libgoogle-perftools-dev devscripts libjerasure-dev cmake \
libibverbs-dev librdmacm-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl nodejs npm node-nan node-bindings && \
apt-get -y build-dep fio && \
apt-get --download-only source fio

View File

@@ -2,6 +2,7 @@ usr/bin/vita
usr/bin/vitastor-cli
usr/bin/vitastor-rm
usr/bin/vitastor-nbd
usr/bin/vitastor-ublk
usr/bin/vitastor-nfs
usr/bin/vitastor-kv
usr/bin/vitastor-kv-stress

View File

@@ -1,65 +0,0 @@
# Build Vitastor packages for Debian inside a container
# cd ..; podman build --build-arg DISTRO=debian --build-arg REL=bullseye -v `pwd`/packages:/root/packages -f debian/vitastor.Dockerfile .
ARG DISTRO=debian
ARG REL=
FROM $DISTRO:$REL
ARG DISTRO=debian
ARG REL=
WORKDIR /root
RUN set -e -x; \
if [ "$REL" = "buster" ]; then \
apt-get update; \
apt-get -y install wget; \
wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg; \
echo "deb https://vitastor.io/debian $REL main" >> /etc/apt/sources.list; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
perl -i -pe 's/Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/debian.sources || true; \
echo 'APT::Install-Recommends false;' >> /etc/apt/apt.conf; \
echo 'APT::Install-Suggests false;' >> /etc/apt/apt.conf
RUN apt-get update && \
apt-get -y install fio liburing-dev libgoogle-perftools-dev devscripts libjerasure-dev cmake \
libibverbs-dev librdmacm-dev libisal-dev libnl-3-dev libnl-genl-3-dev curl nodejs npm node-nan node-bindings && \
apt-get -y build-dep fio && \
apt-get --download-only source fio
ADD . /root/vitastor
RUN set -e -x; \
[ -e /usr/lib/x86_64-linux-gnu/pkgconfig/libisal.pc ] || cp /root/vitastor/debian/libisal.pc /usr/lib/x86_64-linux-gnu/pkgconfig; \
mkdir -p /root/fio-build/; \
cd /root/fio-build/; \
rm -rf /root/fio-build/*; \
dpkg-source -x /root/fio*.dsc; \
mkdir -p /root/packages/vitastor-$REL; \
rm -rf /root/packages/vitastor-$REL/*; \
cd /root/packages/vitastor-$REL; \
FULLVER=$(head -n1 /root/vitastor/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
VER=${FULLVER%%-*}; \
cp -r /root/vitastor vitastor-$VER; \
cd vitastor-$VER; \
ln -s /root/fio-build/fio-*/ ./fio; \
FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
sh copy-fio-includes.sh; \
rm fio; \
mkdir -p a b debian/patches; \
mv fio-copy b/fio; \
diff -NaurpbB a b > debian/patches/fio-headers.patch || true; \
echo fio-headers.patch >> debian/patches/series; \
rm -rf a b; \
echo "dep:fio=$FIO" > debian/fio_version; \
cd /root/packages/vitastor-$REL/vitastor-$VER; \
mkdir mon/node_modules; \
cd mon/node_modules; \
curl -s https://git.yourcmc.ru/vitalif/antietcd/archive/master.tar.gz | tar -zx; \
curl -s https://git.yourcmc.ru/vitalif/tinyraft/archive/master.tar.gz | tar -zx; \
cd /root/packages/vitastor-$REL; \
tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_$VER.orig.tar.xz vitastor-$VER; \
cd vitastor-$VER; \
DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$FULLVER""$REL" "Rebuild for $REL"; \
DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
rm -rf /root/packages/vitastor-$REL/vitastor-*/

View File

@@ -3,7 +3,7 @@
FROM debian:bookworm
ADD etc/apt /etc/apt/
RUN apt-get update && apt-get -y install vitastor udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
RUN apt-get update && apt-get -y install vitastor ibverbs-providers udev systemd qemu-system-x86 qemu-system-common qemu-block-extra qemu-utils jq nfs-common && apt-get clean
ADD sleep.sh /usr/bin/
ADD install.sh /usr/bin/
ADD scripts /opt/scripts/

View File

@@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v2.2.2
VITASTOR_VERSION ?= v2.3.0
all: build push

View File

@@ -0,0 +1,3 @@
Package: *
Pin: release n=bookworm-backports
Pin-Priority: 500

View File

@@ -4,7 +4,7 @@
#
# Desired Vitastor version
VITASTOR_VERSION=v2.2.2
VITASTOR_VERSION=v2.3.0
# Additional arguments for all containers
# For example, you may want to specify a custom logging driver here

View File

@@ -25,6 +25,9 @@ affect their interaction with the cluster.
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
- [hostname](#hostname)
- [ublk_queue_depth](#ublk_queue_depth)
- [ublk_max_io_size](#ublk_max_io_size)
- [qemu_file_mirror_path](#qemu_file_mirror_path)
## client_iothread_count
@@ -225,3 +228,28 @@ without destroying and recreating OSDs.
Clients use host name to find their distance to OSDs when [localized reads](pool.en.md#local_reads)
are enabled. By default, standard [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html)
function is used to determine host name, but you can also override it with this parameter.
## ublk_queue_depth
- Type: integer
- Default: 256
Default queue depth for [Vitastor ublk servers](../usage/ublk.en.md).
## ublk_max_io_size
- Type: integer
Default maximum I/O size for Vitastor [ublk servers](../usage/ublk.en.md).
The largest of 1 MB and pool block size multiplied by EC data chunk count is used if not specified.
## qemu_file_mirror_path
- Type: string
When set to an FS directory path (for example, `/mnt/vitastor/`), `qemu-img info` and similar
QAPI commands return the name of the image inside this directory instead of normal
`vitastor://?image=abc` URI as `filename`.
This allows to then mount this path using [vitastor-nfs](../usage/nfs.en.md) and trick
third-party systems like Veeam which rely on `filename` in the image info but don't support Vitastor.

View File

@@ -25,6 +25,9 @@
- [nbd_max_part](#nbd_max_part)
- [osd_nearfull_ratio](#osd_nearfull_ratio)
- [hostname](#hostname)
- [ublk_queue_depth](#ublk_queue_depth)
- [ublk_max_io_size](#ublk_max_io_size)
- [qemu_file_mirror_path](#qemu_file_mirror_path)
## client_iothread_count
@@ -230,3 +233,30 @@ RDMA и хотите повысить пиковую производитель
[локальные чтения](pool.ru.md#local_reads). По умолчанию для определения имени
хоста используется стандартная функция [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html),
но вы также можете задать имя хоста вручную данным параметром.
## ublk_queue_depth
- Тип: целое число
- Значение по умолчанию: 256
Глубина очереди по умолчанию для [ublk-серверов Vitastor](../usage/ublk.ru.md).
## ublk_max_io_size
- Тип: целое число
Максимальный размер запроса ввода-вывода для [ublk-серверов Vitastor](../usage/ublk.ru.md).
Если не задан, используется максимум из 1 МБ и размера блока пула, умноженного на число частей
данных EC-пула.
## qemu_file_mirror_path
- Тип: строка
Если установить эту опцию равной пути к каталогу в ФС, команда `qemu-img info` и подобные
команды QAPI будут возвращать в поле `filename` имя образа внутри заданного каталога вместо
обычного адреса типа `vitastor://?image=abc`.
Это позволяет смонтировать этот путь с помощью [vitastor-nfs](../usage/nfs.ru.md) и обмануть
сторонние системы типа Veeam, которые полагаются на поле `filename` в информации об образе QEMU,
но не поддерживают Vitastor.

View File

@@ -74,7 +74,7 @@ Consider `use_rdmacm` for such networks.
## use_rdmacm
- Type: boolean
- Default: true
- Default: false
Use an alternative implementation of RDMA through RDMA-CM (Connection
Manager). Works with all RDMA networks: Infiniband, iWARP and

View File

@@ -74,7 +74,7 @@ RDMA-устройства, но они не имеют соединения с
## use_rdmacm
- Тип: булево (да/нет)
- Значение по умолчанию: true
- Значение по умолчанию: false
Использовать альтернативную реализацию RDMA на основе RDMA-CM (Connection
Manager). Работает со всеми типами RDMA-сетей: Infiniband, iWARP и

View File

@@ -491,7 +491,7 @@ Can be used to slow down scrubbing if it affects user load too much.
## scrub_list_limit
- Type: integer
- Default: 1000
- Default: 262144
- Can be changed online: yes
Number of objects to list in one listing operation during scrub.

View File

@@ -514,7 +514,7 @@ fsync небезопасным даже с режимом "directsync".
## scrub_list_limit
- Тип: целое число
- Значение по умолчанию: 1000
- Значение по умолчанию: 262144
- Можно менять на лету: да
Размер загружаемых за одну операцию списков объектов в процессе фоновой

View File

@@ -283,3 +283,36 @@
[локальные чтения](pool.ru.md#local_reads). По умолчанию для определения имени
хоста используется стандартная функция [gethostname](https://man7.org/linux/man-pages/man2/gethostname.2.html),
но вы также можете задать имя хоста вручную данным параметром.
- name: ublk_queue_depth
type: int
default: 256
online: false
info: Default queue depth for [Vitastor ublk servers](../usage/ublk.en.md).
info_ru: Глубина очереди по умолчанию для [ublk-серверов Vitastor](../usage/ublk.ru.md).
- name: ublk_max_io_size
type: int
online: false
info: |
Default maximum I/O size for Vitastor [ublk servers](../usage/ublk.en.md).
The largest of 1 MB and pool block size multiplied by EC data chunk count is used if not specified.
info_ru: |
Максимальный размер запроса ввода-вывода для [ublk-серверов Vitastor](../usage/ublk.ru.md).
Если не задан, используется максимум из 1 МБ и размера блока пула, умноженного на число частей
данных EC-пула.
- name: qemu_file_mirror_path
type: string
info: |
When set to an FS directory path (for example, `/mnt/vitastor/`), `qemu-img info` and similar
QAPI commands return the name of the image inside this directory instead of normal
`vitastor://?image=abc` URI as `filename`.
This allows to then mount this path using [vitastor-nfs](../usage/nfs.en.md) and trick
third-party systems like Veeam which rely on `filename` in the image info but don't support Vitastor.
info_ru: |
Если установить эту опцию равной пути к каталогу в ФС, команда `qemu-img info` и подобные
команды QAPI будут возвращать в поле `filename` имя образа внутри заданного каталога вместо
обычного адреса типа `vitastor://?image=abc`.
Это позволяет смонтировать этот путь с помощью [vitastor-nfs](../usage/nfs.ru.md) и обмануть
сторонние системы типа Veeam, которые полагаются на поле `filename` в информации об образе QEMU,
но не поддерживают Vitastor.

View File

@@ -24,6 +24,8 @@
{{../../installation/kubernetes.en.md}}
{{../../installation/s3.en.md}}
{{../../installation/source.en.md}}
{{../../config.en.md|indent=1}}
@@ -54,6 +56,8 @@
{{../../usage/fio.en.md}}
{{../../usage/ublk.en.md}}
{{../../usage/nbd.en.md}}
{{../../usage/qemu.en.md}}

View File

@@ -26,6 +26,8 @@
{{../../installation/source.ru.md}}
{{../../installation/s3.ru.md}}
{{../../config.ru.md|indent=1}}
{{../../config/common.ru.md|indent=2}}
@@ -54,6 +56,8 @@
{{../../usage/fio.ru.md}}
{{../../usage/ublk.ru.md}}
{{../../usage/nbd.ru.md}}
{{../../usage/qemu.ru.md}}

View File

@@ -51,7 +51,7 @@
Рассмотрите включение `use_rdmacm` для таких сетей.
- name: use_rdmacm
type: bool
default: true
default: false
info: |
Use an alternative implementation of RDMA through RDMA-CM (Connection
Manager). Works with all RDMA networks: Infiniband, iWARP and

View File

@@ -566,7 +566,7 @@
сильно влияет на пользовательскую нагрузку.
- name: scrub_list_limit
type: int
default: 1000
default: 262144
online: true
info: |
Number of objects to list in one listing operation during scrub.

View File

@@ -26,9 +26,9 @@ at Vitastor Kubernetes operator: https://github.com/Antilles7227/vitastor-operat
The instruction is very simple.
1. Download a Docker image of the desired version: \
`docker pull vitastor:v2.2.2`
`docker pull vitalif/vitastor:v2.3.0`
2. Install scripts to the host system: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.2 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitalif/vitastor:v2.3.0 install.sh`
3. Reload udev rules: \
`udevadm control --reload-rules`

View File

@@ -25,9 +25,9 @@ Vitastor можно установить в Docker/Podman. При этом etcd,
Инструкция по установке максимально простая.
1. Скачайте Docker-образ желаемой версии: \
`docker pull vitastor:v2.2.2`
`docker pull vitalif/vitastor:v2.3.0`
2. Установите скрипты в хост-систему командой: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.2 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitalif/vitastor:v2.3.0 install.sh`
3. Перезагрузите правила udev: \
`udevadm control --reload-rules`

View File

@@ -11,12 +11,20 @@
- Trust Vitastor package signing key:
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
- Add Vitastor package repository to your /etc/apt/sources.list:
- Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
- Debian 13 (Trixie/Sid): `deb https://vitastor.io/debian trixie main`
- Debian 12 (Bookworm): `deb https://vitastor.io/debian bookworm main`
- Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- Ubuntu 22.04 (Jammy): `deb https://vitastor.io/debian jammy main`
- Ubuntu 24.04 (Noble): `deb https://vitastor.io/debian noble main`
- Add `-oldstable` to bookworm/bullseye/buster in this line to install the last
stable version from 0.9.x branch instead of 1.x
- To always prefer vitastor-patched QEMU and Libvirt versions, add the following to `/etc/apt/preferences`:
```
Package: *
Pin: origin "vitastor.io"
Pin-Priority: 501
```
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
## CentOS
@@ -42,7 +50,6 @@
recommended because io_uring is a relatively new technology and there is
at least one bug which reproduces with io_uring and HP SmartArray
controllers in 5.4
- liburing 0.4 or newer
- lp_solve
- etcd 3.4.15 or newer. Earlier versions won't work because of various bugs,
for example [#12402](https://github.com/etcd-io/etcd/pull/12402).

View File

@@ -11,12 +11,20 @@
- Добавьте ключ репозитория Vitastor:
`wget https://vitastor.io/debian/pubkey.gpg -O /etc/apt/trusted.gpg.d/vitastor.gpg`
- Добавьте репозиторий Vitastor в /etc/apt/sources.list:
- Debian 12 (Bookworm/Sid): `deb https://vitastor.io/debian bookworm main`
- Debian 13 (Trixie/Sid): `deb https://vitastor.io/debian trixie main`
- Debian 12 (Bookworm): `deb https://vitastor.io/debian bookworm main`
- Debian 11 (Bullseye): `deb https://vitastor.io/debian bullseye main`
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main`
- Ubuntu 22.04 (Jammy): `deb https://vitastor.io/debian jammy main`
- Ubuntu 24.04 (Noble): `deb https://vitastor.io/debian noble main`
- Добавьте `-oldstable` к слову bookworm/bullseye/buster в этой строке, чтобы
установить последнюю стабильную версию из ветки 0.9.x вместо 1.x
- Чтобы всегда предпочитались версии пакетов QEMU и Libvirt с патчами Vitastor, добавьте в `/etc/apt/preferences`:
```
Package: *
Pin: origin "vitastor.io"
Pin-Priority: 501
```
- Установите пакеты: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu-system-x86`
## CentOS
@@ -41,7 +49,6 @@
- Ядро Linux 5.4 или новее, для поддержки io_uring. Рекомендуется даже 5.8,
так как io_uring - относительно новый интерфейс и в версиях до 5.8 встречались
некоторые баги, например, зависание с io_uring и контроллером HP SmartArray
- liburing 0.4 или новее
- lp_solve
- etcd 3.4.15 или новее. Более старые версии не будут работать из-за разных багов,
например, [#12402](https://github.com/etcd-io/etcd/pull/12402).

View File

@@ -9,7 +9,7 @@
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.x are supported):
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
bookworm for 8.1+, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
trixie for 9.0+, bookworm for 8.1+, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
- Define storage in `/etc/pve/storage.cfg` (see below)
- Block network access from VMs to Vitastor network (to OSDs and etcd),

View File

@@ -9,7 +9,7 @@
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.x):
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
bookworm для 8.1+, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
trixie для 9.0+, bookworm для 8.1+, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию

View File

@@ -15,7 +15,7 @@
- gcc and g++ 8 or newer, clang 10 or newer, or other compiler with C++11 plus
designated initializers support from C++20
- CMake
- liburing, jerasure headers and libraries
- jerasure headers and libraries
- ISA-L, libibverbs and librdmacm headers and libraries (optional)
- tcmalloc (google-perftools-dev)

View File

@@ -15,7 +15,7 @@
- gcc и g++ >= 8, либо clang >= 10, либо другой компилятор с поддержкой C++11 плюс
назначенных инициализаторов (designated initializers) из C++20
- CMake
- Заголовки и библиотеки liburing, jerasure
- Заголовки и библиотеки jerasure
- Опционально - заголовки и библиотеки ISA-L, libibverbs, librdmacm
- tcmalloc (google-perftools-dev)

View File

@@ -52,7 +52,7 @@
- Generic user-space client library
- [Native QEMU driver](../usage/qemu.en.md)
- [Loadable fio engine for benchmarks](../usage/fio.en.md)
- [NBD proxy for kernel mounts](../usage/nbd.en.md)
- [UBLK](../usage/ublk.en.md) and [NBD](../usage/nbd.en.md) servers for kernel mounts
- [Simplified NFS proxy for file-based image access emulation (suitable for VMWare)](../usage/nfs.en.md#pseudo-fs)
## Roadmap

View File

@@ -54,7 +54,7 @@
- Общая пользовательская клиентская библиотека для работы с кластером
- [Драйвер диска для QEMU](../usage/qemu.ru.md)
- [Драйвер диска для утилиты тестирования производительности fio](../usage/fio.ru.md)
- [NBD-прокси для монтирования образов ядром](../usage/nbd.ru.md) ("блочное устройство в режиме пользователя")
- [UBLK](../usage/ublk.ru.md) и [NBD](../usage/nbd.ru.md) серверы для монтирования образов ядром ("блочное устройство в режиме пользователя")
- [Упрощённая NFS-прокси для эмуляции файлового доступа к образам (подходит для VMWare)](../usage/nfs.ru.md#псевдо-фс)
## Планы развития

View File

@@ -73,6 +73,8 @@ Options (automatic mode):
--max_other 10%
Use disks for OSD data even if they already have non-Vitastor partitions,
but only if these take up no more than this percent of disk space.
--dry-run
Check and print new OSD count for each disk but do not actually create them.
```
Options (single-device mode):

View File

@@ -74,6 +74,8 @@ vitastor-disk - инструмент командной строки для уп
--max_other 10%
Использовать диски под данные OSD, даже если на них уже есть не-Vitastor-овые
разделы, но только в случае, если они занимают не более данного процента диска.
--dry-run
Проверить и вывести число новых OSD для каждого диска, но не создавать их.
```
Опции для режима одного OSD:

View File

@@ -89,6 +89,8 @@ POSIX features currently not implemented in VitastorFS:
instead of actually allocated space
- Access times (`atime`) are not tracked (like `-o noatime`)
- Modification time (`mtime`) is updated lazily every second (like `-o lazytime`)
- Permission enforcement is disabled by default (and Linux NFS client doesn't
enforce them too). Use `--enforce 1` to enable it.
Other notable missing features which should be addressed in the future:
- Inode ID reuse. Currently inode IDs always grow, the limit is 2^48 inodes, so
@@ -258,4 +260,5 @@ Options:
| `--nfspath <PATH>` | set NFS export path to \<PATH> (default is /) |
| `--pidfile <FILE>` | write process ID to the specified file |
| `--logfile <FILE>` | log to the specified file |
| `--enforce 1` | enforce permissions at the server side (no by default) |
| `--foreground 1` | stay in foreground, do not daemonize |

View File

@@ -91,6 +91,8 @@ JSON-формате :-). Для инспекции содержимого БД
stat(2), так что `du` всегда показывает сумму размеров файлов, а не фактически занятое место
- Времена доступа (`atime`) не отслеживаются (как будто ФС смонтирована с `-o noatime`)
- Времена модификации (`mtime`) отслеживаются асинхронно (как будто ФС смонтирована с `-o lazytime`)
- Привилегии доступа по умолчанию не проверяются сервером (клиент NFS Linux их также не проверяет).
Чтобы включить проверки, используйте опцию `--enforce 1`.
Другие недостающие функции, которые нужно добавить в будущем:
- Переиспользование номеров инодов. В текущей реализации номера инодов всё время
@@ -270,4 +272,5 @@ VitastorFS из GPUDirect.
| `--nfspath <PATH>` | установить путь NFS-экспорта в \<PATH> (по умолчанию /) |
| `--pidfile <FILE>` | записать ID процесса в заданный файл |
| `--logfile <FILE>` | записывать логи в заданный файл |
| `--enforce 1` | проверять права доступа на стороне сервера (по умолчанию нет) |
| `--foreground 1` | не уходить в фон после запуска |

View File

@@ -130,23 +130,16 @@ Linux kernel, starting with version 5.15, supports a new interface for attaching
to the host - VDUSE (vDPA Device in Userspace). QEMU, starting with 7.2, has support for
exporting QEMU block devices over this protocol using qemu-storage-daemon.
VDUSE is currently the best interface to attach Vitastor disks as kernel devices because:
- It avoids data copies and thus achieves much better performance than [NBD](nbd.en.md)
- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long
VDUSE advantages:
- VDUSE copies memory 1 time instead of 2, and is thus faster than [NBD](nbd.en.md) for linear read/write.
- It doesn't have NBD timeout problem - the device doesn't die if an operation executes for too long.
- It doesn't have hung device problem - if the userspace process dies it can be restarted (!)
and block device will continue operation
- It doesn't seem to have the device number limit
and block device will continue operation (UBLK can do it too).
- It doesn't seem to have the device number limit (UBLK also doesn't).
Example performance comparison:
| | direct fio | NBD | VDUSE |
|----------------------|-------------|-------------|-------------|
| linear write | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k random write Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k random write Q1 | 9500 iops | 7620 iops | 7640 iops |
| linear read | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k random read Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k random read Q1 | 9600 iops | 7640 iops | 7780 iops |
At the same time, VDUSE may be slower or faster than [UBLK](ublk.en.md) for linear read/write,
and iops-wise it's sometimes even slower than NBD. See performance comparison examples at the page [UBLK](ublk.en.md).
To try VDUSE you need at least Linux 5.15, built with VDUSE support
(CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
@@ -193,3 +186,12 @@ To remove the device:
vdpa dev del test1
kill <qemu-storage-daemon_process_PID>
```
## Veeam
Vitastor QEMU driver has a feature that allows to trick third-party systems like Veeam not able to parse qemu-img
vitastor URIs: [qemu_file_mirror_path](../config/client.en.md#qemu_file_mirror_path).
To make such systems work, you should set this option to an FS directory path (for example, `/mnt/vitastor/`) and
mount this directory using [`vitastor-nfs mount --block`](../usage/nfs.en.md). It will make them access
your images using files and, hopefully, succeed in doing their normal job :).

View File

@@ -132,24 +132,16 @@ qemu-system-x86_64 -enable-kvm -m 2048 -M accel=kvm,memory-backend=mem \
к системе - VDUSE (vDPA Device in Userspace), а в QEMU, начиная с версии 7.2, есть поддержка
экспорта блочных устройств QEMU по этому протоколу через qemu-storage-daemon.
VDUSE - на данный момент лучший интерфейс для подключения дисков Vitastor в виде блочных
устройств на уровне ядра, ибо:
- VDUSE не копирует данные и поэтому достигает значительно лучшей производительности, чем [NBD](nbd.ru.md)
- Также оно не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго
- Также оно не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
перезапустить (!) и блочное устройство продолжит работать
- По-видимому, у него нет предела числа подключаемых в систему устройств
Преимущества VDUSE:
Пример сравнения производительности:
- VDUSE копирует данные 1 раз, а не 2, и поэтому он быстрее, чем [NBD](nbd.ru.md) при линейном доступе.
- VDUSE не имеет проблемы NBD-таймаута - устройство не умирает, если операция выполняется слишком долго.
- VDUSE не имеет проблемы подвисающих устройств - если процесс-обработчик умирает, его можно
перезапустить (!) и блочное устройство продолжит работать (в UBLK это тоже поддерживается).
- По-видимому, у него нет предела числа подключаемых в систему устройств (в UBLK лимита тоже нет).
| | Прямой fio | NBD | VDUSE |
|--------------------------|-------------|-------------|-------------|
| линейная запись | 3.85 GB/s | 1.12 GB/s | 3.85 GB/s |
| 4k случайная запись Q128 | 240000 iops | 120000 iops | 178000 iops |
| 4k случайная запись Q1 | 9500 iops | 7620 iops | 7640 iops |
| линейное чтение | 4.3 GB/s | 1.8 GB/s | 2.85 GB/s |
| 4k случайное чтение Q128 | 287000 iops | 140000 iops | 189000 iops |
| 4k случайное чтение Q1 | 9600 iops | 7640 iops | 7780 iops |
Однако, при линейном доступе VDUSE может быть медленнее UBLK (а может быть и быстрее), а по iops
VDUSE иногда даже медленнее NBD. Пример сравнения производительности смотрите на странице [UBLK](ublk.ru.md).
Чтобы попробовать VDUSE, вам нужно ядро Linux как минимум версии 5.15, собранное с поддержкой
VDUSE (CONFIG_VDPA=m, CONFIG_VDPA_USER=m, CONFIG_VIRTIO_VDPA=m).
@@ -196,3 +188,12 @@ vdpa dev add name test1 mgmtdev vduse
vdpa dev del test1
kill <PID_процесса_qemu-storage-daemon>
```
## Veeam
Драйвер Vitastor QEMU имеет функцию, которая позволяет обманывать сторонние системы типа Veeam, которые
не могут сами по себе разобрать адреса дисков в vitastor: [qemu_file_mirror_path](../config/client.ru.md#qemu_file_mirror_path).
Чтобы заставить такие системы работать, вам нужно установить эту опцию равной пути к некоторому каталогу
в ФС (например, `/mnt/vitastor/`) и примонтировать этот каталог с помощью [`vitastor-nfs mount --block`](../usage/nfs.ru.md).
Они начнут обращаться к образам как к файлам и, вероятно, смогут заработать корректно :).

116
docs/usage/ublk.en.md Normal file
View File

@@ -0,0 +1,116 @@
[Documentation](../../README.md#documentation) → Usage → UBLK
-----
[Читать на русском](ublk.ru.md)
# UBLK
[ublk](https://docs.kernel.org/block/ublk.html) is a new io_uring-based Linux interface
for user-space block device drivers, available since Linux 6.0.
It's not zero-copy, but it's still a fast implementation, outperforming both [NBD](nbd.en.md)
and [VDUSE](qemu.en.md#vduse) iops-wise and may or may not outperform VDUSE in linear I/O MB/s.
ublk also allows to recover devices even if the server (vitastor-ublk process) dies.
## Example performance comparison
TCP (100G), 3 hosts each with 6 NVMe OSDs, 3 replicas, single client
| | direct fio | NBD | VDUSE | UBLK |
|----------------------|-------------|-------------|------------|-------------|
| linear write | 3807 MB/s | 1832 MB/s | 3226 MB/s | 3027 MB/s |
| linear read | 3067 MB/s | 1885 MB/s | 1800 MB/s | 2076 MB/s |
| 4k random write Q128 | 128624 iops | 91060 iops | 94621 iops | 149450 iops |
| 4k random read Q128 | 117769 iops | 153408 iops | 93157 iops | 171987 iops |
| 4k random write Q1 | 8090 iops | 6442 iops | 6316 iops | 7272 iops |
| 4k random read Q1 | 9474 iops | 7200 iops | 6840 iops | 8038 iops |
RDMA (100G), 3 hosts each with 6 NVMe OSDs, 3 replicas, single client
| | direct fio | NBD | VDUSE | UBLK |
|----------------------|-------------|-------------|-------------|-------------|
| linear write | 6998 MB/s | 1878 MB/s | 4249 MB/s | 3140 MB/s |
| linear read | 8628 MB/s | 3389 MB/s | 5062 MB/s | 3674 MB/s |
| 4k random write Q128 | 222541 iops | 181589 iops | 138281 iops | 218222 iops |
| 4k random read Q128 | 412647 iops | 239987 iops | 151663 iops | 269583 iops |
| 4k random write Q1 | 11601 iops | 8592 iops | 9111 iops | 10000 iops |
| 4k random read Q1 | 10102 iops | 7788 iops | 8111 iops | 8965 iops |
## Commands
vitastor-ublk supports the following commands:
- [map](#map)
- [unmap](#unmap)
- [ls](#ls)
## map
To create a local block device for a Vitastor image run:
```
vitastor-ublk map [/dev/ublkbN] --image testimg
```
It will output a block device name like /dev/ublkb0 which you can then use as a normal disk.
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want.
vitastor-ublk supports all usual Vitastor configuration options like `--config_path <path_to_config>` plus ublk-specific:
* `--recover` \
Recover a mapped device if the previous ublk server is dead.
* `--queue_depth 256` \
Maximum queue size for the device.
* `--max_io_size 1M` \
Maximum single I/O size for the device. Default: `max(1 MB, pool block size * EC part count)`.
* `--readonly` \
Make the device read-only.
* `--hdd` \
Mark the device as rotational.
* `--logfile /path/to/log/file.txt` \
Write log messages to the specified file instead of dropping them (in background mode)
or printing them to the standard output (in foreground mode).
* `--dev_num N` \
Use the specified device /dev/ublkbN instead of automatic selection (alternative syntax
to /dev/ublkbN positional parameter).
* `--foreground 1` \
Stay in foreground, do not daemonize.
Note that `ublk_queue_depth` and `ublk_max_io_size` may also be specified
in `/etc/vitastor/vitastor.conf` or in other configuration file specified with `--config_path`.
## unmap
To unmap the device run:
```
vitastor-ublk unmap /dev/ublkb0
```
## ls
```
vitastor-ublk ls [--json]
```
List mapped images.
Example output (normal format):
```
/dev/ublkb0
image: bench
pid: 584536
/dev/ublkb1
image: bench1
pid: 584546
```
Example output (JSON format):
```
{"/dev/ublkb0": {"image": "bench", "pid": 584536}, "/dev/ublkb1": {"image": "bench1", "pid": 584546}}
```

121
docs/usage/ublk.ru.md Normal file
View File

@@ -0,0 +1,121 @@
[Документация](../../README-ru.md#документация) → Использование → UBLK
-----
[Read in English](ublk.en.md)
# UBLK
[ublk](https://docs.kernel.org/block/ublk.html) - это новый Linux-интерфейс на основе io_uring
для реализации блочных устройств в пространстве пользователя, доступный, начиная с Linux 6.0.
ublk тоже копирует память (т.е. не является zero-copy), но по IOPS всё равно обгоняет и
[NBD](nbd.ru.md), и [VDUSE](qemu.ru.md#vduse), и иногда может даже обгонять VDUSE по
скорости линейного доступа. Также ublk позволяет оживлять устройства, у которых умер
сервер (процесс-обработчик vitastor-ublk).
## Пример сравнения производительности
TCP (100G), 3 сервера с 6 NVMe OSD каждый, 3 реплики, один клиент
| | Прямой fio | NBD | VDUSE | UBLK |
|--------------------------|-------------|-------------|------------|-------------|
| линейная запись | 3807 MB/s | 1832 MB/s | 3226 MB/s | 3027 MB/s |
| линейное чтение | 3067 MB/s | 1885 MB/s | 1800 MB/s | 2076 MB/s |
| 4k случайная запись Q128 | 128624 iops | 91060 iops | 94621 iops | 149450 iops |
| 4k случайное чтение Q128 | 117769 iops | 153408 iops | 93157 iops | 171987 iops |
| 4k случайная запись Q1 | 8090 iops | 6442 iops | 6316 iops | 7272 iops |
| 4k случайное чтение Q1 | 9474 iops | 7200 iops | 6840 iops | 8038 iops |
RDMA (100G), 3 сервера с 6 NVMe OSD каждый, 3 реплики, один клиент
| | Прямой fio | NBD | VDUSE | UBLK |
|--------------------------|-------------|-------------|-------------|-------------|
| линейная запись | 6998 MB/s | 1878 MB/s | 4249 MB/s | 3140 MB/s |
| линейное чтение | 8628 MB/s | 3389 MB/s | 5062 MB/s | 3674 MB/s |
| 4k случайная запись Q128 | 222541 iops | 181589 iops | 138281 iops | 218222 iops |
| 4k случайное чтение Q128 | 412647 iops | 239987 iops | 151663 iops | 269583 iops |
| 4k случайная запись Q1 | 11601 iops | 8592 iops | 9111 iops | 10000 iops |
| 4k случайное чтение Q1 | 10102 iops | 7788 iops | 8111 iops | 8965 iops |
## Команды
vitastor-ublk поддерживает следующие команды:
- [map](#map)
- [unmap](#unmap)
- [ls](#ls)
## map
Чтобы создать локальное блочное устройство для образа, выполните команду:
```
vitastor-ublk map [/dev/ublkbN] --image testimg
```
Команда напечатает название блочного устройства вида /dev/ublkb0, которое потом можно
будет использовать как обычный диск.
Для обращения по номеру инода, аналогично другим командам, можно использовать опции
`--pool <POOL> --inode <INODE> --size <SIZE>` вместо `--image testimg`.
vitastor-ublk поддерживает все обычные опции Vitastor, например, `--config_path <path_to_config>`,
плюс специфичные для ublk:
* `--recover` \
Восстановить ранее подключённое устройство, у которого умер обработчик.
* `--queue_depth 256` \
Максимальная глубина очереди устройства.
* `--max_io_size 1M` \
Максимальный размер запроса ввода-вывода для устройства. По умолчанию: `max(1 MB, блок данных пула * число частей данных EC)`.
* `--readonly` \
Подключить устройство в режиме только для чтения.
* `--hdd` \
Пометить устройство как вращающийся жёсткий диск (флаг rotational).
* `--logfile /path/to/log/file.txt` \
Писать сообщения о процессе работы в заданный файл, вместо пропуска их
при фоновом режиме запуска или печати на стандартный вывод при запуске
в консоли с `--foreground 1`.
* `--dev_num N` \
Использовать заданное устройство `/dev/ublkbN` вместо автоматического подбора.
* `--foreground 1` \
Не уводить процесс в фоновый режим.
Обратите внимание, что опции `ublk_queue_depth` и `ublk_max_io_size` можно
также задавать в `/etc/vitastor/vitastor.conf` или в другом файле конфигурации,
заданном опцией `--config_path`.
## unmap
Для отключения устройства выполните:
```
vitastor-ublk unmap /dev/ublkb0
```
## ls
```
vitastor-ublk ls [--json]
```
Вывести подключённые устройства.
Пример вывода в обычном формате:
```
/dev/ublkb0
image: bench
pid: 584536
/dev/ublkb1
image: bench1
pid: 584546
```
Пример вывода в JSON-формате:
```
{"/dev/ublkb0": {"image": "bench", "pid": 584536}, "/dev/ublkb1": {"image": "bench1", "pid": 584546}}
```

1
emhash Submodule

Submodule emhash added at b7ff3147a5

View File

@@ -96,6 +96,7 @@ class Mon
}
else
{
res.setHeader('Content-Type', 'text/plain; version=0.0.4; charset=utf-8');
res.write(export_prometheus_metrics(this.state));
}
}

View File

@@ -15,7 +15,7 @@ function get_osd_tree(global_config, state)
const stat = state.osd.stats[osd_num];
const osd_cfg = state.config.osd[osd_num];
let reweight = osd_cfg == null ? 1 : Number(osd_cfg.reweight);
if (isNaN(reweight) || reweight < 0 || reweight > 0)
if (isNaN(reweight) || reweight < 0 || reweight > 1)
reweight = 1;
if (stat && stat.size && reweight && (state.osd.state[osd_num] || Number(stat.time) >= down_time ||
osd_cfg && osd_cfg.noout))
@@ -179,7 +179,7 @@ function filter_osds_by_block_layout(orig_tree, osd_stats, block_size, bitmap_gr
if (orig_tree[osd].level === 'osd')
{
const osd_stat = osd_stats[osd];
if (osd_stat && (osd_stat.bs_block_size && osd_stat.bs_block_size != block_size ||
if (osd_stat && (osd_stat.data_block_size && osd_stat.data_block_size != block_size ||
osd_stat.bitmap_granularity && osd_stat.bitmap_granularity != bitmap_granularity ||
osd_stat.immediate_commit == 'small' && immediate_commit == 'all' ||
osd_stat.immediate_commit == 'none' && immediate_commit != 'none'))

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "2.2.2",
"version": "2.3.0",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {
@@ -9,7 +9,7 @@
"author": "Vitaliy Filippov",
"license": "UNLICENSED",
"dependencies": {
"antietcd": "^1.1.2",
"antietcd": "^1.1.3",
"sprintf-js": "^1.1.2",
"ws": "^7.2.5"
},

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor",
"version": "2.2.2",
"version": "2.3.0",
"description": "Low-level native bindings to Vitastor client library",
"main": "index.js",
"keywords": [

View File

@@ -261,7 +261,7 @@ sub free_image
my ($vtype, $name, $vmid, undef, undef, undef) = $class->parse_volname($volname);
$class->deactivate_volume($storeid, $scfg, $volname);
my $full_list = run_cli($scfg, [ 'ls', '-l' ]);
my $list = _process_list($scfg, $storeid, $full_list);
my $list = _process_list($scfg, $storeid, $full_list, 0);
# Remove image and all its snapshots
my $rm_names = {
map { ($prefix.$_->{name} => 1) }
@@ -269,6 +269,10 @@ sub free_image
@$list
};
my $children = [ grep { $_->{parent_name} && $rm_names->{$_->{parent_name}} } @$full_list ];
$children = [ grep {
substr($_->{name}, 0, length($prefix.$name)) ne $prefix.$name &&
substr($_->{name}, 0, length($prefix.$name)+1) ne $prefix.$name.'@'
} @$children ];
die "Image has children: ".join(', ', map {
substr($_->{name}, 0, length $prefix) eq $prefix
? substr($_->name, length $prefix)
@@ -288,14 +292,15 @@ sub free_image
sub _process_list
{
my ($scfg, $storeid, $result) = @_;
my ($scfg, $storeid, $result, $skip_snapshot) = @_;
$skip_snapshot = 1 if !defined $skip_snapshot;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my $list = [];
foreach my $el (@$result)
{
next if !$el->{name} || length($prefix) && substr($el->{name}, 0, length $prefix) ne $prefix;
my $name = substr($el->{name}, length $prefix);
next if $name =~ /@/;
next if $skip_snapshot && $name =~ /@/;
my ($owner) = $name =~ /^(?:vm|base)-(\d+)-/s;
next if !defined $owner;
my $parent = !defined $el->{parent_name}
@@ -494,4 +499,55 @@ sub rename_volume
return "${storeid}:${base_name}${target_volname}";
}
sub _monkey_patch_qemu_blockdev_options
{
my ($cfg, $volid, $machine_version, $options) = @_;
my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid);
my $scfg = PVE::Storage::storage_config($cfg, $storeid);
my $plugin = PVE::Storage::Plugin->lookup($scfg->{type});
my ($vtype) = $plugin->parse_volname($volname);
die "cannot use volume of type '$vtype' as a QEMU blockdevice\n"
if $vtype ne 'images' && $vtype ne 'iso' && $vtype ne 'import';
return $plugin->qemu_blockdev_options($scfg, $storeid, $volname, $machine_version, $options);
}
sub qemu_blockdev_options
{
my ($class, $scfg, $storeid, $volname, $machine_version, $options) = @_;
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
$name .= '@'.$options->{'snapshot-name'} if $options->{'snapshot-name'};
if ($scfg->{vitastor_nbd})
{
my $mapped = run_cli($scfg, [ 'ls' ], binary => '/usr/bin/vitastor-nbd');
my ($kerneldev) = grep { $mapped->{$_}->{image} eq $prefix.$name } keys %$mapped;
die "Image not mapped via NBD" if !$kerneldev;
return { driver => 'host_device', filename => $kerneldev };
}
my $blockdev = {
driver => 'vitastor',
image => $prefix.$name,
};
if ($scfg->{vitastor_config_path})
{
$blockdev->{'config-path'} = $scfg->{vitastor_config_path};
}
if ($scfg->{vitastor_etcd_address})
{
# FIXME This is the only exception: etcd_address -> etcd_host for qemu
$blockdev->{'etcd-host'} = $scfg->{vitastor_etcd_address};
}
if ($scfg->{vitastor_etcd_prefix})
{
$blockdev->{'etcd-prefix'} = $scfg->{vitastor_etcd_prefix};
}
return $blockdev;
}
*PVE::Storage::qemu_blockdev_options = *_monkey_patch_qemu_blockdev_options;
1;

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VITASTOR_VERSION = '2.2.2'
VITASTOR_VERSION = '2.3.0'
LOG = logging.getLogger(__name__)

View File

@@ -0,0 +1,637 @@
diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h
index aaad4a3da1..5f5daa8341 100644
--- a/include/libvirt/libvirt-storage.h
+++ b/include/libvirt/libvirt-storage.h
@@ -326,6 +326,7 @@ typedef enum {
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */
VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */
VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */
} virConnectListAllStoragePoolsFlags;
int virConnectListAllStoragePools(virConnectPtr conn,
diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 1e24e41a48..ce359a4cf8 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -7435,7 +7435,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node,
src->configFile = virXPathString("string(./config/@file)", ctxt);
if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP ||
- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS)
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS ||
+ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR)
src->query = virXMLPropString(node, "query");
if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0)
@@ -31871,6 +31872,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src,
case VIR_STORAGE_POOL_MPATH:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c
index b28af7fa56..d1aae6e43e 100644
--- a/src/conf/domain_validate.c
+++ b/src/conf/domain_validate.c
@@ -504,6 +504,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
case VIR_STORAGE_NET_PROTOCOL_RBD:
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
@@ -576,7 +577,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
}
}
- /* internal snapshots and config files are currently supported only with rbd: */
+ /* internal snapshots are currently supported only with rbd: */
if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) {
if (src->snapshot) {
@@ -584,10 +585,14 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src)
_("<snapshot> element is currently supported only with 'rbd' disks"));
return -1;
}
-
+ }
+ /* config files are currently supported only with rbd and vitastor: */
+ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) {
if (src->configFile) {
virReportError(VIR_ERR_XML_ERROR, "%s",
- _("<config> element is currently supported only with 'rbd' disks"));
+ _("<config> element is currently supported only with 'rbd' and 'vitastor' disks"));
return -1;
}
}
diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng
index 183dd5db5e..dcc0d1a778 100644
--- a/src/conf/schemas/domaincommon.rng
+++ b/src/conf/schemas/domaincommon.rng
@@ -2066,6 +2066,35 @@
</element>
</define>
+ <define name="diskSourceNetworkProtocolVitastor">
+ <element name="source">
+ <interleave>
+ <attribute name="protocol">
+ <value>vitastor</value>
+ </attribute>
+ <ref name="diskSourceCommon"/>
+ <optional>
+ <attribute name="name"/>
+ </optional>
+ <optional>
+ <attribute name="query"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="diskSourceNetworkHost"/>
+ </zeroOrMore>
+ <optional>
+ <element name="config">
+ <attribute name="file">
+ <ref name="absFilePath"/>
+ </attribute>
+ <empty/>
+ </element>
+ </optional>
+ <empty/>
+ </interleave>
+ </element>
+ </define>
+
<define name="diskSourceNetworkProtocolISCSI">
<element name="source">
<attribute name="protocol">
@@ -2416,6 +2445,7 @@
<ref name="diskSourceNetworkProtocolSimple"/>
<ref name="diskSourceNetworkProtocolVxHS"/>
<ref name="diskSourceNetworkProtocolNFS"/>
+ <ref name="diskSourceNetworkProtocolVitastor"/>
</choice>
</define>
diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c
index 1dc9365bf2..a8a736be81 100644
--- a/src/conf/storage_conf.c
+++ b/src/conf/storage_conf.c
@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool,
"logical", "disk", "iscsi",
"iscsi-direct", "scsi", "mpath",
"rbd", "sheepdog", "gluster",
- "zfs", "vstorage",
+ "zfs", "vstorage", "vitastor",
);
VIR_ENUM_IMPL(virStoragePoolFormatFileSystem,
@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = {
.formatToString = virStorageFileFormatTypeToString,
}
},
+ {.poolType = VIR_STORAGE_POOL_VITASTOR,
+ .poolOptions = {
+ .flags = (VIR_STORAGE_POOL_SOURCE_HOST |
+ VIR_STORAGE_POOL_SOURCE_NETWORK |
+ VIR_STORAGE_POOL_SOURCE_NAME),
+ },
+ .volOptions = {
+ .defaultFormat = VIR_STORAGE_FILE_RAW,
+ .formatFromString = virStorageVolumeFormatFromString,
+ .formatToString = virStorageFileFormatTypeToString,
+ }
+ },
{.poolType = VIR_STORAGE_POOL_SHEEPDOG,
.poolOptions = {
.flags = (VIR_STORAGE_POOL_SOURCE_HOST |
@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt,
_("element 'name' is mandatory for RBD pool"));
return -1;
}
+ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) {
+ virReportError(VIR_ERR_XML_ERROR, "%s",
+ _("element 'name' is mandatory for Vitastor pool"));
+ return -1;
+ }
if (options->formatFromString) {
g_autofree char *format = NULL;
@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf,
/* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor
* files, so they don't have a target */
if (def->type != VIR_STORAGE_POOL_RBD &&
+ def->type != VIR_STORAGE_POOL_VITASTOR &&
def->type != VIR_STORAGE_POOL_SHEEPDOG &&
def->type != VIR_STORAGE_POOL_GLUSTER &&
def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) {
diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h
index fc67957cfe..720c07ef74 100644
--- a/src/conf/storage_conf.h
+++ b/src/conf/storage_conf.h
@@ -103,6 +103,7 @@ typedef enum {
VIR_STORAGE_POOL_GLUSTER, /* Gluster device */
VIR_STORAGE_POOL_ZFS, /* ZFS */
VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */
+ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */
VIR_STORAGE_POOL_LAST,
} virStoragePoolType;
@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs);
VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \
VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \
VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \
+ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \
VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \
VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \
VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \
diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c
index 8a063be244..dd9c7f11a2 100644
--- a/src/conf/storage_source_conf.c
+++ b/src/conf/storage_source_conf.c
@@ -89,6 +89,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol,
"ssh",
"vxhs",
"nfs",
+ "vitastor",
);
@@ -1314,6 +1315,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol)
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
return 24007;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_RBD:
/* we don't provide a default for RBD */
return 0;
diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h
index ebddf28cd6..873a2be65c 100644
--- a/src/conf/storage_source_conf.h
+++ b/src/conf/storage_source_conf.h
@@ -130,6 +130,7 @@ typedef enum {
VIR_STORAGE_NET_PROTOCOL_SSH,
VIR_STORAGE_NET_PROTOCOL_VXHS,
VIR_STORAGE_NET_PROTOCOL_NFS,
+ VIR_STORAGE_NET_PROTOCOL_VITASTOR,
VIR_STORAGE_NET_PROTOCOL_LAST
} virStorageNetProtocol;
diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c
index 59fa5da372..4739167f5f 100644
--- a/src/conf/virstorageobj.c
+++ b/src/conf/virstorageobj.c
@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload,
return 1;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_RBD:
case VIR_STORAGE_POOL_LAST:
@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj,
(obj->def->type == VIR_STORAGE_POOL_MPATH)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) &&
(obj->def->type == VIR_STORAGE_POOL_RBD)) ||
+ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) &&
+ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) &&
(obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) ||
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) &&
diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c
index db7660aac4..561df34709 100644
--- a/src/libvirt-storage.c
+++ b/src/libvirt-storage.c
@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool)
* VIR_CONNECT_LIST_STORAGE_POOLS_SCSI
* VIR_CONNECT_LIST_STORAGE_POOLS_MPATH
* VIR_CONNECT_LIST_STORAGE_POOLS_RBD
+ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR
* VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG
* VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER
* VIR_CONNECT_LIST_STORAGE_POOLS_ZFS
diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c
index bdd30dd65a..5353e00b4a 100644
--- a/src/libxl/libxl_conf.c
+++ b/src/libxl/libxl_conf.c
@@ -1081,6 +1081,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src,
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c
index ec8de30c01..61eab9606d 100644
--- a/src/libxl/xen_xl.c
+++ b/src/libxl/xen_xl.c
@@ -1461,6 +1461,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src)
case VIR_STORAGE_NET_PROTOCOL_SSH:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
virReportError(VIR_ERR_NO_SUPPORT,
diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c
index 32568d4ae6..e625fa0720 100644
--- a/src/qemu/qemu_block.c
+++ b/src/qemu/qemu_block.c
@@ -731,6 +731,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src,
}
+static virJSONValue *
+qemuBlockStorageSourceGetVitastorProps(virStorageSource *src)
+{
+ virJSONValue *ret = NULL;
+ virStorageNetHostDef *host;
+ size_t i;
+ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER;
+ g_autofree char *etcd = NULL;
+
+ for (i = 0; i < src->nhosts; i++) {
+ host = src->hosts + i;
+ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) {
+ return NULL;
+ }
+ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port);
+ }
+ if (src->nhosts > 0) {
+ etcd = virBufferContentAndReset(&buf);
+ }
+
+ if (virJSONValueObjectAdd(&ret,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ return NULL;
+
+ return ret;
+}
+
+
static virJSONValue *
qemuBlockStorageSourceGetSshProps(virStorageSource *src)
{
@@ -1082,6 +1114,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src,
return NULL;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src)))
+ return NULL;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SSH:
driver = "ssh";
if (!(fileprops = qemuBlockStorageSourceGetSshProps(src)))
@@ -1985,6 +2023,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src,
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_VXHS:
case VIR_STORAGE_NET_PROTOCOL_NFS:
case VIR_STORAGE_NET_PROTOCOL_SSH:
@@ -2365,6 +2404,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src,
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ driver = "vitastor";
+ if (!(location = qemuBlockStorageSourceGetVitastorProps(src)))
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SSH:
if (srcPriv->nbdkitProcess) {
/* disk creation not yet supported with nbdkit, and even if it
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 0d2548d8d4..91121d6e1f 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -4526,7 +4526,8 @@ qemuDomainValidateStorageSource(virStorageSource *src,
if (src->query &&
(actualType != VIR_STORAGE_TYPE_NETWORK ||
(src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS &&
- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) {
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP &&
+ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) {
virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
_("query is supported only with HTTP(S) protocols"));
return -1;
@@ -8954,6 +8955,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src,
break;
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
index 8128154749..afb339b9b0 100644
--- a/src/qemu/qemu_snapshot.c
+++ b/src/qemu/qemu_snapshot.c
@@ -662,6 +662,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
@@ -887,6 +888,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk,
case VIR_STORAGE_NET_PROTOCOL_NONE:
case VIR_STORAGE_NET_PROTOCOL_NBD:
case VIR_STORAGE_NET_PROTOCOL_RBD:
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_GLUSTER:
case VIR_STORAGE_NET_PROTOCOL_ISCSI:
diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c
index e19e032427..59f91f4710 100644
--- a/src/storage/storage_driver.c
+++ b/src/storage/storage_driver.c
@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj,
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_SHEEPDOG:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_LAST:
diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c
index 80681924ea..8a3ade9ec0 100644
--- a/src/storage_file/storage_source_backingstore.c
+++ b/src/storage_file/storage_source_backingstore.c
@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr,
}
+static int
+virStorageSourceParseVitastorColonString(const char *colonstr,
+ virStorageSource *src)
+{
+ char *p, *e, *next;
+ g_autofree char *options = NULL;
+
+ /* optionally skip the "vitastor:" prefix if provided */
+ if (STRPREFIX(colonstr, "vitastor:"))
+ colonstr += strlen("vitastor:");
+
+ options = g_strdup(colonstr);
+
+ p = options;
+ while (*p) {
+ /* find : delimiter or end of string */
+ for (e = p; *e && *e != ':'; ++e) {
+ if (*e == '\\') {
+ e++;
+ if (*e == '\0')
+ break;
+ }
+ }
+ if (*e == '\0') {
+ next = e; /* last kv pair */
+ } else {
+ next = e + 1;
+ *e = '\0';
+ }
+
+ if (STRPREFIX(p, "image=")) {
+ src->path = g_strdup(p + strlen("image="));
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
+ src->query = g_strdup(p + strlen("etcd-prefix="));
+ } else if (STRPREFIX(p, "config-path=")) {
+ src->configFile = g_strdup(p + strlen("config-path="));
+ } else if (STRPREFIX(p, "etcd-host=")) {
+ char *h, *sep;
+
+ h = p + strlen("etcd-host=");
+ while (h < e) {
+ for (sep = h; sep < e; ++sep) {
+ if (*sep == '\\' && (sep[1] == ',' ||
+ sep[1] == ';' ||
+ sep[1] == ' ')) {
+ *sep = '\0';
+ sep += 2;
+ break;
+ }
+ }
+
+ if (virStorageSourceRBDAddHost(src, h) < 0)
+ return -1;
+
+ h = sep;
+ }
+ }
+
+ p = next;
+ }
+
+ if (!src->path) {
+ return -1;
+ }
+
+ return 0;
+}
+
+
static int
virStorageSourceParseNBDColonString(const char *nbdstr,
virStorageSource *src)
@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src,
return -1;
break;
+ case VIR_STORAGE_NET_PROTOCOL_VITASTOR:
+ if (virStorageSourceParseVitastorColonString(path, src) < 0)
+ return -1;
+ break;
+
case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG:
case VIR_STORAGE_NET_PROTOCOL_LAST:
case VIR_STORAGE_NET_PROTOCOL_NONE:
@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src,
return 0;
}
+static int
+virStorageSourceParseBackingJSONVitastor(virStorageSource *src,
+ virJSONValue *json,
+ const char *jsonstr G_GNUC_UNUSED,
+ int opaque G_GNUC_UNUSED)
+{
+ const char *filename;
+ const char *image = virJSONValueObjectGetString(json, "image");
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
+ size_t nservers;
+ size_t i;
+
+ src->type = VIR_STORAGE_TYPE_NETWORK;
+ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR;
+
+ /* legacy syntax passed via 'filename' option */
+ if ((filename = virJSONValueObjectGetString(json, "filename")))
+ return virStorageSourceParseVitastorColonString(filename, src);
+
+ if (!image) {
+ virReportError(VIR_ERR_INVALID_ARG, "%s",
+ _("missing image name in Vitastor backing volume "
+ "JSON specification"));
+ return -1;
+ }
+
+ src->path = g_strdup(image);
+ src->configFile = g_strdup(conf);
+ src->query = g_strdup(etcd_prefix);
+
+ if (servers) {
+ nservers = virJSONValueArraySize(servers);
+
+ src->hosts = g_new0(virStorageNetHostDef, nservers);
+ src->nhosts = nservers;
+
+ for (i = 0; i < nservers; i++) {
+ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i,
+ virJSONValueArrayGet(servers, i)) < 0)
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
static int
virStorageSourceParseBackingJSONRaw(virStorageSource *src,
virJSONValue *json,
@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = {
{"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0},
{"ssh", false, virStorageSourceParseBackingJSONSSH, 0},
{"rbd", false, virStorageSourceParseBackingJSONRBD, 0},
+ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0},
{"raw", true, virStorageSourceParseBackingJSONRaw, 0},
{"nfs", false, virStorageSourceParseBackingJSONNFS, 0},
{"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0},
diff --git a/src/test/test_driver.c b/src/test/test_driver.c
index 25335d9002..cf54069fbe 100644
--- a/src/test/test_driver.c
+++ b/src/test/test_driver.c
@@ -7340,6 +7340,7 @@ testStorageVolumeTypeForPool(int pooltype)
case VIR_STORAGE_POOL_ISCSI_DIRECT:
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_RBD:
+ case VIR_STORAGE_POOL_VITASTOR:
return VIR_STORAGE_VOL_NETWORK;
case VIR_STORAGE_POOL_LOGICAL:
case VIR_STORAGE_POOL_DISK:
diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
index eee75af746..8bd0a57bdd 100644
--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml
+++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='no'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml
index 805950a937..852df0de16 100644
--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml
+++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml
@@ -204,4 +204,11 @@
</enum>
</volOptions>
</pool>
+ <pool type='vitastor' supported='yes'>
+ <volOptions>
+ <defaultFormat type='raw'/>
+ <enum name='targetFormatType'>
+ </enum>
+ </volOptions>
+ </pool>
</storagepoolCapabilities>
diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c
index d5c2531ab8..b19308ac38 100644
--- a/tests/storagepoolxml2argvtest.c
+++ b/tests/storagepoolxml2argvtest.c
@@ -57,6 +57,7 @@ testCompareXMLToArgvFiles(bool shouldFail,
case VIR_STORAGE_POOL_GLUSTER:
case VIR_STORAGE_POOL_ZFS:
case VIR_STORAGE_POOL_VSTORAGE:
+ case VIR_STORAGE_POOL_VITASTOR:
case VIR_STORAGE_POOL_LAST:
default:
VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr);
diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c
index 2010ef1356..072e2ff9e8 100644
--- a/tools/virsh-pool.c
+++ b/tools/virsh-pool.c
@@ -1187,6 +1187,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED)
case VIR_STORAGE_POOL_VSTORAGE:
flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE;
break;
+ case VIR_STORAGE_POOL_VITASTOR:
+ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR;
+ break;
case VIR_STORAGE_POOL_LAST:
break;
}

View File

@@ -0,0 +1,172 @@
Index: pve-qemu-kvm-10.0.2/block/meson.build
===================================================================
--- pve-qemu-kvm-10.0.2.orig/block/meson.build
+++ pve-qemu-kvm-10.0.2/block/meson.build
@@ -126,6 +126,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
Index: pve-qemu-kvm-10.0.2/meson.build
===================================================================
--- pve-qemu-kvm-10.0.2.orig/meson.build
+++ pve-qemu-kvm-10.0.2/meson.build
@@ -1622,6 +1622,26 @@ if not get_option('rbd').auto() or have_
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2514,6 +2534,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4812,6 +4833,7 @@ summary_info += {'fdt support': fd
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
Index: pve-qemu-kvm-10.0.2/meson_options.txt
===================================================================
--- pve-qemu-kvm-10.0.2.orig/meson_options.txt
+++ pve-qemu-kvm-10.0.2/meson_options.txt
@@ -202,6 +202,8 @@ option('pvg', type: 'feature', value: 'a
description: 'macOS paravirtualized graphics support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
Index: pve-qemu-kvm-10.0.2/qapi/block-core.json
===================================================================
--- pve-qemu-kvm-10.0.2.orig/qapi/block-core.json
+++ pve-qemu-kvm-10.0.2/qapi/block-core.json
@@ -3599,7 +3599,7 @@
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
'pbs',
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4725,6 +4725,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -5194,6 +5216,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5674,6 +5697,20 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -5895,6 +5932,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: pve-qemu-kvm-10.0.2/scripts/meson-buildoptions.sh
===================================================================
--- pve-qemu-kvm-10.0.2.orig/scripts/meson-buildoptions.sh
+++ pve-qemu-kvm-10.0.2/scripts/meson-buildoptions.sh
@@ -175,6 +175,7 @@ meson_options_help() {
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' qpl Query Processing Library support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rust Rust support'
@@ -458,6 +459,8 @@ _meson_option_parse() {
--disable-qpl) printf "%s" -Dqpl=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@@ -0,0 +1,172 @@
diff --git a/block/meson.build b/block/meson.build
index 34b1b2a306..24ca0f1e52 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -114,6 +114,7 @@ foreach m : [
[libnfs, 'nfs', files('nfs.c')],
[libssh, 'ssh', files('ssh.c')],
[rbd, 'rbd', files('rbd.c')],
+ [vitastor, 'vitastor', files('vitastor.c')],
]
if m[0].found()
module_ss = ss.source_set()
diff --git a/meson.build b/meson.build
index 41f68d3806..29eaed9ba4 100644
--- a/meson.build
+++ b/meson.build
@@ -1622,6 +1622,26 @@ if not get_option('rbd').auto() or have_block
endif
endif
+vitastor = not_found
+if not get_option('vitastor').auto() or have_block
+ libvitastor_client = cc.find_library('vitastor_client', has_headers: ['vitastor_c.h'],
+ required: get_option('vitastor'))
+ if libvitastor_client.found()
+ if cc.links('''
+ #include <vitastor_c.h>
+ int main(void) {
+ vitastor_c_create_qemu(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+ return 0;
+ }''', dependencies: libvitastor_client)
+ vitastor = declare_dependency(dependencies: libvitastor_client)
+ elif get_option('vitastor').enabled()
+ error('could not link libvitastor_client')
+ else
+ warning('could not link libvitastor_client, disabling')
+ endif
+ endif
+endif
+
glusterfs = not_found
glusterfs_ftruncate_has_stat = false
glusterfs_iocb_has_stat = false
@@ -2506,6 +2526,7 @@ endif
config_host_data.set('CONFIG_OPENGL', opengl.found())
config_host_data.set('CONFIG_PLUGIN', get_option('plugins'))
config_host_data.set('CONFIG_RBD', rbd.found())
+config_host_data.set('CONFIG_VITASTOR', vitastor.found())
config_host_data.set('CONFIG_RDMA', rdma.found())
config_host_data.set('CONFIG_RELOCATABLE', get_option('relocatable'))
config_host_data.set('CONFIG_SAFESTACK', get_option('safe_stack'))
@@ -4813,6 +4834,7 @@ summary_info += {'fdt support': fdt_opt == 'internal' ? 'internal' : fdt}
summary_info += {'libcap-ng support': libcap_ng}
summary_info += {'bpf support': libbpf}
summary_info += {'rbd support': rbd}
+summary_info += {'vitastor support': vitastor}
summary_info += {'smartcard support': cacard}
summary_info += {'U2F support': u2f}
summary_info += {'libusb': libusb}
diff --git a/meson_options.txt b/meson_options.txt
index 59d973bca0..a3e7123980 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -202,6 +202,8 @@ option('pvg', type: 'feature', value: 'auto',
description: 'macOS paravirtualized graphics support')
option('rbd', type : 'feature', value : 'auto',
description: 'Ceph block device driver')
+option('vitastor', type : 'feature', value : 'auto',
+ description: 'Vitastor block device driver')
option('opengl', type : 'feature', value : 'auto',
description: 'OpenGL support')
option('rdma', type : 'feature', value : 'auto',
diff --git a/qapi/block-core.json b/qapi/block-core.json
index b1937780e1..a511193620 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3216,7 +3216,7 @@
'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
'raw', 'rbd',
{ 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
- 'ssh', 'throttle', 'vdi', 'vhdx',
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor',
{ 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
{ 'name': 'virtio-blk-vhost-vdpa', 'if': 'CONFIG_BLKIO' },
@@ -4299,6 +4299,28 @@
'*key-secret': 'str',
'*server': ['InetSocketAddressBase'] } }
+##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
##
# @ReplicationMode:
#
@@ -4767,6 +4789,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'virtio-blk-vfio-pci':
{ 'type': 'BlockdevOptionsVirtioBlkVfioPci',
'if': 'CONFIG_BLKIO' },
@@ -5240,6 +5263,20 @@
'*cluster-size' : 'size',
'*encrypt' : 'RbdEncryptionCreateOptions' } }
+##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @location: Where to store the new image file. This location cannot
+# point to a snapshot.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
##
# @BlockdevVmdkSubformat:
#
@@ -5462,6 +5499,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 3e8e00852b..45aff3b6a9 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -175,6 +175,7 @@ meson_options_help() {
printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)'
printf "%s\n" ' qpl Query Processing Library support'
printf "%s\n" ' rbd Ceph block device driver'
+ printf "%s\n" ' vitastor Vitastor block device driver'
printf "%s\n" ' rdma Enable RDMA-based migration'
printf "%s\n" ' replication replication support'
printf "%s\n" ' rust Rust support'
@@ -458,6 +459,8 @@ _meson_option_parse() {
--disable-qpl) printf "%s" -Dqpl=disabled ;;
--enable-rbd) printf "%s" -Drbd=enabled ;;
--disable-rbd) printf "%s" -Drbd=disabled ;;
+ --enable-vitastor) printf "%s" -Dvitastor=enabled ;;
+ --disable-vitastor) printf "%s" -Dvitastor=disabled ;;
--enable-rdma) printf "%s" -Drdma=enabled ;;
--disable-rdma) printf "%s" -Drdma=disabled ;;
--enable-relocatable) printf "%s" -Drelocatable=true ;;

View File

@@ -7,22 +7,24 @@ set -e
VITASTOR=$(dirname $0)
VITASTOR=$(realpath "$VITASTOR/..")
EL=$(rpm --eval '%dist')
if [ "$EL" = ".el8" ]; then
REL=$(rpm --eval '%dist')
REL=${REL##.}
if [ "$REL" = "el8" ]; then
# CentOS 8
. /opt/rh/gcc-toolset-9/enable
elif [ "$EL" = ".el7" ]; then
elif [ "$REL" = "el7" ]; then
# CentOS 7
. /opt/rh/devtoolset-9/enable
fi
cd ~/rpmbuild/SPECS
rpmbuild -bp fio.spec
cd $VITASTOR
VER=$(grep ^Version: rpm/vitastor-el7.spec | awk '{print $2}')
VER=$(grep ^Version: rpm/vitastor-$REL.spec | awk '{print $2}')
rm -rf fio
ln -s ~/rpmbuild/BUILD/fio*/ fio
sh copy-fio-includes.sh
rm fio
mv fio-copy fio
FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
tar --transform "s#^#vitastor-$VER/#" --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-$VER$(rpm --eval '%dist').tar.gz *
perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-$REL.spec
tar --transform "s#^#vitastor-$VER/#" --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-$VER.$REL.tar.gz $(ls | grep -v packages)

16
rpm/vitastor-build.sh Executable file
View File

@@ -0,0 +1,16 @@
#!/bin/bash
set -e -x
REL=$(rpm --eval '%dist')
REL=${REL##.}
cd /root/vitastor/rpm
./build-tarball.sh
VER=$(grep ^Version: vitastor-$REL.spec | awk '{print $2}')
cp /root/vitastor-$VER.$REL.tar.gz ~/rpmbuild/SOURCES
cp vitastor-$REL.spec ~/rpmbuild/SPECS/vitastor.spec
cd ~/rpmbuild/SPECS/
rpmbuild -ba vitastor.spec
mkdir -p /root/vitastor/packages/vitastor-$REL
rm -rf /root/vitastor/packages/vitastor-$REL/*
cp ~/rpmbuild/RPMS/*/*vitastor* /root/vitastor/packages/vitastor-$REL/
cp ~/rpmbuild/SRPMS/vitastor* /root/vitastor/packages/vitastor-$REL/

View File

@@ -1,5 +1,8 @@
# Build packages for CentOS 7 inside a container
# cd ..; podman build -t vitastor-el7 -v `pwd`/packages:/root/packages -f rpm/vitastor-el7.Dockerfile .
# cd ..
# docker build -t vitastor-buildenv:el7 -f rpm/vitastor-el7.Dockerfile .
# docker run -i --rm -v ./:/root/vitastor vitastor-buildenv:el7 /root/vitastor/rpm/vitastor-build.sh
# localedef -i ru_RU -f UTF-8 ru_RU.UTF-8
FROM centos:7
@@ -7,7 +10,9 @@ FROM centos:7
WORKDIR /root
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN sed -i 's/^mirrorlist=/#mirrorlist=/; s!#baseurl=http://mirror.centos.org/centos/\$releasever!baseurl=http://vault.centos.org/7.9.2009!' /etc/yum.repos.d/*.repo
RUN yum -y --enablerepo=extras install centos-release-scl epel-release yum-utils rpm-build
RUN perl -i -pe 's!mirrorlist=!#mirrorlist=!s; s!#\s*baseurl=http://mirror.centos.org!baseurl=http://vault.centos.org!' /etc/yum.repos.d/CentOS-SCLo-scl*.repo
RUN yum -y install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm
RUN yum -y install devtoolset-9-gcc-c++ devtoolset-9-libatomic-devel gcc make cmake gperftools-devel \
fio rh-nodejs12 jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libnl3-devel
@@ -16,32 +21,3 @@ RUN rpm --nomd5 -i fio*.src.rpm
RUN rm -f /etc/yum.repos.d/CentOS-Media.repo
RUN cd ~/rpmbuild/SPECS && yum-builddep -y fio.spec
RUN yum -y install cmake3
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
RUN set -e; \
rpm -i liburing*.src.rpm; \
cd ~/rpmbuild/SPECS/; \
. /opt/rh/devtoolset-9/enable; \
rpmbuild -ba liburing.spec; \
mkdir -p /root/packages/liburing-el7; \
rm -rf /root/packages/liburing-el7/*; \
cp ~/rpmbuild/RPMS/*/liburing* /root/packages/liburing-el7/; \
cp ~/rpmbuild/SRPMS/liburing* /root/packages/liburing-el7/
RUN rpm -i `ls /root/packages/liburing-el7/liburing-*.x86_64.rpm | grep -v debug`
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
VER=$(grep ^Version: vitastor-el7.spec | awk '{print $2}'); \
cp /root/vitastor-$VER.el7.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
mkdir -p /root/packages/vitastor-el7; \
rm -rf /root/packages/vitastor-el7/*; \
cp ~/rpmbuild/RPMS/*/*vitastor* /root/packages/vitastor-el7/; \
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el7/

View File

@@ -1,13 +1,12 @@
Name: vitastor
Version: 2.2.2
Version: 2.3.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.2.el7.tar.gz
Source0: vitastor-2.3.0.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: devtoolset-9-gcc-c++
BuildRequires: rh-nodejs12
@@ -35,8 +34,6 @@ size with configurable redundancy (replication or erasure codes/XOR).
Summary: Vitastor - OSD
Requires: libJerasure2
Requires: libisa-l
Requires: liburing >= 0.6
Requires: liburing < 2
Requires: vitastor-client = %{version}-%{release}
Requires: util-linux
Requires: parted
@@ -60,8 +57,6 @@ scheduling cluster-level operations.
%package -n vitastor-client
Summary: Vitastor - client
Requires: liburing >= 0.6
Requires: liburing < 2
%description -n vitastor-client
@@ -82,7 +77,7 @@ Vitastor library headers for development.
Summary: Vitastor - fio drivers
Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release}
Requires: fio = 3.7-1.el7
Requires: fio = 3.7-2.el7
%description -n vitastor-fio
@@ -169,6 +164,7 @@ chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-ublk
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm

View File

@@ -1,5 +1,7 @@
# Build packages for CentOS 8 inside a container
# cd ..; podman build -t vitastor-el8 -v `pwd`/packages:/root/packages -f rpm/vitastor-el8.Dockerfile .
# cd ..
# docker build -t vitastor-buildenv:el8 -f rpm/vitastor-el8.Dockerfile .
# docker run -i --rm -v ./:/root/vitastor vitastor-buildenv:el8 /root/vitastor/rpm/vitastor-build.sh
FROM centos:8
@@ -15,32 +17,3 @@ RUN dnf -y install gcc-toolset-9 gcc-toolset-9-gcc-c++ gperftools-devel \
RUN dnf download --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --enablerepo=powertools --spec fio.spec
ADD https://vitastor.io/rpms/liburing-el7/liburing-0.7-2.el7.src.rpm /root
RUN set -e; \
rpm -i liburing*.src.rpm; \
cd ~/rpmbuild/SPECS/; \
. /opt/rh/gcc-toolset-9/enable; \
rpmbuild -ba liburing.spec; \
mkdir -p /root/packages/liburing-el8; \
rm -rf /root/packages/liburing-el8/*; \
cp ~/rpmbuild/RPMS/*/liburing* /root/packages/liburing-el8/; \
cp ~/rpmbuild/SRPMS/liburing* /root/packages/liburing-el8/
RUN rpm -i `ls /root/packages/liburing-el8/liburing-*.x86_64.rpm | grep -v debug`
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
VER=$(grep ^Version: vitastor-el8.spec | awk '{print $2}'); \
cp /root/vitastor-$VER.el8.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
mkdir -p /root/packages/vitastor-el8; \
rm -rf /root/packages/vitastor-el8/*; \
cp ~/rpmbuild/RPMS/*/*vitastor* /root/packages/vitastor-el8/; \
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el8/

View File

@@ -1,13 +1,12 @@
Name: vitastor
Version: 2.2.2
Version: 2.3.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.2.el8.tar.gz
Source0: vitastor-2.3.0.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: gcc-toolset-9-gcc-c++
BuildRequires: nodejs >= 10
@@ -34,8 +33,6 @@ size with configurable redundancy (replication or erasure codes/XOR).
Summary: Vitastor - OSD
Requires: libJerasure2
Requires: libisa-l
Requires: liburing >= 0.6
Requires: liburing < 2
Requires: vitastor-client = %{version}-%{release}
Requires: util-linux
Requires: parted
@@ -58,8 +55,6 @@ scheduling cluster-level operations.
%package -n vitastor-client
Summary: Vitastor - client
Requires: liburing >= 0.6
Requires: liburing < 2
%description -n vitastor-client
@@ -80,7 +75,7 @@ Vitastor library headers for development.
Summary: Vitastor - fio drivers
Group: Development/Libraries
Requires: vitastor-client = %{version}-%{release}
Requires: fio = 3.7-3.el8
Requires: fio = 3.19-3.el8
%description -n vitastor-fio
@@ -166,6 +161,7 @@ chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-ublk
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm

View File

@@ -1,5 +1,7 @@
# Build packages for AlmaLinux 9 inside a container
# cd ..; podman build -t vitastor-el9 -v `pwd`/packages:/root/packages -f rpm/vitastor-el9.Dockerfile .
# cd ..
# docker build -t vitastor-buildenv:el9 -f rpm/vitastor-el9.Dockerfile .
# docker run -i --rm -v ./:/root/vitastor vitastor-buildenv:el9 /root/vitastor/rpm/vitastor-build.sh
FROM almalinux:9
@@ -8,22 +10,7 @@ WORKDIR /root
RUN sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/*.repo
RUN dnf -y install epel-release dnf-plugins-core
RUN dnf -y install https://vitastor.io/rpms/centos/9/vitastor-release-1.0-1.el9.noarch.rpm
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive liburing-devel cmake libnl3-devel
RUN dnf -y install gcc-c++ gperftools-devel fio nodejs rpm-build jerasure-devel libisa-l-devel gf-complete-devel rdma-core-devel libarchive cmake libnl3-devel
RUN dnf download --source fio
RUN rpm --nomd5 -i fio*.src.rpm
RUN cd ~/rpmbuild/SPECS && dnf builddep -y --spec fio.spec
ADD . /root/vitastor
RUN set -e; \
cd /root/vitastor/rpm; \
sh build-tarball.sh; \
VER=$(grep ^Version: vitastor-el9.spec | awk '{print $2}'); \
cp /root/vitastor-$VER.el9.tar.gz ~/rpmbuild/SOURCES; \
cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
cd ~/rpmbuild/SPECS/; \
rpmbuild -ba vitastor.spec; \
mkdir -p /root/packages/vitastor-el9; \
rm -rf /root/packages/vitastor-el9/*; \
cp ~/rpmbuild/RPMS/*/*vitastor* /root/packages/vitastor-el9/; \
cp ~/rpmbuild/SRPMS/vitastor* /root/packages/vitastor-el9/

View File

@@ -1,13 +1,12 @@
Name: vitastor
Version: 2.2.2
Version: 2.3.0
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.2.el9.tar.gz
Source0: vitastor-2.3.0.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel
BuildRequires: gcc-c++
BuildRequires: nodejs >= 10
@@ -159,6 +158,7 @@ chown vitastor:vitastor /var/lib/vitastor
%files -n vitastor-client
%_bindir/vitastor-nbd
%_bindir/vitastor-ublk
%_bindir/vitastor-nfs
%_bindir/vitastor-cli
%_bindir/vitastor-rm

View File

@@ -12,20 +12,30 @@ set(WITH_QEMU false CACHE BOOL "Build QEMU driver inside Vitastor source tree")
set(WITH_FIO true CACHE BOOL "Build FIO driver")
set(QEMU_PLUGINDIR qemu CACHE STRING "QEMU plugin directory suffix (qemu-kvm on RHEL)")
set(WITH_ASAN false CACHE BOOL "Build with AddressSanitizer")
set(WITH_SYSTEM_LIBURING false CACHE BOOL "Use system liburing")
if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
if(EXISTS "/etc/debian_version")
set(CMAKE_INSTALL_LIBDIR "lib/${CMAKE_LIBRARY_ARCHITECTURE}")
endif()
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
set(ENABLE_COVERAGE false CACHE BOOL "Enable code coverage")
add_definitions(-DVITASTOR_VERSION="2.2.2")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_definitions(-DVITASTOR_VERSION="2.3.0")
add_definitions(-D_GNU_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -fvisibility=hidden -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})
add_definitions(-fsanitize=address)
add_link_options(-fsanitize=address -fno-omit-frame-pointer)
endif (${WITH_ASAN})
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvisibility-inlines-hidden")
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} -fvisibility-inlines-hidden")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvisibility-inlines-hidden")
if (${ENABLE_COVERAGE})
add_definitions(-coverage)
add_link_options(-coverage)
endif()
set(CMAKE_BUILD_TYPE RelWithDebInfo)
string(REGEX REPLACE "([\\/\\-]O)[^ \t\r\n]*" "\\13" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
@@ -49,7 +59,6 @@ endmacro(install_symlink)
check_include_file("linux/nbd-netlink.h" HAVE_NBD_NETLINK_H)
find_package(PkgConfig)
pkg_check_modules(LIBURING REQUIRED liburing)
if (${WITH_QEMU})
pkg_check_modules(GLIB REQUIRED glib-2.0)
endif (${WITH_QEMU})
@@ -66,13 +75,14 @@ if (RDMACM_LIBRARIES)
add_definitions(-DWITH_RDMACM)
endif (RDMACM_LIBRARIES)
add_custom_target(build_tests)
add_custom_target(test
COMMAND
echo leak:tcmalloc > ${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt &&
env LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_BINARY_DIR}/lsan-suppress.txt ${CMAKE_CTEST_COMMAND}
)
add_dependencies(test build_tests)
if (${WITH_SYSTEM_LIBURING})
pkg_check_modules(LIBURING REQUIRED liburing>=2.10)
include_directories(${LIBURING_INCLUDE_DIRS})
else()
include_directories(${CMAKE_SOURCE_DIR}/src/liburing/include)
add_subdirectory(liburing)
set(LIBURING_LIBRARIES uring)
endif (${WITH_SYSTEM_LIBURING})
include_directories(
../
@@ -86,7 +96,6 @@ include_directories(
${CMAKE_SOURCE_DIR}/src/test
${CMAKE_SOURCE_DIR}/src/util
/usr/include/jerasure
${LIBURING_INCLUDE_DIRS}
${IBVERBS_INCLUDE_DIRS}
)
@@ -101,7 +110,7 @@ add_subdirectory(test)
### Install
install(TARGETS vitastor-osd vitastor-disk vitastor-nbd vitastor-nfs vitastor-cli vitastor-kv vitastor-kv-stress RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(TARGETS vitastor-osd vitastor-disk vitastor-nbd vitastor-ublk vitastor-nfs vitastor-cli vitastor-kv vitastor-kv-stress RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install_symlink(vitastor-disk ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-dump-journal)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)

View File

@@ -2,14 +2,17 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
# libvitastor_blk.so
add_library(vitastor_blk SHARED
../util/allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_disk.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
blockstore_write.cpp blockstore_sync.cpp blockstore_stable.cpp blockstore_rollback.cpp blockstore_flush.cpp ../util/crc32c.c ../util/ringloop.cpp
# libvitastor_blk.a
add_library(vitastor_blk STATIC
../util/allocator.cpp ../util/crc32c.c ../util/ringloop.cpp
multilist.cpp blockstore_heap.cpp blockstore_disk.cpp
blockstore.cpp blockstore_impl.cpp blockstore_init.cpp blockstore_open.cpp
blockstore_flush.cpp blockstore_read.cpp blockstore_stable.cpp blockstore_sync.cpp blockstore_write.cpp
)
target_compile_options(vitastor_blk PUBLIC -fPIC)
target_link_libraries(vitastor_blk
${LIBURING_LIBRARIES}
tcmalloc_minimal
${ISAL_LIBRARIES}
# for timerfd_manager
vitastor_common
)

View File

@@ -3,7 +3,7 @@
#include "blockstore_impl.h"
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd)
{
impl = new blockstore_impl_t(config, ringloop, tfd);
}
@@ -48,9 +48,9 @@ int blockstore_t::read_bitmap(object_id oid, uint64_t target_version, void *bitm
return impl->read_bitmap(oid, target_version, bitmap, result_version);
}
std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
const std::map<uint64_t, uint64_t> & blockstore_t::get_inode_space_stats()
{
return impl->inode_space_stats;
return impl->get_inode_space_stats();
}
void blockstore_t::dump_diagnostics()
@@ -82,8 +82,3 @@ uint32_t blockstore_t::get_bitmap_granularity()
{
return impl->get_bitmap_granularity();
}
void blockstore_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
{
impl->set_no_inode_stats(pool_ids);
}

View File

@@ -22,17 +22,20 @@
#define DIRECT_IO_ALIGNMENT 512
#endif
// Memory allocation alignment (page size is usually optimal)
#ifndef MEM_ALIGNMENT
#define MEM_ALIGNMENT 4096
#endif
// Default block size is 128 KB, current allowed range is 4K - 128M
#define DEFAULT_DATA_BLOCK_ORDER 17
#define MIN_DATA_BLOCK_SIZE 4*1024
#define MAX_DATA_BLOCK_SIZE 128*1024*1024
#define DEFAULT_BITMAP_GRANULARITY 4096
#define MIN_JOURNAL_SIZE 1024*1024
// "VITAstor"
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_FORMAT_V1 1
#define BLOCKSTORE_META_FORMAT_V2 2
#define BLOCKSTORE_META_FORMAT_HEAP 3
#define BS_OP_MIN 1
#define BS_OP_READ 1
#define BS_OP_WRITE 2
@@ -42,13 +45,18 @@
#define BS_OP_DELETE 6
#define BS_OP_LIST 7
#define BS_OP_ROLLBACK 8
#define BS_OP_SYNC_STAB_ALL 9
#define BS_OP_MAX 9
#define BS_OP_MAX 8
#define BS_OP_PRIVATE_DATA_SIZE 256
/*
All operations may be submitted in any order, because reads only see completed writes,
syncs only sync completed writes and writes don't depend on each other.
The only restriction is that the external code MUST NOT submit multiple writes for one
object in parallel. This is a natural restriction because `version` numbers are used though.
Blockstore opcode documentation:
## BS_OP_READ / BS_OP_WRITE / BS_OP_WRITE_STABLE
@@ -113,14 +121,6 @@ Input:
Output:
- retval = 0 or negative error number (-ENOENT if no such version for stabilize)
## BS_OP_SYNC_STAB_ALL
ONLY FOR TESTS! Sync and mark all unstable object versions as stable, at once.
Input: Nothing except opcode
Output:
- retval = 0 or negative error number (-EINVAL)
## BS_OP_LIST
Get a list of all objects in this Blockstore.
@@ -144,10 +144,10 @@ Output:
*/
struct blockstore_op_t
struct __attribute__ ((visibility("default"))) blockstore_op_t
{
// operation
uint64_t opcode;
uint64_t opcode = 0;
// finish callback
std::function<void (blockstore_op_t*)> callback;
union __attribute__((__packed__))
@@ -171,9 +171,9 @@ struct blockstore_op_t
uint32_t list_stable_limit;
};
};
void *buf;
void *bitmap;
int retval;
uint8_t *buf = NULL;
uint8_t *bitmap = NULL;
int retval = 0;
uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
};
@@ -182,11 +182,11 @@ typedef std::map<std::string, std::string> blockstore_config_t;
class blockstore_impl_t;
class blockstore_t
class __attribute__((visibility("default"))) blockstore_t
{
blockstore_impl_t *impl;
public:
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
blockstore_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd);
~blockstore_t();
// Update configuration
@@ -214,10 +214,7 @@ public:
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
// Get per-inode space usage statistics
std::map<uint64_t, uint64_t> & get_inode_space_stats();
// Set per-pool no_inode_stats
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
const std::map<uint64_t, uint64_t> & get_inode_space_stats();
// Print diagnostics to stdout
void dump_diagnostics();

View File

@@ -2,11 +2,14 @@
// License: VNPL-1.1 (see README.md for details)
#include <sys/file.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <stdexcept>
#include "blockstore_impl.h"
#include "blockstore.h"
#include "blockstore_disk.h"
#include "blockstore_heap.h"
#include "str_util.h"
#include "allocator.h"
@@ -44,8 +47,11 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
disk_alignment = parse_size(config["disk_alignment"]);
journal_block_size = parse_size(config["journal_block_size"]);
meta_block_size = parse_size(config["meta_block_size"]);
meta_block_target_free_space = parse_size(config["meta_block_target_free_space"]);
bitmap_granularity = parse_size(config["bitmap_granularity"]);
meta_format = stoull_full(config["meta_format"]);
atomic_write_size = (config.find("atomic_write_size") != config.end()
? parse_size(config["atomic_write_size"]) : 4096);
if (config.find("data_io") == config.end() &&
config.find("meta_io") == config.end() &&
config.find("journal_io") == config.end())
@@ -90,12 +96,28 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
if (!min_discard_size)
min_discard_size = 1024*1024;
discard_granularity = parse_size(config["discard_granularity"]);
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
config["inmemory_metadata"] != "no";
inmemory_journal = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
config["inmemory_journal"] != "no";
disable_data_fsync = config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes";
disable_meta_fsync = config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes";
disable_journal_fsync = config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes";
if (mock_mode)
{
data_device_size = parse_size(config["data_device_size"]);
data_device_sect = parse_size(config["data_device_sect"]);
meta_device_size = parse_size(config["meta_device_size"]);
meta_device_sect = parse_size(config["meta_device_sect"]);
journal_device_size = parse_size(config["journal_device_size"]);
journal_device_sect = parse_size(config["journal_device_sect"]);
}
// Validate
if (!data_block_size)
{
data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER);
}
if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
if (is_power_of_two(data_block_size) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
{
throw std::runtime_error("Bad block size");
}
@@ -131,6 +153,14 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
{
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
}
if (!meta_block_target_free_space)
{
meta_block_target_free_space = 800;
}
if (meta_block_target_free_space >= meta_block_size)
{
throw std::runtime_error("meta_block_target_free_space must not exceed "+std::to_string(meta_block_size));
}
if (data_offset % disk_alignment)
{
throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
@@ -179,17 +209,29 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
{
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
}
if (!meta_format)
{
meta_format = BLOCKSTORE_META_FORMAT_HEAP;
}
if (meta_device == data_device)
{
disable_meta_fsync = disable_data_fsync;
}
if (journal_device == meta_device)
{
disable_journal_fsync = disable_meta_fsync;
}
}
void blockstore_disk_t::calc_lengths(bool skip_meta_check)
void blockstore_disk_t::calc_lengths()
{
// data
data_len = data_device_size - data_offset;
if (data_fd == meta_fd && data_offset < meta_offset)
if (data_device == meta_device && data_offset < meta_offset)
{
data_len = meta_offset - data_offset;
}
if (data_fd == journal_fd && data_offset < journal_offset)
if (data_device == journal_device && data_offset < journal_offset)
{
data_len = data_len < journal_offset-data_offset
? data_len : journal_offset-data_offset;
@@ -204,23 +246,23 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
data_len = cfg_data_size;
}
// meta
uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset;
if (meta_fd == data_fd && meta_offset <= data_offset)
meta_area_size = (meta_device == data_device ? data_device_size : meta_device_size) - meta_offset;
if (meta_device == data_device && meta_offset <= data_offset)
{
meta_area_size = data_offset - meta_offset;
}
if (meta_fd == journal_fd && meta_offset <= journal_offset)
if (meta_device == journal_device && meta_offset <= journal_offset)
{
meta_area_size = meta_area_size < journal_offset-meta_offset
? meta_area_size : journal_offset-meta_offset;
}
// journal
journal_len = (journal_fd == data_fd ? data_device_size : (journal_fd == meta_fd ? meta_device_size : journal_device_size)) - journal_offset;
if (journal_fd == data_fd && journal_offset <= data_offset)
journal_len = (journal_device == data_device ? data_device_size : (journal_device == meta_device ? meta_device_size : journal_device_size)) - journal_offset;
if (journal_device == data_device && journal_offset <= data_offset)
{
journal_len = data_offset - journal_offset;
}
if (journal_fd == meta_fd && journal_offset <= meta_offset)
if (journal_device == meta_device && journal_offset <= meta_offset)
{
journal_len = journal_len < meta_offset-journal_offset
? journal_len : meta_offset-journal_offset;
@@ -230,37 +272,37 @@ void blockstore_disk_t::calc_lengths(bool skip_meta_check)
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
clean_dyn_size = clean_entry_bitmap_size*2 + (csum_block_size
? data_block_size/csum_block_size*(data_csum_type & 0xFF) : 0);
clean_entry_size = sizeof(clean_disk_entry) + clean_dyn_size + 4 /*entry_csum*/;
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
bool new_doesnt_fit = (!meta_format && !skip_meta_check && meta_area_size < meta_len && !data_csum_type);
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || new_doesnt_fit)
if (meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
uint64_t clean_entry_v0_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
uint64_t meta_v0_len = (1 + (block_count - 1 + meta_block_size / clean_entry_v0_size)
/ (meta_block_size / clean_entry_v0_size)) * meta_block_size;
if (meta_format == BLOCKSTORE_META_FORMAT_V1 || meta_area_size >= meta_v0_len)
{
// Old metadata fits.
if (new_doesnt_fit)
{
printf("Warning: Using old metadata format without checksums because the new format"
" doesn't fit into provided area (%ju bytes required, %ju bytes available)\n", meta_len, meta_area_size);
}
clean_entry_size = clean_entry_v0_size;
meta_len = meta_v0_len;
meta_format = BLOCKSTORE_META_FORMAT_V1;
}
else
meta_format = BLOCKSTORE_META_FORMAT_V2;
uint32_t entries_per_block = ((meta_block_size-meta_block_target_free_space) /
(sizeof(heap_object_t) + sizeof(heap_write_t) + clean_dyn_size));
min_meta_len = (block_count+entries_per_block-1) / entries_per_block * meta_block_size;
}
else if (meta_format == BLOCKSTORE_META_FORMAT_V1)
{
clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + 2*clean_entry_bitmap_size;
min_meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size)
/ (meta_block_size / clean_entry_size)) * meta_block_size;
}
else if (meta_format == BLOCKSTORE_META_FORMAT_V2)
{
clean_entry_size = 24 /*sizeof(clean_disk_entry)*/ + clean_dyn_size + 4 /*entry_csum*/;
min_meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
}
else
meta_format = BLOCKSTORE_META_FORMAT_V2;
if (!skip_meta_check && meta_area_size < meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
throw std::runtime_error("meta_format = "+std::to_string(meta_format)+" is not supported");
}
}
void blockstore_disk_t::check_lengths()
{
if (meta_area_size < min_meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(min_meta_len)+" bytes, have only "+std::to_string(meta_area_size)+" bytes");
}
// requested journal size
if (!skip_meta_check && cfg_journal_size > journal_len)
if (cfg_journal_size > journal_len)
{
throw std::runtime_error("Requested journal_size is too large");
}
@@ -321,12 +363,19 @@ static int bs_openmode(const std::string & mode)
void blockstore_disk_t::open_data()
{
data_fd = open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
if (data_fd >= 0)
{
throw std::runtime_error("data device is already opened");
}
data_fd = mock_mode ? MOCK_DATA_FD : open(data_device.c_str(), bs_openmode(data_io) | O_RDWR);
if (data_fd == -1)
{
throw std::runtime_error("Failed to open data device "+data_device+": "+std::string(strerror(errno)));
}
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
if (!mock_mode)
{
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
}
if (disk_alignment % data_device_sect)
{
throw std::runtime_error(
@@ -338,7 +387,7 @@ void blockstore_disk_t::open_data()
{
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_device_size));
}
if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
if (!mock_mode && !disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
}
@@ -346,19 +395,26 @@ void blockstore_disk_t::open_data()
void blockstore_disk_t::open_meta()
{
if (meta_fd >= 0)
{
throw std::runtime_error("metadata device is already opened");
}
if (meta_device != data_device || meta_io != data_io)
{
meta_fd = open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
meta_fd = mock_mode ? MOCK_META_FD : open(meta_device.c_str(), bs_openmode(meta_io) | O_RDWR);
if (meta_fd == -1)
{
throw std::runtime_error("Failed to open metadata device "+meta_device+": "+std::string(strerror(errno)));
}
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
if (!mock_mode)
{
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
}
if (meta_offset >= meta_device_size)
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
}
if (!disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
if (!mock_mode && !disable_flock && meta_device != data_device && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
}
@@ -384,15 +440,26 @@ void blockstore_disk_t::open_meta()
void blockstore_disk_t::open_journal()
{
if (journal_fd >= 0)
{
throw std::runtime_error("journal device is already opened");
}
if (journal_device != meta_device || journal_io != meta_io)
{
journal_fd = open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
journal_fd = mock_mode ? MOCK_JOURNAL_FD : open(journal_device.c_str(), bs_openmode(journal_io) | O_RDWR);
if (journal_fd == -1)
{
throw std::runtime_error("Failed to open journal device "+journal_device+": "+std::string(strerror(errno)));
}
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
if (!disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
if (!mock_mode)
{
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
}
if (journal_offset >= journal_device_size)
{
throw std::runtime_error("journal_offset exceeds device size = "+std::to_string(journal_device_size));
}
if (!mock_mode && !disable_flock && journal_device != meta_device && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
}
@@ -418,25 +485,32 @@ void blockstore_disk_t::open_journal()
void blockstore_disk_t::close_all()
{
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal_fd >= 0 && journal_fd != meta_fd)
close(journal_fd);
if (!mock_mode)
{
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal_fd >= 0 && journal_fd != meta_fd)
close(journal_fd);
}
data_fd = meta_fd = journal_fd = -1;
}
// Sadly DISCARD only works through ioctl(), but it seems to always block the device queue,
// so it's not a big deal that we can only run it synchronously.
int blockstore_disk_t::trim_data(allocator_t *alloc)
int blockstore_disk_t::trim_data(std::function<bool(uint64_t)> is_free)
{
if (mock_mode)
{
return -EINVAL;
}
int r = 0;
uint64_t j = 0, i = 0;
uint64_t discarded = 0;
for (; i <= block_count; i++)
{
if (i >= block_count || alloc->get(i))
if (i >= block_count || is_free(i))
{
if (i > j && (i-j)*data_block_size >= min_discard_size)
{

View File

@@ -12,6 +12,10 @@
// Lower byte of checksum type is its length
#define BLOCKSTORE_CSUM_CRC32C 0x104
#define MOCK_DATA_FD 1000
#define MOCK_META_FD 1001
#define MOCK_JOURNAL_FD 1002
class allocator_t;
struct blockstore_disk_t
@@ -22,11 +26,15 @@ struct blockstore_disk_t
// Required write alignment and journal/metadata/data areas' location alignment
uint32_t disk_alignment = 4096;
// Journal block size - minimum_io_size of the journal device is the best choice
uint64_t journal_block_size = 4096;
uint32_t journal_block_size = 4096;
// Metadata block size - minimum_io_size of the metadata device is the best choice
uint64_t meta_block_size = 4096;
uint32_t meta_block_size = 4096;
// Atomic write size of the data block device
uint32_t atomic_write_size = 4096;
// Target free space in metadata blocks
uint32_t meta_block_target_free_space = 800;
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
uint64_t bitmap_granularity = 4096;
uint32_t bitmap_granularity = 4096;
// Data checksum type, BLOCKSTORE_CSUM_NONE or BLOCKSTORE_CSUM_CRC32C
uint32_t data_csum_type = BLOCKSTORE_CSUM_NONE;
// Checksum block size, must be a multiple of bitmap_granularity
@@ -36,27 +44,36 @@ struct blockstore_disk_t
// I/O modes for data, metadata and journal: direct or "" = O_DIRECT, cached = O_SYNC, directsync = O_DIRECT|O_SYNC
// O_SYNC without O_DIRECT = use Linux page cache for reads and writes
std::string data_io, meta_io, journal_io;
// It is safe to disable fsync() if drive write cache is writethrough
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
// Keep journal (buffered data) in memory?
bool inmemory_meta = true;
// Keep metadata in memory?
bool inmemory_journal = true;
// Data discard granularity and minimum size (for the sake of performance)
bool discard_on_start = false;
uint64_t min_discard_size = 1024*1024;
uint64_t discard_granularity = 0;
int meta_fd = -1, data_fd = -1, journal_fd = -1;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len, meta_format = 0;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_area_size, min_meta_len, meta_format = 0;
uint64_t data_offset, data_device_sect, data_device_size, data_len;
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
uint32_t block_order = 0;
uint64_t block_count = 0;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, clean_dyn_size = 0;
uint32_t clean_entry_bitmap_size = 0;
uint32_t clean_entry_size = 0, clean_dyn_size = 0; // for meta_v1/2
bool mock_mode = false;
void parse_config(std::map<std::string, std::string> & config);
void open_data();
void open_meta();
void open_journal();
void calc_lengths(bool skip_meta_check = false);
void calc_lengths();
void check_lengths();
void close_all();
int trim_data(allocator_t *alloc);
int trim_data(std::function<bool(uint64_t)> is_free);
inline uint64_t dirty_dyn_size(uint64_t offset, uint64_t len)
{

File diff suppressed because it is too large Load Diff

View File

@@ -1,22 +1,20 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#define COPY_BUF_JOURNAL 1
#define COPY_BUF_DATA 2
#define COPY_BUF_ZERO 4
#define COPY_BUF_CSUM_FILL 8
#define COPY_BUF_COALESCED 16
#define COPY_BUF_META_BLOCK 32
#define COPY_BUF_JOURNALED_BIG 64
#define COPY_BUF_JOURNAL 0x01
#define COPY_BUF_DATA 0x02
#define COPY_BUF_ZERO 0x04
#define COPY_BUF_CSUM_FILL 0x08
#define COPY_BUF_COALESCED 0x10
#define COPY_BUF_PADDED 0x20
#define COPY_BUF_SKIP_CSUM 0x40
struct copy_buffer_t
{
int copy_flags;
uint64_t offset, len, disk_offset;
uint64_t journal_sector; // only for reads: sector+1 if used and !journal.inmemory, otherwise 0
void *buf;
uint8_t *csum_buf;
int *dyn_data;
uint32_t copy_flags;
uint64_t offset, len, disk_loc, disk_offset, disk_len;
uint8_t *buf;
uint64_t wr_lsn;
};
struct meta_sector_t
@@ -27,13 +25,6 @@ struct meta_sector_t
int usage_count;
};
struct flusher_sync_t
{
bool fsync_meta;
int ready_count;
int state;
};
struct flusher_meta_write_t
{
uint64_t sector, pos;
@@ -49,93 +40,75 @@ class journal_flusher_co
{
blockstore_impl_t *bs;
journal_flusher_t *flusher;
int wait_state, wait_count, wait_journal_count;
int co_id;
int wait_state, wait_count;
struct io_uring_sqe *sqe;
struct ring_data_t *data;
std::list<flusher_sync_t>::iterator cur_sync;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
obj_ver_id cur;
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
std::map<object_id, uint64_t>::iterator repeat_it;
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_rj, simple_callback_w;
object_id cur_oid;
uint64_t copy_id;
uint64_t compact_lsn;
uint64_t cur_version;
heap_object_t *cur_obj;
heap_write_t *begin_wr, *end_wr;
uint32_t modified_block;
bool should_repeat;
bool try_trim = false;
bool skip_copy, has_delete, has_writes;
std::vector<copy_buffer_t> v;
std::vector<copy_buffer_t>::iterator it;
int i;
bool fill_incomplete, cleared_incomplete;
int read_to_fill_incomplete;
std::vector<copy_buffer_t> read_vec;
uint32_t overwrite_start, overwrite_end;
uint32_t big_start, big_end;
int i, res;
bool read_to_fill_incomplete;
int copy_count;
uint64_t clean_loc, clean_ver, old_clean_loc, old_clean_ver;
uint64_t clean_loc;
flusher_meta_write_t meta_old, meta_new;
bool clean_init_bitmap;
uint64_t clean_bitmap_offset, clean_bitmap_len;
uint8_t *clean_init_dyn_ptr;
uint8_t *new_clean_bitmap;
uint64_t new_trim_pos;
bool do_repeat = false;
friend class journal_flusher_t;
void scan_dirty();
bool read_dirty(int wait_base);
bool modify_meta_do_reads(int wait_base);
bool wait_meta_reads(int wait_base);
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
bool clear_incomplete_csum_block_bits(int wait_base);
void calc_block_checksums(uint32_t *new_data_csums, bool skip_overwrites);
void update_metadata_entry();
bool write_meta_block(flusher_meta_write_t & meta_block, int wait_base);
void update_clean_db();
void free_data_blocks();
bool fsync_batch(bool fsync_meta, int wait_base);
bool trim_journal(int wait_base);
void iterate_checksum_holes(std::function<void(int & pos, uint32_t hole_start, uint32_t hole_end)> cb);
void fill_partial_checksum_blocks();
void free_buffers();
int check_and_punch_checksums();
bool calc_block_checksums();
bool write_meta_block(int wait_base);
bool read_buffered(int wait_base);
bool fsync_meta(int wait_base);
int fsync_buffer(int wait_base);
bool trim_lsn(int wait_base);
public:
journal_flusher_co();
~journal_flusher_co();
bool loop();
};
// Journal flusher itself
class journal_flusher_t
{
int trim_wanted = 0;
bool dequeuing;
int min_flusher_count, max_flusher_count, cur_flusher_count, target_flusher_count;
int flusher_start_threshold;
int force_start = 0;
int min_flusher_count = 0, max_flusher_count = 0, cur_flusher_count = 0, target_flusher_count = 0;
journal_flusher_co *co;
blockstore_impl_t *bs;
friend class journal_flusher_co;
int journal_trim_counter;
bool trimming;
void* journal_superblock;
int advance_lsn_counter = 0;
uint64_t compact_counter = 0;
int active_flushers;
int syncing_flushers;
std::list<flusher_sync_t> syncs;
std::map<object_id, uint64_t> sync_to_repeat;
std::map<uint64_t, meta_sector_t> meta_sectors;
std::deque<object_id> flush_queue;
std::map<object_id, uint64_t> flush_versions; // FIXME: consider unordered_map?
bool try_find_older(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
bool try_find_other(std::map<obj_ver_id, dirty_entry>::iterator & dirty_end, obj_ver_id & cur);
int active_flushers = 0;
int wanting_meta_fsync = 0;
bool fsyncing_meta = false;
int syncing_buffer = 0;
public:
journal_flusher_t(blockstore_impl_t *bs);
~journal_flusher_t();
void loop();
bool is_trim_wanted() { return trim_wanted; }
int get_syncing_buffer();
uint64_t get_compact_counter();
bool is_active();
void mark_trim_possible();
void request_trim();
void release_trim();
void enqueue_flush(obj_ver_id oid);
void unshift_flush(obj_ver_id oid, bool force);
void remove_flush(object_id oid);
void dump_diagnostics();
bool is_mutated(uint64_t clean_loc);
};

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,344 @@
// Metadata storage version 3 ("heap")
// Copyright (c) Vitaliy Filippov, 2025+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#define EMH_EXT
#include <map>
#include <unordered_map>
#include <set>
#include <deque>
#include <vector>
#include "../client/object_id.h"
#include "../../emhash/hash_table7.hpp"
#include "../util/wyhash.h"
#include "blockstore_disk.h"
#include "multilist.h"
struct pool_shard_settings_t
{
uint32_t pg_count;
uint32_t pg_stripe_size;
};
#define BS_HEAP_TYPE 7
#define BS_HEAP_SMALL_WRITE 1
#define BS_HEAP_BIG_WRITE 2
#define BS_HEAP_TOMBSTONE 3
#define BS_HEAP_INTENT_WRITE 4
#define BS_HEAP_STABLE 8
class blockstore_heap_t;
struct __attribute__((__packed__)) heap_write_t
{
// size should have top bit cleared
uint16_t size = 0;
int16_t next_pos = 0;
uint64_t lsn = 0;
uint64_t version = 0;
uint32_t offset = 0;
uint32_t len = 0;
uint64_t location = 0;
uint8_t flags = 0; // 1|2|3 = small|big|tombstone, 4|0 = stable|unstable
// uint8_t[] external_bitmap
// uint8_t[] internal_bitmap
// uint32_t[] checksums
heap_write_t *next();
inline uint8_t type() const { return (flags & BS_HEAP_TYPE); }
uint32_t get_size(blockstore_heap_t *heap);
uint32_t get_csum_size(blockstore_heap_t *heap);
bool needs_recheck(blockstore_heap_t *heap);
bool needs_compact(blockstore_heap_t *heap);
bool is_compacted(uint64_t compacted_lsn);
bool can_be_collapsed(blockstore_heap_t *heap);
bool is_allowed_before_compacted(uint64_t compacted_lsn, bool is_last_entry);
uint8_t *get_ext_bitmap(blockstore_heap_t *heap);
uint8_t *get_int_bitmap(blockstore_heap_t *heap);
uint8_t *get_checksums(blockstore_heap_t *heap);
uint32_t *get_checksum(blockstore_heap_t *heap);
};
struct __attribute__((__packed__)) heap_object_t
{
// size should have top bit cleared
uint16_t size = 0;
// linked list of write entries...
// newest entries are stored first to simplify scanning
int16_t write_pos = 0;
uint32_t crc32c = 0;
uint64_t inode = 0;
uint64_t stripe = 0;
heap_write_t *get_writes();
uint32_t calc_crc32c();
};
struct heap_object_lsn_t
{
object_id oid;
uint64_t lsn;
};
inline bool operator < (const heap_object_lsn_t & a, const heap_object_lsn_t & b)
{
return a.oid < b.oid || a.oid == b.oid && a.lsn < b.lsn;
}
struct tmp_compact_item_t
{
object_id oid;
uint64_t lsn;
bool compact;
};
struct heap_mvcc_copy_id_t
{
object_id oid;
uint64_t copy_id;
};
inline bool operator == (const heap_mvcc_copy_id_t & a, const heap_mvcc_copy_id_t & b)
{
return a.oid.inode == b.oid.inode && a.oid.stripe == b.oid.stripe && a.copy_id == b.copy_id;
}
namespace std
{
template<> struct hash<heap_mvcc_copy_id_t>
{
inline size_t operator()(const heap_mvcc_copy_id_t &s) const
{
size_t seed = std::hash<object_id>()(s.oid);
// Copy-pasted from spp::hash_combine()
seed ^= (s.copy_id + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
return seed;
}
};
};
struct heap_object_mvcc_t
{
uint32_t readers = 0;
heap_object_t *entry_copy = NULL;
};
struct __attribute__((__packed__)) heap_block_info_t
{
uint32_t used_space = 0;
uint32_t free_pos = 0;
uint8_t *data = NULL;
};
struct heap_inflight_lsn_t
{
object_id oid;
uint64_t flags;
};
struct heap_refqi_t
{
uint64_t lsn;
uint64_t inode;
uint64_t location;
uint32_t len;
bool is_data;
};
using i64hash_t = wyhash::hash<uint64_t>;
using heap_block_index_t = emhash7::HashMap<uint64_t, emhash7::HashMap<inode_t, emhash7::HashMap<uint64_t, uint64_t, i64hash_t>, i64hash_t>, i64hash_t>;
using heap_mvcc_map_t = emhash7::HashMap<heap_mvcc_copy_id_t, heap_object_mvcc_t>;
class blockstore_heap_t
{
friend class heap_write_t;
friend class heap_object_t;
blockstore_disk_t *dsk = NULL;
uint8_t* buffer_area = NULL;
bool abort_on_corruption = false;
bool abort_on_overlap = true;
int log_level = 0;
const uint32_t meta_block_count = 0;
uint32_t target_block_free_space = 800;
uint64_t next_lsn = 0;
emhash7::HashMap<pool_id_t, pool_shard_settings_t> pool_shard_settings;
// PG => inode => stripe => block number
heap_block_index_t block_index;
std::vector<heap_block_info_t> block_info;
allocator_t *data_alloc = NULL;
multilist_index_t *meta_alloc = NULL;
uint32_t meta_alloc_count = 0;
uint64_t meta_used_space = 0;
multilist_alloc_t *buffer_alloc = NULL;
heap_mvcc_map_t object_mvcc;
std::unordered_map<uint64_t, uint32_t> mvcc_data_refs;
std::unordered_map<uint64_t, uint32_t> mvcc_buffer_refs;
std::map<uint64_t, uint64_t> inode_space_stats;
uint64_t buffer_area_used_space = 0;
uint64_t data_used_space = 0;
// LSN queue: inflight (writing) -> completed [-> fsynced] -> compactable -> compacted [-> fsynced] -> trimmed and removed
std::deque<heap_inflight_lsn_t> inflight_lsn;
uint32_t to_compact_count = 0;
uint64_t first_inflight_lsn = 0;
uint64_t completed_lsn = 0;
uint64_t fsynced_lsn = 0;
uint64_t compacted_lsn = 0;
uint64_t next_compact_lsn = 0;
std::deque<heap_refqi_t> overwrite_ref_queue;
std::vector<tmp_compact_item_t> tmp_compact_queue;
std::deque<object_id> recheck_queue;
int recheck_in_progress = 0;
bool in_recheck = false;
std::function<void(bool is_data, uint64_t offset, uint64_t len, uint8_t* buf, std::function<void()>)> recheck_cb;
int recheck_queue_depth = 0;
const uint32_t max_write_entry_size;
uint64_t get_pg_id(inode_t inode, uint64_t stripe);
void defragment_block(uint32_t block_num);
uint32_t find_block_run(heap_block_info_t & block, uint32_t space);
uint32_t find_block_space(uint32_t block_num, uint32_t space);
uint32_t block_has_compactable(uint8_t *data);
uint32_t compact_object_to(heap_object_t *obj, uint64_t lsn, uint8_t *new_csums, bool do_free);
void copy_full_object(uint8_t *dst, heap_object_t *obj);
bool mvcc_save_copy(heap_object_t *obj);
bool mvcc_check_tracking(object_id oid);
void free_mvcc(heap_mvcc_map_t::iterator mvcc_it);
void allocate_block(heap_block_info_t & inf);
int allocate_new_object(object_id oid, uint32_t full_object_size, uint32_t *modified_block, heap_object_t **new_obj);
int add_object(object_id oid, heap_write_t *wr, uint32_t *modified_block);
void mark_overwritten(uint64_t over_lsn, uint64_t inode, heap_write_t *wr, heap_write_t *end_wr, bool tracking_active);
int update_object(uint32_t block_num, heap_object_t *obj, heap_write_t *wr, uint32_t *modified_block, uint32_t *moved_from_block);
void init_erase(uint32_t block_num, heap_object_t *obj);
void erase_object(uint32_t block_num, heap_object_t *obj, uint64_t lsn, bool tracking_active);
void reindex_block(uint32_t block_num, heap_object_t *from_obj);
void erase_block_index(inode_t inode, uint64_t stripe);
void deref_data(uint64_t inode, uint64_t location, bool free_at_0);
void deref_buffer(uint64_t inode, uint64_t location, uint32_t len, bool free_at_0);
void deref_overwrites(uint64_t lsn);
void free_object_space(inode_t inode, heap_write_t *from, heap_write_t *to, int mode = 0);
void add_used_space(uint32_t block_num, int32_t used_delta);
void push_inflight_lsn(object_id oid, uint64_t lsn, uint64_t flags);
public:
blockstore_heap_t(blockstore_disk_t *dsk, uint8_t *buffer_area, int log_level = 0);
~blockstore_heap_t();
// set initially compacted lsn - should be done before loading
void set_compacted_lsn(uint64_t compacted_lsn);
uint64_t get_compacted_lsn();
// load data from the disk, returns count of loaded write entries
void read_blocks(uint64_t disk_offset, uint64_t size, uint8_t *buf,
std::function<void(heap_object_t*)> handle_object, std::function<void(uint32_t, uint32_t, uint8_t*)> handle_block);
uint64_t load_blocks(uint64_t disk_offset, uint64_t size, uint8_t *buf);
// finish loading
void finish_load();
// recheck small write data after reading the database from disk
bool recheck_small_writes(std::function<void(bool is_data, uint64_t offset, uint64_t len, uint8_t* buf, std::function<void()>)> read_buffer, int queue_depth);
// initialize metadata area (fill it with empty data)
// returns 0 when done, EAGAIN when the caller has to wait more
int initialize();
// read from the metadata area
// returns 0 when done, EAGAIN when the caller has to wait more
int read();
// reshard database according to the pool's PG count
void reshard(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size);
// read an object entry and lock it against removal
// in the future, may become asynchronous
heap_object_t *lock_and_read_entry(object_id oid, uint64_t & copy_id);
// re-read a locked object entry with the given lsn (pointer may be invalidated)
heap_object_t *read_locked_entry(object_id oid, uint64_t copy_id);
// read an object entry without locking it
heap_object_t *read_entry(object_id oid, uint32_t *block_num_ptr, bool for_update = false);
// unlock an entry
bool unlock_entry(object_id oid, uint64_t copy_id);
// set or verify checksums in a write request
bool calc_checksums(heap_write_t *wr, uint8_t *data, bool set);
// set or verify raw block checksums
bool calc_block_checksums(uint32_t *block_csums, uint8_t *data, uint8_t *bitmap, uint32_t start, uint32_t end,
bool set, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool calc_block_checksums(uint32_t *block_csums, uint8_t *bitmap,
uint32_t start, uint32_t end, std::function<uint8_t*(uint32_t start, uint32_t & len)> next,
bool set, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
// copy an object as is
int copy_object(heap_object_t *obj, uint32_t *modified_block);
// auto-compacts the object, then adds a write entry to it and to the compaction queue
// return 0 if OK, or maybe ENOSPC
int post_write(object_id oid, heap_write_t *wr, uint32_t *modified_block, uint32_t *moved_from_block);
int post_write(uint32_t & block_num, object_id oid, heap_object_t *obj, heap_write_t *wr, uint32_t *moved_from_block);
// stabilize an unstable object version
// return 0 if OK, ENOENT if not exists
int post_stabilize(object_id oid, uint64_t version, uint32_t *modified_block, uint64_t *new_lsn, uint64_t *new_to_lsn);
// rollback an unstable object version
// return 0 if OK, ENOENT if not exists, EBUSY if already stable
int post_rollback(object_id oid, uint64_t version, uint64_t *new_lsn, uint32_t *modified_block);
// forget an object
// return error code
int post_delete(object_id oid, uint64_t *new_lsn, uint32_t *modified_block);
int post_delete(uint32_t block_num, heap_object_t *obj, uint64_t *new_lsn);
// get the next object to compact
// guaranteed to return objects in min lsn order
// returns 0 if OK, ENOENT if nothing to compact
int get_next_compact(object_id & oid);
// get the range of an object eligible for compaction
void get_compact_range(heap_object_t *obj, uint64_t max_lsn, heap_write_t **begin_wr, heap_write_t **end_wr);
// mark an object as compacted up to the given lsn
int compact_object(object_id oid, uint64_t lsn, uint8_t *new_csums);
// retrieve object listing from a PG
int list_objects(uint32_t pg_num, object_id min_oid, object_id max_oid,
obj_ver_id **result_list, size_t *stable_count, size_t *unstable_count);
// set a block number for a new object and returns error status: 0, EAGAIN or ENOSPC
int get_block_for_new_object(uint32_t & out_block_num, uint32_t size = 0);
// inflight write tracking
void mark_lsn_completed(uint64_t lsn);
void mark_lsn_fsynced(uint64_t lsn);
void mark_lsn_compacted(uint64_t lsn, bool allow_undone = false);
void mark_object_compacted(heap_object_t *obj, uint64_t max_lsn);
void mark_lsn_trimmed(uint64_t lsn);
uint64_t get_completed_lsn();
uint64_t get_fsynced_lsn();
// data device block allocator functions
uint64_t find_free_data();
bool is_data_used(uint64_t location);
void use_data(inode_t inode, uint64_t location);
void free_data(inode_t inode, uint64_t location);
// buffer device allocator functions
uint64_t find_free_buffer_area(uint64_t size);
bool is_buffer_area_free(uint64_t location, uint64_t size);
void use_buffer_area(inode_t inode, uint64_t location, uint64_t size);
void free_buffer_area(inode_t inode, uint64_t location, uint64_t size);
uint64_t get_buffer_area_used_space();
// get metadata block data buffer and used space
uint8_t *get_meta_block(uint32_t block_num);
uint32_t get_meta_block_used_space(uint32_t block_num);
// get space usage statistics
uint64_t get_data_used_space();
const std::map<uint64_t, uint64_t> & get_inode_space_stats();
uint64_t get_meta_total_space();
uint64_t get_meta_used_space();
uint32_t get_meta_nearfull_blocks();
uint32_t get_inflight_queue_size();
uint32_t get_compact_queue_size();
uint32_t get_to_compact_count();
// get maximum size for a temporary heap_write_t buffer
uint32_t get_max_write_entry_size();
// only for tests
void set_abort_on_corruption(bool fail);
void set_abort_on_overlap(bool fail);
};

View File

@@ -1,13 +1,17 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
#include <stdexcept>
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
#include "blockstore_impl.h"
#include "crc32c.h"
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd, bool mock_mode)
{
assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
this->tfd = tfd;
this->ringloop = ringloop;
dsk.mock_mode = mock_mode;
ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer);
initialized = 0;
@@ -17,31 +21,43 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
calc_lengths();
alloc_dyn_data = dsk.clean_dyn_size > sizeof(void*) || dsk.csum_block_size > 0;
dsk.calc_lengths();
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
data_alloc = new allocator_t(dsk.block_count);
}
catch (std::exception & e)
{
dsk.close_all();
throw;
}
memset(zero_object, 0, dsk.data_block_size);
meta_superblock = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
memset(meta_superblock, 0, dsk.meta_block_size);
}
void blockstore_impl_t::init()
{
flusher = new journal_flusher_t(this);
if (dsk.inmemory_journal)
{
buffer_area = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.journal_len);
}
heap = new blockstore_heap_t(&dsk, buffer_area, log_level);
}
blockstore_impl_t::~blockstore_impl_t()
{
delete data_alloc;
delete flusher;
if (flusher)
delete flusher;
if (heap)
delete heap;
if (buffer_area)
free(buffer_area);
if (meta_superblock)
free(meta_superblock);
if (zero_object)
free(zero_object);
ringloop->unregister_consumer(&ring_consumer);
dsk.close_all();
if (metadata_buffer)
free(metadata_buffer);
if (clean_bitmaps)
free(clean_bitmaps);
}
bool blockstore_impl_t::is_started()
@@ -57,10 +73,9 @@ bool blockstore_impl_t::is_stalled()
// main event loop - produce requests
void blockstore_impl_t::loop()
{
// FIXME: initialized == 10 is ugly
if (initialized != 10)
{
// read metadata, then journal
// read metadata
if (initialized == 0)
{
metadata_init_reader = new blockstore_init_meta(this);
@@ -73,69 +88,41 @@ void blockstore_impl_t::loop()
{
delete metadata_init_reader;
metadata_init_reader = NULL;
journal_init_reader = new blockstore_init_journal(this);
initialized = 2;
}
}
if (initialized == 2)
{
int res = journal_init_reader->loop();
if (!res)
{
delete journal_init_reader;
journal_init_reader = NULL;
initialized = 3;
ringloop->wakeup();
}
}
if (initialized == 3)
{
if (!readonly && dsk.discard_on_start)
dsk.trim_data(data_alloc);
if (journal.flush_journal)
initialized = 4;
else
initialized = 10;
}
if (initialized == 4)
{
if (readonly)
{
printf("Can't flush the journal in readonly mode\n");
exit(1);
dsk.trim_data([this](uint64_t block_num){ return heap->is_data_used(block_num * dsk.data_block_size); });
}
flusher->loop();
ringloop->submit();
initialized = 10;
}
}
else
{
// try to submit ops
unsigned initial_ring_space = ringloop->space_left();
// has_writes == 0 - no writes before the current queue item
// has_writes == 1 - some writes in progress
// has_writes == 2 - tried to submit some writes, but failed
int has_writes = 0, op_idx = 0, new_idx = 0;
int op_idx = 0, new_idx = 0;
bool has_unfinished_writes = false;
for (; op_idx < submit_queue.size(); op_idx++, new_idx++)
{
auto op = submit_queue[op_idx];
submit_queue[new_idx] = op;
// FIXME: This needs some simplification
// Writes should not block reads if the ring is not full and reads don't depend on them
// In all other cases we should stop submission
if (PRIV(op)->wait_for)
{
check_wait(op);
if (PRIV(op)->wait_for == WAIT_SQE)
{
// ring is full, stop submission
break;
}
else if (PRIV(op)->wait_for)
{
if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
{
has_writes = 2;
}
has_unfinished_writes = has_unfinished_writes || op->opcode == BS_OP_WRITE ||
op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE ||
op->opcode == BS_OP_STABLE || op->opcode == BS_OP_ROLLBACK;
continue;
}
}
@@ -148,46 +135,33 @@ void blockstore_impl_t::loop()
{
wr_st = dequeue_read(op);
}
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE)
else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE)
{
if (has_writes == 2)
{
// Some writes already could not be submitted
continue;
}
wr_st = dequeue_write(op);
has_writes = wr_st > 0 ? 1 : 2;
}
else if (op->opcode == BS_OP_DELETE)
{
if (has_writes == 2)
{
// Some writes already could not be submitted
continue;
}
wr_st = dequeue_del(op);
has_writes = wr_st > 0 ? 1 : 2;
has_unfinished_writes = has_unfinished_writes || (wr_st != 2);
}
else if (op->opcode == BS_OP_SYNC)
{
// sync only completed writes?
// wait for the data device fsync to complete, then submit journal writes for big writes
// then submit an fsync operation
// syncs only completed writes, so doesn't have to be blocked by anything
wr_st = continue_sync(op);
}
else if (op->opcode == BS_OP_STABLE)
else if (op->opcode == BS_OP_STABLE || op->opcode == BS_OP_ROLLBACK)
{
wr_st = dequeue_stable(op);
}
else if (op->opcode == BS_OP_ROLLBACK)
{
wr_st = dequeue_rollback(op);
has_unfinished_writes = has_unfinished_writes || (wr_st != 2);
}
else if (op->opcode == BS_OP_LIST)
{
// LIST doesn't have to be blocked by previous modifications
process_list(op);
wr_st = 2;
// LIST has to be blocked by previous writes and commits/rollbacks
if (!has_unfinished_writes)
{
process_list(op);
wr_st = 2;
}
else
{
wr_st = 0;
}
}
if (wr_st == 2)
{
@@ -196,16 +170,13 @@ void blockstore_impl_t::loop()
}
if (wr_st == 0)
{
PRIV(op)->pending_ops = 0;
ringloop->restore(prev_sqe_pos);
if (PRIV(op)->wait_for == WAIT_SQE)
{
// ring is full, stop submission
break;
}
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
{
PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced);
}
}
}
if (op_idx != new_idx)
@@ -225,12 +196,6 @@ void blockstore_impl_t::loop()
{
throw std::runtime_error(std::string("io_uring_submit: ") + strerror(-ret));
}
for (auto s: journal.submitting_sectors)
{
// Mark journal sector writes as submitted
journal.sector_info[s].submit_id = 0;
}
journal.submitting_sectors.clear();
if ((initial_ring_space - ringloop->space_left()) > 0)
{
live = true;
@@ -248,7 +213,7 @@ bool blockstore_impl_t::is_safe_to_stop()
{
return false;
}
if (unsynced_big_writes.size() > 0 || unsynced_small_writes.size() > 0)
if (unsynced_big_write_count > 0 || unsynced_small_write_count > 0)
{
if (!readonly && !stop_sync_submitted)
{
@@ -272,7 +237,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
{
if (PRIV(op)->wait_for == WAIT_SQE)
{
if (ringloop->sqes_left() < PRIV(op)->wait_detail)
if (ringloop->space_left() < PRIV(op)->wait_detail)
{
// stop submission if there's still no free space
#ifdef BLOCKSTORE_DEBUG
@@ -282,40 +247,13 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
}
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_JOURNAL)
else if (PRIV(op)->wait_for == WAIT_COMPACTION)
{
if (journal.used_start == PRIV(op)->wait_detail &&
(unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2)
if (flusher->get_compact_counter() <= PRIV(op)->wait_detail)
{
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting to flush journal offset %08jx\n", PRIV(op)->wait_detail);
#endif
return;
}
flusher->release_trim();
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
{
int next = ((journal.cur_sector + 1) % journal.sector_count);
if (journal.sector_info[next].flush_count > 0 ||
journal.sector_info[next].dirty)
{
// do not submit
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for a journal buffer\n");
#endif
return;
}
PRIV(op)->wait_for = 0;
}
else if (PRIV(op)->wait_for == WAIT_FREE)
{
if (!data_alloc->get_free_count() && big_to_flush > 0)
{
#ifdef BLOCKSTORE_DEBUG
printf("Still waiting for free space on the data device\n");
printf("Still waiting for more flushes\n");
#endif
return;
}
@@ -343,44 +281,6 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
return;
}
if (op->opcode == BS_OP_SYNC_STAB_ALL)
{
std::function<void(blockstore_op_t*)> *old_callback = new std::function<void(blockstore_op_t*)>(op->callback);
op->opcode = BS_OP_SYNC;
op->callback = [this, old_callback](blockstore_op_t *op)
{
if (op->retval >= 0 && unstable_writes.size() > 0)
{
op->opcode = BS_OP_STABLE;
op->len = unstable_writes.size();
obj_ver_id *vers = new obj_ver_id[op->len];
op->buf = vers;
int i = 0;
for (auto it = unstable_writes.begin(); it != unstable_writes.end(); it++, i++)
{
vers[i] = {
.oid = it->first,
.version = it->second,
};
}
unstable_writes.clear();
op->callback = [old_callback](blockstore_op_t *op)
{
obj_ver_id *vers = (obj_ver_id*)op->buf;
delete[] vers;
op->buf = NULL;
(*old_callback)(op);
delete old_callback;
};
this->enqueue_op(op);
}
else
{
(*old_callback)(op);
delete old_callback;
}
};
}
if ((op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE || op->opcode == BS_OP_DELETE) && !enqueue_write(op))
{
ringloop->set_immediate([op]() { std::function<void (blockstore_op_t*)>(op->callback)(op); });
@@ -399,75 +299,11 @@ void blockstore_impl_t::init_op(blockstore_op_t *op)
{
// Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->wait_for = 0;
PRIV(op)->op_state = 0;
PRIV(op)->pending_ops = 0;
}
static bool replace_stable(object_id oid, uint64_t version, int search_start, int search_end, obj_ver_id* list)
{
while (search_start < search_end)
{
int pos = search_start+(search_end-search_start)/2;
if (oid < list[pos].oid)
{
search_end = pos;
}
else if (list[pos].oid < oid)
{
search_start = pos+1;
}
else
{
list[pos].version = version;
return true;
}
}
return false;
}
blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
{
uint64_t pg_num = 0;
uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
auto sh_it = clean_db_settings.find(pool_id);
if (sh_it != clean_db_settings.end())
{
// like map_to_pg()
pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
}
return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
}
void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)
{
uint64_t pool_id = (uint64_t)pool;
std::map<pool_pg_id_t, blockstore_clean_db_t> new_shards;
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
while (sh_it != clean_db_shards.end() &&
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
{
for (auto & pair: sh_it->second)
{
// like map_to_pg()
uint64_t pg_num = (pair.first.stripe / pg_stripe_size) % pg_count + 1;
uint64_t shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
new_shards[shard_id][pair.first] = pair.second;
}
clean_db_shards.erase(sh_it++);
}
for (sh_it = new_shards.begin(); sh_it != new_shards.end(); sh_it++)
{
auto & to = clean_db_shards[sh_it->first];
to.swap(sh_it->second);
}
clean_db_settings[pool_id] = (pool_shard_settings_t){
.pg_count = pg_count,
.pg_stripe_size = pg_stripe_size,
};
}
void blockstore_impl_t::process_list(blockstore_op_t *op)
{
uint32_t list_pg = op->pg_number+1;
@@ -476,7 +312,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
uint64_t min_inode = op->min_oid.inode;
uint64_t max_inode = op->max_oid.inode;
// Check PG
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
if (!pg_count || (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count) ||
!INODE_POOL(min_inode) || INODE_POOL(min_inode) != INODE_POOL(max_inode))
{
op->retval = -EINVAL;
FINISH_OP(op);
@@ -484,250 +321,40 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
}
// Check if the DB needs resharding
// (we don't know about PGs from the beginning, we only create "shards" here)
uint64_t first_shard = 0, last_shard = UINT64_MAX;
if (min_inode != 0 &&
// Check if min_inode == max_inode == pool_id<<N, i.e. this is a pool listing
(min_inode >> (64-POOL_ID_BITS)) == (max_inode >> (64-POOL_ID_BITS)))
heap->reshard(INODE_POOL(min_inode), pg_count, pg_stripe_size);
obj_ver_id *result = NULL;
size_t stable_count = 0, unstable_count = 0;
int res = heap->list_objects(list_pg, op->min_oid, op->max_oid, &result, &stable_count, &unstable_count);
if (op->list_stable_limit)
{
pool_id_t pool_id = (min_inode >> (64-POOL_ID_BITS));
if (pg_count > 1)
// Ordered result is expected - used by scrub
// We use an unordered map
std::sort(result, result + stable_count);
if (stable_count > op->list_stable_limit)
{
// Per-pg listing
auto sh_it = clean_db_settings.find(pool_id);
if (sh_it == clean_db_settings.end() ||
sh_it->second.pg_count != pg_count ||
sh_it->second.pg_stripe_size != pg_stripe_size)
{
reshard_clean_db(pool_id, pg_count, pg_stripe_size);
}
first_shard = last_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS)) | list_pg;
}
else
{
// Per-pool listing
first_shard = ((uint64_t)pool_id << (64-POOL_ID_BITS));
last_shard = ((uint64_t)(pool_id+1) << (64-POOL_ID_BITS)) - 1;
memmove(result + op->list_stable_limit, result + stable_count, unstable_count);
stable_count = op->list_stable_limit;
}
}
// Copy clean_db entries
int stable_count = 0, stable_alloc = 0;
if (min_inode != max_inode)
{
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
shard_it++)
{
auto & clean_db = shard_it->second;
stable_alloc += clean_db.size();
}
}
if (op->list_stable_limit > 0)
{
stable_alloc = op->list_stable_limit;
if (stable_alloc > 1024*1024)
stable_alloc = 1024*1024;
}
if (stable_alloc < 32768)
{
stable_alloc = 32768;
}
obj_ver_id *stable = (obj_ver_id*)malloc(sizeof(obj_ver_id) * stable_alloc);
if (!stable)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
auto max_oid = op->max_oid;
bool limited = false;
pool_pg_id_t last_shard_id = 0;
for (auto shard_it = clean_db_shards.lower_bound(first_shard);
shard_it != clean_db_shards.end() && shard_it->first <= last_shard;
shard_it++)
{
auto & clean_db = shard_it->second;
auto clean_it = clean_db.begin(), clean_end = clean_db.end();
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
{
clean_it = clean_db.lower_bound(op->min_oid);
}
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
{
clean_end = clean_db.upper_bound(max_oid);
}
for (; clean_it != clean_end; clean_it++)
{
if (stable_count >= stable_alloc)
{
stable_alloc *= 2;
obj_ver_id* nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!nst)
{
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
stable = nst;
}
stable[stable_count++] = {
.oid = clean_it->first,
.version = clean_it->second.version,
};
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
{
if (!limited)
{
limited = true;
max_oid = stable[stable_count-1].oid;
}
break;
}
}
if (op->list_stable_limit > 0)
{
// To maintain the order, we have to include objects in the same range from other shards
if (last_shard_id != 0 && last_shard_id != shard_it->first)
std::sort(stable, stable+stable_count);
if (stable_count > op->list_stable_limit)
stable_count = op->list_stable_limit;
}
last_shard_id = shard_it->first;
}
if (op->list_stable_limit == 0 && first_shard != last_shard)
{
// If that's not a per-PG listing, sort clean entries (already sorted if list_stable_limit != 0)
std::sort(stable, stable+stable_count);
}
int clean_stable_count = stable_count;
// Copy dirty_db entries (sorted, too)
int unstable_count = 0, unstable_alloc = 0;
obj_ver_id *unstable = NULL;
{
auto dirty_it = dirty_db.begin(), dirty_end = dirty_db.end();
if (op->min_oid.inode != 0 || op->min_oid.stripe != 0)
{
dirty_it = dirty_db.lower_bound({
.oid = op->min_oid,
.version = 0,
});
}
if ((max_oid.inode != 0 || max_oid.stripe != 0) && !(max_oid < op->min_oid))
{
dirty_end = dirty_db.upper_bound({
.oid = max_oid,
.version = UINT64_MAX,
});
}
for (; dirty_it != dirty_end; dirty_it++)
{
if (!pg_count || ((dirty_it->first.oid.stripe / pg_stripe_size) % pg_count + 1) == list_pg) // like map_to_pg()
{
if (IS_DELETE(dirty_it->second.state))
{
// Deletions are always stable, so try to zero out two possible entries
if (!replace_stable(dirty_it->first.oid, 0, 0, clean_stable_count, stable))
{
replace_stable(dirty_it->first.oid, 0, clean_stable_count, stable_count, stable);
}
}
else if (IS_STABLE(dirty_it->second.state) || (dirty_it->second.state & BS_ST_INSTANT))
{
// First try to replace a clean stable version in the first part of the list
if (!replace_stable(dirty_it->first.oid, dirty_it->first.version, 0, clean_stable_count, stable))
{
// Then try to replace the last dirty stable version in the second part of the list
if (stable_count > 0 && stable[stable_count-1].oid == dirty_it->first.oid)
{
stable[stable_count-1].version = dirty_it->first.version;
}
else
{
if (stable_count >= stable_alloc)
{
stable_alloc += 32768;
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!nst)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
stable = nst;
}
stable[stable_count++] = dirty_it->first;
}
}
if (op->list_stable_limit > 0 && stable_count >= op->list_stable_limit)
{
// Stop here
break;
}
}
else
{
if (unstable_count >= unstable_alloc)
{
unstable_alloc += 32768;
obj_ver_id *nst = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc);
if (!nst)
{
if (stable)
free(stable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
unstable = nst;
}
unstable[unstable_count++] = dirty_it->first;
}
}
}
}
// Remove zeroed out stable entries
int j = 0;
for (int i = 0; i < stable_count; i++)
{
if (stable[i].version != 0)
{
stable[j++] = stable[i];
}
}
stable_count = j;
if (stable_count+unstable_count > stable_alloc)
{
stable_alloc = stable_count+unstable_count;
obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc);
if (!nst)
{
if (unstable)
free(unstable);
op->retval = -ENOMEM;
FINISH_OP(op);
return;
}
stable = nst;
}
// Copy unstable entries
for (int i = 0; i < unstable_count; i++)
{
stable[j++] = unstable[i];
}
free(unstable);
op->version = stable_count;
op->retval = stable_count+unstable_count;
op->buf = stable;
op->retval = res == 0 ? stable_count+unstable_count : -res;
op->buf = (uint8_t*)result;
FINISH_OP(op);
}
void blockstore_impl_t::dump_diagnostics()
{
journal.dump_diagnostics();
flusher->dump_diagnostics();
}
void blockstore_meta_header_v3_t::set_crc32c()
{
header_csum = 0;
uint32_t calc = crc32c(0, this, version == BLOCKSTORE_META_FORMAT_HEAP
? sizeof(blockstore_meta_header_v3_t) : sizeof(blockstore_meta_header_v2_t));
header_csum = calc;
}
void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expected)
{
if (retval == -EAGAIN)
@@ -741,85 +368,7 @@ void blockstore_impl_t::disk_error_abort(const char *op, int retval, int expecte
exit(1);
}
void blockstore_impl_t::set_no_inode_stats(const std::vector<uint64_t> & pool_ids)
uint64_t blockstore_impl_t::get_free_block_count()
{
for (auto & np: no_inode_stats)
{
np.second = 2;
}
for (auto pool_id: pool_ids)
{
if (!no_inode_stats[pool_id])
recalc_inode_space_stats(pool_id, false);
no_inode_stats[pool_id] = 1;
}
for (auto np_it = no_inode_stats.begin(); np_it != no_inode_stats.end(); )
{
if (np_it->second == 2)
{
recalc_inode_space_stats(np_it->first, true);
no_inode_stats.erase(np_it++);
}
else
np_it++;
}
}
void blockstore_impl_t::recalc_inode_space_stats(uint64_t pool_id, bool per_inode)
{
auto sp_begin = inode_space_stats.lower_bound((pool_id << (64-POOL_ID_BITS)));
auto sp_end = inode_space_stats.lower_bound(((pool_id+1) << (64-POOL_ID_BITS)));
inode_space_stats.erase(sp_begin, sp_end);
auto sh_it = clean_db_shards.lower_bound((pool_id << (64-POOL_ID_BITS)));
while (sh_it != clean_db_shards.end() &&
(sh_it->first >> (64-POOL_ID_BITS)) == pool_id)
{
for (auto & pair: sh_it->second)
{
uint64_t space_id = per_inode ? pair.first.inode : (pool_id << (64-POOL_ID_BITS));
inode_space_stats[space_id] += dsk.data_block_size;
}
sh_it++;
}
object_id last_oid = {};
bool last_exists = false;
auto dirty_it = dirty_db.lower_bound((obj_ver_id){ .oid = { .inode = (pool_id << (64-POOL_ID_BITS)) } });
while (dirty_it != dirty_db.end() && (dirty_it->first.oid.inode >> (64-POOL_ID_BITS)) == pool_id)
{
if (IS_STABLE(dirty_it->second.state) && (IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)))
{
bool exists = false;
if (last_oid == dirty_it->first.oid)
{
exists = last_exists;
}
else
{
auto & clean_db = clean_db_shard(dirty_it->first.oid);
auto clean_it = clean_db.find(dirty_it->first.oid);
exists = clean_it != clean_db.end();
}
uint64_t space_id = per_inode ? dirty_it->first.oid.inode : (pool_id << (64-POOL_ID_BITS));
if (IS_BIG_WRITE(dirty_it->second.state))
{
if (!exists)
inode_space_stats[space_id] += dsk.data_block_size;
last_exists = true;
}
else
{
if (exists)
{
auto & sp = inode_space_stats[space_id];
if (sp > dsk.data_block_size)
sp -= dsk.data_block_size;
else
inode_space_stats.erase(space_id);
}
last_exists = false;
}
last_oid = dirty_it->first.oid;
}
dirty_it++;
}
return dsk.block_count - heap->get_data_used_space()/dsk.data_block_size;
}

View File

@@ -5,6 +5,7 @@
#include "blockstore.h"
#include "blockstore_disk.h"
#include "blockstore_heap.h"
#include <sys/types.h>
#include <sys/ioctl.h>
@@ -19,46 +20,18 @@
#include <deque>
#include <new>
#include <unordered_map>
#include "cpp-btree/btree_map.h"
#include <unordered_set>
#include "malloc_or_die.h"
#include "allocator.h"
//#define BLOCKSTORE_DEBUG
// States are not stored on disk. Instead, they're deduced from the journal
#define BS_ST_SMALL_WRITE 0x01
#define BS_ST_BIG_WRITE 0x02
#define BS_ST_DELETE 0x03
#define BS_ST_WAIT_DEL 0x10
#define BS_ST_WAIT_BIG 0x20
#define BS_ST_IN_FLIGHT 0x30
#define BS_ST_SUBMITTED 0x40
#define BS_ST_WRITTEN 0x50
#define BS_ST_SYNCED 0x60
#define BS_ST_STABLE 0x70
#define BS_ST_INSTANT 0x100
#define IMMEDIATE_NONE 0
#define IMMEDIATE_SMALL 1
#define IMMEDIATE_ALL 2
#define BS_ST_TYPE_MASK 0x0F
#define BS_ST_WORKFLOW_MASK 0xF0
#define IS_IN_FLIGHT(st) (((st) & 0xF0) <= BS_ST_SUBMITTED)
#define IS_STABLE(st) (((st) & 0xF0) == BS_ST_STABLE)
#define IS_SYNCED(st) (((st) & 0xF0) >= BS_ST_SYNCED)
#define IS_JOURNAL(st) (((st) & 0x0F) == BS_ST_SMALL_WRITE)
#define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE)
#define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE)
#define IS_INSTANT(st) (((st) & BS_ST_TYPE_MASK) == BS_ST_DELETE || ((st) & BS_ST_INSTANT))
#define BS_SUBMIT_CHECK_SQES(n) \
if (ringloop->sqes_left() < (n))\
if (ringloop->space_left() < (n))\
{\
/* Pause until there are more requests available */\
PRIV(op)->wait_detail = (n);\
@@ -90,13 +63,6 @@
return 0;\
}
#include "blockstore_journal.h"
// "VITAstor"
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_FORMAT_V1 1
#define BLOCKSTORE_META_FORMAT_V2 2
// metadata header (superblock)
struct __attribute__((__packed__)) blockstore_meta_header_v1_t
{
@@ -121,75 +87,26 @@ struct __attribute__((__packed__)) blockstore_meta_header_v2_t
uint32_t header_csum;
};
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
// per "clean" entry on disk with fixed metadata tables
struct __attribute__((__packed__)) clean_disk_entry
struct __attribute__((__packed__)) blockstore_meta_header_v3_t
{
object_id oid;
uint64_t zero;
uint64_t magic;
uint64_t version;
uint8_t bitmap[];
// Two more fields come after bitmap in metadata version 2:
// uint32_t data_csum[];
// uint32_t entry_csum;
};
uint32_t meta_block_size;
uint32_t data_block_size;
uint32_t bitmap_granularity;
uint32_t data_csum_type;
uint32_t csum_block_size;
uint32_t header_csum;
uint64_t compacted_lsn;
// 32 = 16 + 16 bytes per "clean" entry in memory (object_id => clean_entry)
struct __attribute__((__packed__)) clean_entry
{
uint64_t version;
uint64_t location;
void set_crc32c();
};
// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
struct __attribute__((__packed__)) dirty_entry
{
uint32_t state;
uint32_t flags; // unneeded, but present for alignment
uint64_t location; // location in either journal or data -> in BYTES
uint32_t offset; // data offset within object (stripe)
uint32_t len; // data length
uint64_t journal_sector; // journal sector used for this entry
void* dyn_data; // dynamic data: external bitmap and data block checksums. may be a pointer to the in-memory journal
};
// - Sync must be submitted after previous writes/deletes (not before!)
// - Reads to the same object must be submitted after previous writes/deletes
// are written (not necessarily synced) in their location. This is because we
// rely on read-modify-write for erasure coding and we must return new data
// to calculate parity for subsequent writes
// - Writes may be submitted in any order, because they don't overlap. Each write
// goes into a new location - either on the journal device or on the data device
// - Stable (stabilize) must be submitted after sync of that object is completed
// It's even OK to return an error to the caller if that object is not synced yet
// - Journal trim may be processed only after all versions are moved to
// the main storage AND after all read operations for older versions complete
// - If an operation can not be submitted because the ring is full
// we should stop submission of other operations. Otherwise some "scatter" reads
// may end up blocked for a long time.
// Otherwise, the submit order is free, that is all operations may be submitted immediately
// In fact, adding a write operation must immediately result in dirty_db being populated
// Suspend operation until there are more free SQEs
#define WAIT_SQE 1
// Suspend operation until there are <wait_detail> bytes of free space in the journal on disk
#define WAIT_JOURNAL 3
// Suspend operation until the next journal sector buffer is free
#define WAIT_JOURNAL_BUFFER 4
// Suspend operation until there is some free space on the data device
#define WAIT_FREE 5
struct used_clean_obj_t
{
int refs;
bool was_freed; // was freed by a parallel flush?
bool was_changed; // was changed by a parallel flush?
};
// https://github.com/algorithm-ninja/cpp-btree
// https://github.com/greg7mdp/sparsepp/ was used previously, but it was TERRIBLY slow after resizing
// with sparsepp, random reads dropped to ~700 iops very fast with just as much as ~32k objects in the DB
typedef btree::btree_map<object_id, clean_entry> blockstore_clean_db_t;
typedef std::map<obj_ver_id, dirty_entry> blockstore_dirty_db_t;
// Suspend until something is compacted
#define WAIT_COMPACTION 2
#include "blockstore_init.h"
@@ -202,58 +119,47 @@ struct blockstore_op_private_t
{
// Wait status
int wait_for;
uint64_t wait_detail, wait_detail2;
uint64_t wait_detail;
int pending_ops;
int op_state;
// Read, write, sync, stabilize
uint64_t lsn;
// Read
uint64_t clean_block_used;
std::vector<copy_buffer_t> read_vec;
// Sync, write
uint64_t min_flushed_journal_sector, max_flushed_journal_sector;
// Write
uint64_t location;
bool is_big;
// Stabilize, rollback
int stab_pos;
// Stabilize
uint64_t to_lsn;
// Write
struct iovec iov_zerofill[3];
// Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
uint64_t real_version;
timespec tv_begin;
// Sync
std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
};
typedef uint32_t pool_id_t;
typedef uint64_t pool_pg_id_t;
#define POOL_ID_BITS 16
struct pool_shard_settings_t
{
uint32_t pg_count;
uint32_t pg_stripe_size;
};
#define STAB_SPLIT_DONE 1
#define STAB_SPLIT_WAIT 2
#define STAB_SPLIT_SYNC 3
#define STAB_SPLIT_TODO 4
class blockstore_impl_t
{
public:
blockstore_disk_t dsk;
/******* OPTIONS *******/
bool readonly = false;
// It is safe to disable fsync() if drive write cache is writethrough
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
// Enable if you want every operation to be executed with an "implicit fsync"
// Suitable only for server SSDs with capacitors, requires disabled data and journal fsyncs
int immediate_commit = IMMEDIATE_NONE;
bool inmemory_meta = false;
uint32_t meta_write_recheck_parallelism = 0;
// Maximum and minimum flusher count
unsigned max_flusher_count, min_flusher_count;
unsigned journal_trim_interval;
unsigned max_flusher_count = 0, min_flusher_count = 0;
unsigned journal_trim_interval = 0;
unsigned flusher_start_threshold = 0;
// Maximum queue depth
unsigned max_write_iodepth = 128;
// Enable small (journaled) write throttling, useful for the SSD+HDD case
@@ -268,139 +174,89 @@ class blockstore_impl_t
uint64_t autosync_writes = 128;
// Log level (0-10)
int log_level = 0;
// Enable correct block checksum validation on objects updated with small writes when checksum block
// is larger than bitmap_granularity, at the expense of extra metadata fsyncs during compaction
bool perfect_csum_update = false;
/******* END OF OPTIONS *******/
struct ring_consumer_t ring_consumer;
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
std::map<uint64_t, int> no_inode_stats;
uint8_t *clean_bitmaps = NULL;
blockstore_dirty_db_t dirty_db;
blockstore_heap_t *heap = NULL;
uint8_t* meta_superblock = NULL;
uint8_t *buffer_area = NULL;
std::vector<blockstore_op_t*> submit_queue;
std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;
int unsynced_big_write_count = 0, unstable_unsynced = 0;
int unsynced_big_write_count = 0, unsynced_small_write_count = 0, unsynced_meta_write_count = 0;
int unsynced_queued_ops = 0;
allocator_t *data_alloc = NULL;
uint64_t used_blocks = 0;
uint8_t *zero_object = NULL;
void *metadata_buffer = NULL;
struct journal_t journal;
journal_flusher_t *flusher;
int big_to_flush = 0;
int write_iodepth = 0;
bool alloc_dyn_data = false;
// clean data blocks referenced by read operations
std::map<uint64_t, used_clean_obj_t> used_clean_objects;
int inflight_big = 0;
bool fsyncing_data = false;
bool live = false, queue_stall = false;
ring_loop_t *ringloop;
timerfd_manager_t *tfd;
ring_loop_i *ringloop = NULL;
timerfd_manager_t *tfd = NULL;
bool stop_sync_submitted;
bool stop_sync_submitted = false;
inline struct io_uring_sqe* get_sqe()
{
return ringloop->get_sqe();
}
friend class blockstore_init_meta;
friend class blockstore_init_journal;
friend struct blockstore_journal_check_t;
friend class journal_flusher_t;
friend class journal_flusher_co;
void calc_lengths();
void open_data();
void open_meta();
void open_journal();
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
blockstore_clean_db_t& clean_db_shard(object_id oid);
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
void recalc_inode_space_stats(uint64_t pool_id, bool per_inode);
// Journaling
void prepare_journal_sector_write(int sector, blockstore_op_t *op);
void handle_journal_write(ring_data_t *data, uint64_t flush_id);
void disk_error_abort(const char *op, int retval, int expected);
// Asynchronous init
int initialized;
int metadata_buf_size;
blockstore_init_meta* metadata_init_reader;
blockstore_init_journal* journal_init_reader;
void init();
void check_wait(blockstore_op_t *op);
void init_op(blockstore_op_t *op);
// Read
int dequeue_read(blockstore_op_t *read_op);
int dequeue_read(blockstore_op_t *op);
int fulfill_read(blockstore_op_t *op);
uint32_t prepare_read(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
uint32_t prepare_read_with_bitmaps(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
uint32_t prepare_read_zero(std::vector<copy_buffer_t> & read_vec, uint32_t start, uint32_t end);
uint32_t prepare_read_simple(std::vector<copy_buffer_t> & read_vec, heap_object_t *obj, heap_write_t *wr, uint32_t start, uint32_t end);
void prepare_disk_read(std::vector<copy_buffer_t> & read_vec, int pos, heap_object_t *obj, heap_write_t *wr,
uint32_t blk_start, uint32_t blk_end, uint32_t start, uint32_t end, uint32_t copy_flags);
void find_holes(std::vector<copy_buffer_t> & read_vec, uint32_t item_start, uint32_t item_end,
std::function<int(int, bool, uint32_t, uint32_t)> callback);
int fulfill_read(blockstore_op_t *read_op,
uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location,
uint64_t journal_sector, uint8_t *csum, int *dyn_data);
bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data,
uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
uint64_t dirty_offset, uint64_t dirty_end, uint64_t dirty_loc, uint8_t *csum_ptr, int *dyn_data,
uint64_t offset, uint64_t submit_len, uint64_t & blk_begin, uint64_t & blk_end, uint8_t* & blk_buf);
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_clean_padded_checksums(blockstore_op_t *op, uint64_t clean_loc, uint8_t *dyn_data, bool from_journal,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
int fulfill_read_push(blockstore_op_t *op, void *buf, uint64_t offset, uint64_t len,
uint32_t item_state, uint64_t item_version);
std::function<void(int&, uint32_t, uint32_t)> callback);
void free_read_buffers(std::vector<copy_buffer_t> & rv);
void handle_read_event(ring_data_t *data, blockstore_op_t *op);
bool verify_read_checksums(blockstore_op_t *op);
// Write
bool enqueue_write(blockstore_op_t *op);
void cancel_all_writes(blockstore_op_t *op, blockstore_dirty_db_t::iterator dirty_it, int retval);
void prepare_meta_block_write(blockstore_op_t *op, uint64_t modified_block, io_uring_sqe *sqe = NULL);
int dequeue_write(blockstore_op_t *op);
int dequeue_del(blockstore_op_t *op);
int make_big_write(blockstore_op_t *op, uint32_t offset, uint32_t len, uint32_t *modified_block, uint32_t *moved_from_block);
int continue_write(blockstore_op_t *op);
void release_journal_sectors(blockstore_op_t *op);
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
// Sync
int continue_sync(blockstore_op_t *op);
void ack_sync(blockstore_op_t *op);
bool submit_fsyncs(int & wait_count);
int do_sync(blockstore_op_t *op, int base_state);
// Stabilize
int dequeue_stable(blockstore_op_t *op);
int continue_stable(blockstore_op_t *op);
void mark_stable(obj_ver_id ov, bool forget_dirty = false);
void stabilize_object(object_id oid, uint64_t max_ver);
blockstore_op_t* selective_sync(blockstore_op_t *op);
int split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider);
// Rollback
int dequeue_rollback(blockstore_op_t *op);
int continue_rollback(blockstore_op_t *op);
void mark_rolled_back(const obj_ver_id & ov);
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);
void free_dirty_dyn_data(dirty_entry & e);
// List
void process_list(blockstore_op_t *op);
public:
/*public:*/
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
blockstore_impl_t(blockstore_config_t & config, ring_loop_i *ringloop, timerfd_manager_t *tfd, bool mock_mode = false);
~blockstore_impl_t();
void parse_config(blockstore_config_t & config, bool init);
@@ -426,21 +282,13 @@ public:
// Simplified synchronous operation: get object bitmap & current version
int read_bitmap(object_id oid, uint64_t target_version, void *bitmap, uint64_t *result_version = NULL);
// Unstable writes are added here (map of object_id -> version)
std::unordered_map<object_id, uint64_t> unstable_writes;
// Space usage statistics
std::map<uint64_t, uint64_t> inode_space_stats;
// Set per-pool no_inode_stats
void set_no_inode_stats(const std::vector<uint64_t> & pool_ids);
// Print diagnostics to stdout
void dump_diagnostics();
const std::map<uint64_t, uint64_t> & get_inode_space_stats() { return heap->get_inode_space_stats(); }
inline uint32_t get_block_size() { return dsk.data_block_size; }
inline uint64_t get_block_count() { return dsk.block_count; }
inline uint64_t get_free_block_count() { return dsk.block_count - used_blocks; }
uint64_t get_free_block_count();
inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; }
inline uint64_t get_journal_size() { return dsk.journal_len; }
};

File diff suppressed because it is too large Load Diff

View File

@@ -25,47 +25,10 @@ class blockstore_init_meta
uint64_t next_offset = 0;
uint64_t last_read_offset = 0;
uint64_t entries_loaded = 0;
unsigned entries_per_block = 0;
int i = 0, j = 0;
std::vector<uint64_t> entries_to_zero;
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
void handle_event(ring_data_t *data, int buf_num);
public:
blockstore_init_meta(blockstore_impl_t *bs);
int loop();
};
struct bs_init_journal_done
{
void *buf;
uint64_t pos, len;
};
class blockstore_init_journal
{
blockstore_impl_t *bs;
int wait_state = 0, wait_count = 0, handle_res = 0;
uint64_t entries_loaded = 0;
uint32_t crc32_last = 0;
bool started = false;
uint64_t next_free;
std::vector<bs_init_journal_done> done;
std::vector<obj_ver_id> double_allocs;
std::vector<iovec> small_write_data;
uint64_t journal_pos = 0;
uint64_t continue_pos = 0;
void *init_write_buf = NULL;
uint64_t init_write_sector = 0;
bool wrapped = false;
void *submitted_buf;
struct io_uring_sqe *sqe;
struct ring_data_t *data;
journal_entry_start *je_start;
std::function<void(ring_data_t*)> simple_callback;
int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
void handle_event(ring_data_t *data);
void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
public:
blockstore_init_journal(blockstore_impl_t* bs);
int loop();
};

View File

@@ -1,356 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
blockstore_journal_check_t::blockstore_journal_check_t(blockstore_impl_t *bs)
{
this->bs = bs;
sectors_to_write = 0;
next_pos = bs->journal.next_free;
next_sector = bs->journal.cur_sector;
first_sector = -1;
next_in_pos = bs->journal.in_sector_pos;
right_dir = next_pos >= bs->journal.used_start;
}
// Check if we can write <required> entries of <size> bytes and <data_after> data bytes after them to the journal
int blockstore_journal_check_t::check_available(blockstore_op_t *op, int entries_required, int size, int data_after)
{
uint64_t prev_next = next_sector;
int required = entries_required;
while (1)
{
int fits = bs->journal.no_same_sector_overwrites && next_pos == bs->journal.next_free && bs->journal.sector_info[next_sector].written
? 0
: (bs->journal.block_size - next_in_pos) / size;
if (fits > 0)
{
if (fits > required)
{
fits = required;
}
if (first_sector == -1)
{
first_sector = next_sector;
}
required -= fits;
next_in_pos += fits * size;
if (next_sector != prev_next || !sectors_to_write)
{
// Except the previous call to this function
sectors_to_write++;
}
}
else if (bs->journal.sector_info[next_sector].dirty)
{
if (next_sector != prev_next || !sectors_to_write)
{
// Except the previous call to this function
sectors_to_write++;
}
}
if (required <= 0)
{
break;
}
next_pos = next_pos + bs->journal.block_size;
if (next_pos >= bs->journal.len)
{
next_pos = bs->journal.block_size;
right_dir = false;
}
next_in_pos = 0;
next_sector = ((next_sector + 1) % bs->journal.sector_count);
if (next_sector == first_sector)
{
// next_sector may wrap when all sectors are flushed and the incoming batch is too big
// This is an error condition, we can't wait for anything in this case
throw std::runtime_error(
"Blockstore journal_sector_buffer_count="+std::to_string(bs->journal.sector_count)+
" is too small for a batch of "+std::to_string(entries_required)+" entries of "+std::to_string(size)+" bytes"
);
}
if (bs->journal.sector_info[next_sector].flush_count > 0 ||
bs->journal.sector_info[next_sector].dirty)
{
// No memory buffer available. Wait for it.
int used = 0, dirty = 0;
for (int i = 0; i < bs->journal.sector_count; i++)
{
if (bs->journal.sector_info[i].dirty)
{
dirty++;
used++;
}
if (bs->journal.sector_info[i].flush_count > 0)
{
used++;
}
}
// In fact, it's even more rare than "ran out of journal space", so print a warning
printf(
"Ran out of journal sector buffers: %d/%ju buffers used (%d dirty), next buffer (%jd)"
" is %s and flushed %ju times. Consider increasing \'journal_sector_buffer_count\'\n",
used, bs->journal.sector_count, dirty, next_sector,
bs->journal.sector_info[next_sector].dirty ? "dirty" : "not dirty",
bs->journal.sector_info[next_sector].flush_count
);
PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
return 0;
}
}
if (data_after > 0)
{
next_pos = next_pos + data_after;
if (next_pos >= bs->journal.len)
{
if (right_dir)
next_pos = bs->journal.block_size + data_after;
right_dir = false;
}
}
if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
{
// No space in the journal. Wait until used_start changes.
if (bs->log_level > 5)
{
printf(
"Ran out of journal space (used_start=%08jx, next_free=%08jx, dirty_start=%08jx)\n",
bs->journal.used_start, bs->journal.next_free, bs->journal.dirty_start
);
}
PRIV(op)->wait_for = WAIT_JOURNAL;
bs->flusher->request_trim();
PRIV(op)->wait_detail = bs->journal.used_start;
return 0;
}
return 1;
}
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size)
{
if (!journal.entry_fits(size))
{
assert(!journal.sector_info[journal.cur_sector].dirty);
// Move to the next journal sector
if (journal.sector_info[journal.cur_sector].flush_count > 0)
{
// Also select next sector buffer in memory
journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
assert(!journal.sector_info[journal.cur_sector].flush_count);
}
else
{
journal.dirty_start = journal.next_free;
}
journal.sector_info[journal.cur_sector].written = false;
journal.sector_info[journal.cur_sector].offset = journal.next_free;
journal.in_sector_pos = 0;
auto next_next_free = (journal.next_free+journal.block_size) < journal.len ? journal.next_free + journal.block_size : journal.block_size;
// double check that next_free doesn't cross used_start from the left
assert(journal.next_free >= journal.used_start && next_next_free >= journal.next_free || next_next_free < journal.used_start);
journal.next_free = next_next_free;
memset(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector, 0, journal.block_size);
}
journal_entry *je = (struct journal_entry*)(
(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[journal.cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*journal.cur_sector) + journal.in_sector_pos
);
journal.in_sector_pos += size;
je->magic = JOURNAL_MAGIC;
je->type = type;
je->size = size;
je->crc32_prev = journal.crc32_last;
journal.sector_info[journal.cur_sector].dirty = true;
return je;
}
void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_op_t *op)
{
// Don't submit the same sector twice in the same batch
if (!journal.sector_info[cur_sector].submit_id)
{
io_uring_sqe *sqe = get_sqe();
// Caller must ensure availability of an SQE
assert(sqe != NULL);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
journal.sector_info[cur_sector].written = true;
journal.sector_info[cur_sector].submit_id = ++journal.submit_id;
assert(journal.submit_id != 0); // check overflow
journal.submitting_sectors.push_back(cur_sector);
journal.sector_info[cur_sector].flush_count++;
data->iov = (struct iovec){
(journal.inmemory
? (uint8_t*)journal.buffer + journal.sector_info[cur_sector].offset
: (uint8_t*)journal.sector_buf + journal.block_size*cur_sector),
(size_t)journal.block_size
};
data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
my_uring_prep_writev(
sqe, dsk.journal_fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
);
}
journal.sector_info[cur_sector].dirty = false;
// But always remember that this operation has to wait until this exact journal write is finished
journal.flushing_ops.emplace(journal.sector_info[cur_sector].submit_id, (pending_journaling_t){
.pending = 1,
.sector = cur_sector,
.op = op,
});
auto priv = PRIV(op);
priv->pending_ops++;
if (!priv->min_flushed_journal_sector)
priv->min_flushed_journal_sector = 1+cur_sector;
assert(priv->min_flushed_journal_sector <= journal.sector_count);
priv->max_flushed_journal_sector = 1+cur_sector;
}
void blockstore_impl_t::handle_journal_write(ring_data_t *data, uint64_t flush_id)
{
live = true;
if (data->res != data->iov.iov_len)
{
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
disk_error_abort("journal write", data->res, data->iov.iov_len);
}
auto fl_it = journal.flushing_ops.lower_bound(flush_id);
if (fl_it != journal.flushing_ops.end() && fl_it->first == flush_id && fl_it->second.sector >= 0)
{
journal.sector_info[fl_it->second.sector].flush_count--;
}
auto is_first = fl_it == journal.flushing_ops.begin();
while (fl_it != journal.flushing_ops.end())
{
bool del = false;
if (fl_it->first == flush_id)
{
fl_it->second.pending = 0;
del = is_first;
}
else
{
del = !fl_it->second.pending;
}
if (del)
{
// Do not complete this operation if previous writes are unfinished
// Otherwise also complete following operations waiting for this one
auto priv = PRIV(fl_it->second.op);
priv->pending_ops--;
assert(priv->pending_ops >= 0);
if (priv->pending_ops == 0)
{
release_journal_sectors(fl_it->second.op);
priv->op_state++;
ringloop->wakeup();
}
journal.flushing_ops.erase(fl_it++);
}
else
{
fl_it++;
}
}
}
journal_t::~journal_t()
{
if (sector_buf)
free(sector_buf);
if (sector_info)
free(sector_info);
if (buffer)
free(buffer);
sector_buf = NULL;
sector_info = NULL;
buffer = NULL;
}
uint64_t journal_t::get_trim_pos()
{
auto journal_used_it = used_sectors.lower_bound(used_start);
if (journal_used_it == used_sectors.end())
{
// Journal is cleared to its end, restart from the beginning
journal_used_it = used_sectors.begin();
if (journal_used_it == used_sectors.end())
{
// Journal is empty
return next_free;
}
else
{
// next_free does not need updating during trim
#ifdef BLOCKSTORE_DEBUG
printf(
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
used_start, next_free, dirty_start,
journal_used_it->first, journal_used_it->second
);
#endif
return journal_used_it->first;
}
}
else if (journal_used_it->first > used_start)
{
// Journal is cleared up to <journal_used_it>
#ifdef BLOCKSTORE_DEBUG
printf(
"Trimming journal (used_start=%08jx, next_free=%08jx, dirty_start=%08jx, new_start=%08jx, new_refcount=%jd)\n",
used_start, next_free, dirty_start,
journal_used_it->first, journal_used_it->second
);
#endif
return journal_used_it->first;
}
// Can't trim journal
return used_start;
}
void journal_t::dump_diagnostics()
{
auto journal_used_it = used_sectors.lower_bound(used_start);
if (journal_used_it == used_sectors.end())
{
// Journal is cleared to its end, restart from the beginning
journal_used_it = used_sectors.begin();
}
printf(
"Journal: used_start=%08jx next_free=%08jx dirty_start=%08jx trim_to=%08jx trim_to_refs=%jd\n",
used_start, next_free, dirty_start,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->first,
journal_used_it == used_sectors.end() ? 0 : journal_used_it->second
);
}
static uint64_t zero_page[4096];
uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad)
{
uint32_t r = prev_crc;
while (left_pad >= 4096)
{
r = crc32c(r, zero_page, 4096);
left_pad -= 4096;
}
if (left_pad > 0)
r = crc32c(r, zero_page, left_pad);
r = crc32c(r, buf, len);
while (right_pad >= 4096)
{
r = crc32c(r, zero_page, 4096);
right_pad -= 4096;
}
if (left_pad > 0)
r = crc32c(r, zero_page, right_pad);
return r;
}
uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad)
{
return crc32c(0, buf, len);
}

View File

@@ -2,6 +2,7 @@
// License: VNPL-1.1 (see README.md for details)
#include <sys/file.h>
#include <stdexcept>
#include "blockstore_impl.h"
void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
@@ -14,12 +15,14 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
}
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
journal_trim_interval = strtoull(config["journal_trim_interval"].c_str(), NULL, 10);
flusher_start_threshold = strtoull(config["flusher_start_threshold"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
perfect_csum_update = config["perfect_csum_update"] == "true" || config["perfect_csum_update"] == "1" || config["perfect_csum_update"] == "yes";
if (config["autosync_writes"] != "")
{
autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10);
@@ -28,13 +31,17 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
max_flusher_count = 256;
}
if (!min_flusher_count || journal.flush_journal)
if (!min_flusher_count)
{
min_flusher_count = 1;
}
if (!journal_trim_interval)
{
journal_trim_interval = 512;
journal_trim_interval = 1024;
}
if (!flusher_start_threshold)
{
flusher_start_threshold = 32;
}
if (!max_write_iodepth)
{
@@ -68,23 +75,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
readonly = true;
}
if (config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes")
{
disable_data_fsync = true;
}
if (config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes")
{
disable_meta_fsync = true;
}
if (config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes")
{
disable_journal_fsync = true;
}
if (config["flush_journal"] == "true" || config["flush_journal"] == "1" || config["flush_journal"] == "yes")
{
// Only flush journal and exit
journal.flush_journal = true;
}
if (config["immediate_commit"] == "all")
{
immediate_commit = IMMEDIATE_ALL;
@@ -94,85 +84,23 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
immediate_commit = IMMEDIATE_SMALL;
}
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
inmemory_meta = config["inmemory_metadata"] != "false" && config["inmemory_metadata"] != "0" &&
config["inmemory_metadata"] != "no";
journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
journal.inmemory = config["inmemory_journal"] != "false" && config["inmemory_journal"] != "0" &&
config["inmemory_journal"] != "no";
meta_write_recheck_parallelism = strtoull(config["meta_write_recheck_parallelism"].c_str(), NULL, 10);
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
// Validate
if (journal.sector_count < 2)
{
journal.sector_count = 32;
}
if (metadata_buf_size < 65536)
{
metadata_buf_size = 4*1024*1024;
}
if (dsk.meta_device == dsk.data_device)
if (!meta_write_recheck_parallelism)
{
disable_meta_fsync = disable_data_fsync;
meta_write_recheck_parallelism = 16;
}
if (dsk.journal_device == dsk.meta_device)
{
disable_journal_fsync = disable_meta_fsync;
}
if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
if (immediate_commit != IMMEDIATE_NONE && !dsk.disable_journal_fsync)
{
throw std::runtime_error("immediate_commit requires disable_journal_fsync");
}
if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
if (immediate_commit == IMMEDIATE_ALL && !dsk.disable_data_fsync)
{
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
}
// init some fields
journal.block_size = dsk.journal_block_size;
journal.next_free = dsk.journal_block_size;
journal.used_start = dsk.journal_block_size;
// no free space because sector is initially unmapped
journal.in_sector_pos = dsk.journal_block_size;
}
void blockstore_impl_t::calc_lengths()
{
dsk.calc_lengths();
journal.len = dsk.journal_len;
journal.block_size = dsk.journal_block_size;
journal.offset = dsk.journal_offset;
if (inmemory_meta)
{
metadata_buffer = memalign(MEM_ALIGNMENT, dsk.meta_len);
if (!metadata_buffer)
throw std::runtime_error("Failed to allocate memory for the metadata ("+std::to_string(dsk.meta_len/1024/1024)+" MB)");
}
else if (dsk.clean_entry_bitmap_size || dsk.data_csum_type)
{
clean_bitmaps = (uint8_t*)malloc(dsk.block_count * 2 * dsk.clean_entry_bitmap_size);
if (!clean_bitmaps)
{
throw std::runtime_error(
"Failed to allocate memory for the metadata sparse write bitmap ("+
std::to_string(dsk.block_count * 2 * dsk.clean_entry_bitmap_size / 1024 / 1024)+" MB)"
);
}
}
if (journal.inmemory)
{
journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
if (!journal.buffer)
throw std::runtime_error("Failed to allocate memory for journal ("+std::to_string(journal.len/1024/1024)+" MB)");
}
else
{
journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * dsk.journal_block_size);
if (!journal.sector_buf)
throw std::bad_alloc();
}
journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t));
if (!journal.sector_info)
{
throw std::bad_alloc();
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,258 +0,0 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "blockstore_impl.h"
int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_rollback(op);
}
int r = split_stab_op(op, [this](obj_ver_id ov)
{
// Check that there are some versions greater than v->version (which may be zero),
// check that they're unstable, synced, and not currently written to
auto dirty_it = dirty_db.lower_bound((obj_ver_id){
.oid = ov.oid,
.version = UINT64_MAX,
});
if (dirty_it == dirty_db.begin())
{
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
}
else
{
dirty_it--;
if (dirty_it->first.oid != ov.oid || dirty_it->first.version < ov.version)
{
// Already rolled back, skip this object version
return STAB_SPLIT_DONE;
}
while (dirty_it->first.oid == ov.oid && dirty_it->first.version > ov.version)
{
if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return STAB_SPLIT_WAIT;
}
else if (!IS_SYNCED(dirty_it->second.state) ||
IS_STABLE(dirty_it->second.state))
{
// Sync the object
return STAB_SPLIT_SYNC;
}
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
}
return STAB_SPLIT_TODO;
}
});
if (r != 1)
{
return r;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, op->len, sizeof(journal_entry_rollback), 0))
{
return 0;
}
// There is sufficient space. Check SQEs
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
int s = 0;
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
{
if (!journal.entry_fits(sizeof(journal_entry_rollback)) &&
journal.sector_info[journal.cur_sector].dirty)
{
prepare_journal_sector_write(journal.cur_sector, op);
s++;
}
journal_entry_rollback *je = (journal_entry_rollback*)
prefill_single_journal_entry(journal, JE_ROLLBACK, sizeof(journal_entry_rollback));
je->oid = v->oid;
je->version = v->version;
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
}
prepare_journal_sector_write(journal.cur_sector, op);
s++;
assert(s == space_check.sectors_to_write);
PRIV(op)->op_state = 1;
return 1;
}
int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
{
if (PRIV(op)->op_state == 2)
goto resume_2;
else if (PRIV(op)->op_state == 4)
goto resume_4;
else
return 1;
resume_2:
if (!disable_journal_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 3;
return 1;
}
resume_4:
obj_ver_id* v;
int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
mark_rolled_back(*v);
}
// Acknowledge op
op->retval = 0;
FINISH_OP(op);
return 2;
}
void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov)
{
auto it = dirty_db.lower_bound((obj_ver_id){
.oid = ov.oid,
.version = UINT64_MAX,
});
if (it != dirty_db.begin())
{
uint64_t max_unstable = 0;
auto rm_start = it;
auto rm_end = it;
it--;
while (1)
{
if (it->first.oid != ov.oid)
break;
else if (it->first.version <= ov.version)
{
if (!IS_STABLE(it->second.state))
max_unstable = it->first.version;
break;
}
else if (IS_IN_FLIGHT(it->second.state) || IS_STABLE(it->second.state))
break;
// Remove entry
rm_start = it;
if (it == dirty_db.begin())
break;
it--;
}
if (rm_start != rm_end)
{
erase_dirty(rm_start, rm_end, UINT64_MAX);
auto unstab_it = unstable_writes.find(ov.oid);
if (unstab_it != unstable_writes.end())
{
if (max_unstable == 0)
unstable_writes.erase(unstab_it);
else
unstab_it->second = max_unstable;
}
}
}
}
void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc)
{
if (dirty_end == dirty_start)
{
return;
}
auto dirty_it = dirty_end;
dirty_it--;
if (IS_DELETE(dirty_it->second.state))
{
object_id oid = dirty_it->first.oid;
#ifdef BLOCKSTORE_DEBUG
printf("Unblock writes-after-delete %jx:%jx v%ju\n", oid.inode, oid.stripe, dirty_it->first.version);
#endif
dirty_it = dirty_end;
// Unblock operations blocked by delete flushing
uint32_t next_state = BS_ST_IN_FLIGHT;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == oid)
{
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_DEL)
{
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | next_state;
if (IS_BIG_WRITE(dirty_it->second.state))
{
next_state = BS_ST_WAIT_BIG;
}
}
dirty_it++;
}
dirty_it = dirty_end;
dirty_it--;
}
while (1)
{
if ((IS_BIG_WRITE(dirty_it->second.state) || IS_DELETE(dirty_it->second.state)) &&
IS_STABLE(dirty_it->second.state))
{
big_to_flush--;
}
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc &&
dirty_it->second.location != UINT64_MAX)
{
#ifdef BLOCKSTORE_DEBUG
printf("Free block %ju from %jx:%jx v%ju\n", dirty_it->second.location >> dsk.block_order,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
}
auto used = --journal.used_sectors.at(dirty_it->second.journal_sector);
#ifdef BLOCKSTORE_DEBUG
printf(
"remove usage of journal offset %08jx by %jx:%jx v%ju (%ju refs)\n", dirty_it->second.journal_sector,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, used
);
#endif
if (used == 0)
{
journal.used_sectors.erase(dirty_it->second.journal_sector);
if (dirty_it->second.journal_sector == journal.sector_info[journal.cur_sector].offset)
{
// Mark current sector as "full" to select the new one
journal.in_sector_pos = dsk.journal_block_size;
}
flusher->mark_trim_possible();
}
free_dirty_dyn_data(dirty_it->second);
if (dirty_it == dirty_start)
{
break;
}
dirty_it--;
}
dirty_db.erase(dirty_start, dirty_end);
}
void blockstore_impl_t::free_dirty_dyn_data(dirty_entry & e)
{
if (e.dyn_data)
{
if (alloc_dyn_data &&
--*((int*)e.dyn_data) == 0) // refcount
{
// dyn_data contains the bitmap and checksums
// free it if it doesn't refer to the in-memory journal
free(e.dyn_data);
}
e.dyn_data = NULL;
}
}

View File

@@ -3,559 +3,87 @@
#include "blockstore_impl.h"
// Stabilize small write:
// 1) Copy data from the journal to the data device
// 2) Increase version on the metadata device and sync it
// 3) Advance clean_db entry's version, clear previous journal entries
//
// This makes 1 4K small write+sync look like:
// 512b+4K (journal) + sync + 512b (journal) + sync + 4K (data) [+ sync?] + 512b (metadata) + sync.
// WA = 2.375. It's not the best, SSD FTL-like redirect-write could probably be lower
// even with defragmentation. But it's fixed and it's still better than in Ceph. :)
// except for HDD-only clusters, because each write results in 3 seeks.
// Stabilize big write:
// 1) Copy metadata from the journal to the metadata device
// 2) Move dirty_db entry to clean_db and clear previous journal entries
//
// This makes 1 128K big write+sync look like:
// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
// WA = 1.012. Very good :)
// Stabilize delete:
// 1) Remove metadata entry and sync it
// 2) Remove dirty_db entry and clear previous journal entries
// We have 2 problems here:
// - In the cluster environment, we must store the "tombstones" of deleted objects until
// all replicas (not just quorum) agrees about their deletion. That is, "stabilize" is
// not possible for deletes in degraded placement groups
// - With simple "fixed" metadata tables we can't just clear the metadata entry of the latest
// object version. We must clear all previous entries, too.
// FIXME Fix both problems - probably, by switching from "fixed" metadata tables to "dynamic"
// AND We must do it in batches, for the sake of reduced fsync call count
// AND We must know what we stabilize. Basic workflow is like:
// 1) primary OSD receives sync request
// 2) it submits syncs to blockstore and peers
// 3) after everyone acks sync it acks sync to the client
// 4) after a while it takes his synced object list and sends stabilize requests
// to peers and to its own blockstore, thus freeing the old version
struct ver_vector_t
{
obj_ver_id *items = NULL;
uint64_t alloc = 0, size = 0;
};
static void init_versions(ver_vector_t & vec, obj_ver_id *start, obj_ver_id *end, uint64_t len)
{
if (!vec.items)
{
vec.alloc = len;
vec.items = (obj_ver_id*)malloc_or_die(sizeof(obj_ver_id) * vec.alloc);
for (auto sv = start; sv < end; sv++)
{
vec.items[vec.size++] = *sv;
}
}
}
static void append_version(ver_vector_t & vec, obj_ver_id ov)
{
if (vec.size >= vec.alloc)
{
vec.alloc = !vec.alloc ? 4 : vec.alloc*2;
vec.items = (obj_ver_id*)realloc_or_die(vec.items, sizeof(obj_ver_id) * vec.alloc);
}
vec.items[vec.size++] = ov;
}
static bool check_unsynced(std::vector<obj_ver_id> & check, obj_ver_id ov, std::vector<obj_ver_id> & to, int *count)
{
bool found = false;
int j = 0, k = 0;
while (j < check.size())
{
if (check[j] == ov)
found = true;
if (check[j].oid == ov.oid && check[j].version <= ov.version)
{
to.push_back(check[j++]);
if (count)
(*count)--;
}
else
check[k++] = check[j++];
}
check.resize(k);
return found;
}
blockstore_op_t* blockstore_impl_t::selective_sync(blockstore_op_t *op)
{
unsynced_big_write_count -= unsynced_big_writes.size();
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
unsynced_big_write_count += unsynced_big_writes.size();
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
// Create a sync operation, insert into the end of the queue
// And move ourselves into the end too!
// Rather hacky but that's what we need...
blockstore_op_t *sync_op = new blockstore_op_t;
sync_op->opcode = BS_OP_SYNC;
sync_op->buf = NULL;
sync_op->callback = [](blockstore_op_t *sync_op)
{
delete sync_op;
};
init_op(sync_op);
int sync_res = continue_sync(sync_op);
if (sync_res != 2)
{
// Put SYNC into the queue if it's not finished yet
submit_queue.push_back(sync_op);
}
// Restore unsynced_writes
unsynced_small_writes.swap(PRIV(op)->sync_small_writes);
unsynced_big_write_count -= unsynced_big_writes.size();
unsynced_big_writes.swap(PRIV(op)->sync_big_writes);
unsynced_big_write_count += unsynced_big_writes.size();
if (sync_res == 2)
{
// Sync is immediately completed
return NULL;
}
return sync_op;
}
// Returns: 2 = stop processing and dequeue, 0 = stop processing and do not dequeue, 1 = proceed with op itself
int blockstore_impl_t::split_stab_op(blockstore_op_t *op, std::function<int(obj_ver_id v)> decider)
{
bool add_sync = false;
ver_vector_t good_vers, bad_vers;
obj_ver_id* v;
int i, todo = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
int action = decider(*v);
if (action < 0)
{
// Rollback changes
for (auto & ov: PRIV(op)->sync_big_writes)
{
unsynced_big_writes.push_back(ov);
unsynced_big_write_count++;
}
for (auto & ov: PRIV(op)->sync_small_writes)
{
unsynced_small_writes.push_back(ov);
}
free(good_vers.items);
good_vers.items = NULL;
free(bad_vers.items);
bad_vers.items = NULL;
// Error
op->retval = action;
FINISH_OP(op);
return 2;
}
else if (action == STAB_SPLIT_DONE)
{
// Already done
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
}
else if (action == STAB_SPLIT_WAIT)
{
// Already in progress, we just have to wait until it finishes
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
append_version(bad_vers, *v);
}
else if (action == STAB_SPLIT_SYNC)
{
// Needs a SYNC, we have to send a SYNC if not already in progress
//
// If the object is not present in unsynced_(big|small)_writes then
// it's currently being synced. If it's present then we can initiate
// its sync ourselves.
init_versions(good_vers, (obj_ver_id*)op->buf, v, op->len);
append_version(bad_vers, *v);
if (!add_sync)
{
PRIV(op)->sync_big_writes.clear();
PRIV(op)->sync_small_writes.clear();
add_sync = true;
}
check_unsynced(unsynced_small_writes, *v, PRIV(op)->sync_small_writes, NULL);
check_unsynced(unsynced_big_writes, *v, PRIV(op)->sync_big_writes, &unsynced_big_write_count);
}
else /* if (action == STAB_SPLIT_TODO) */
{
if (good_vers.items)
{
// If we're selecting versions then append it
// Main idea is that 99% of the time all versions passed to BS_OP_STABLE are synced
// And we don't want to select/allocate anything in that optimistic case
append_version(good_vers, *v);
}
todo++;
}
}
// In a pessimistic scenario, an operation may be split into 3:
// - Stabilize synced entries
// - Sync unsynced entries
// - Continue for unsynced entries after sync
add_sync = add_sync && (PRIV(op)->sync_big_writes.size() || PRIV(op)->sync_small_writes.size());
if (!todo && !bad_vers.size)
{
// Already stable
op->retval = 0;
FINISH_OP(op);
return 2;
}
op->retval = 0;
if (!todo && !add_sync)
{
// Only wait for inflight writes or current in-progress syncs
return 0;
}
blockstore_op_t *sync_op = NULL, *split_stab_op = NULL;
if (add_sync)
{
// Initiate a selective sync for PRIV(op)->sync_(big|small)_writes
sync_op = selective_sync(op);
}
if (bad_vers.size)
{
// Split part of the request into a separate operation
split_stab_op = new blockstore_op_t;
split_stab_op->opcode = op->opcode;
split_stab_op->buf = bad_vers.items;
split_stab_op->len = bad_vers.size;
init_op(split_stab_op);
submit_queue.push_back(split_stab_op);
}
if (sync_op || split_stab_op || good_vers.items)
{
void *orig_buf = op->buf;
if (good_vers.items)
{
op->buf = good_vers.items;
op->len = good_vers.size;
}
// Make a wrapped callback
int *split_op_counter = (int*)malloc_or_die(sizeof(int));
*split_op_counter = (sync_op ? 1 : 0) + (split_stab_op ? 1 : 0) + (todo ? 1 : 0);
auto cb = [op, good_items = good_vers.items,
bad_items = bad_vers.items, split_op_counter,
orig_buf, real_cb = op->callback](blockstore_op_t *split_op)
{
if (split_op->retval != 0)
op->retval = split_op->retval;
(*split_op_counter)--;
assert((*split_op_counter) >= 0);
if (op != split_op)
delete split_op;
if (!*split_op_counter)
{
free(good_items);
free(bad_items);
free(split_op_counter);
op->buf = orig_buf;
real_cb(op);
}
};
if (sync_op)
{
sync_op->callback = cb;
}
if (split_stab_op)
{
split_stab_op->callback = cb;
}
op->callback = cb;
}
if (!todo)
{
// All work is postponed
op->callback = NULL;
return 2;
}
return 1;
}
// Handles both stabilize (commit) and rollback
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
{
if (PRIV(op)->op_state)
{
return continue_stable(op);
}
int r = split_stab_op(op, [this](obj_ver_id ov)
{
auto dirty_it = dirty_db.find(ov);
if (dirty_it == dirty_db.end())
{
auto & clean_db = clean_db_shard(ov.oid);
auto clean_it = clean_db.find(ov.oid);
if (clean_it == clean_db.end() || clean_it->second.version < ov.version)
{
// No such object version
printf("Error: %jx:%jx v%ju not found while stabilizing\n", ov.oid.inode, ov.oid.stripe, ov.version);
return -ENOENT;
}
else
{
// Already stable
return STAB_SPLIT_DONE;
}
}
else if (IS_STABLE(dirty_it->second.state))
{
// Already stable
return STAB_SPLIT_DONE;
}
while (true)
{
if (IS_IN_FLIGHT(dirty_it->second.state))
{
// Object write is still in progress. Wait until the write request completes
return STAB_SPLIT_WAIT;
}
else if (!IS_SYNCED(dirty_it->second.state))
{
// Object not synced yet - sync it
// In previous versions we returned EBUSY here and required
// the caller (OSD) to issue a global sync first. But a global sync
// waits for all writes in the queue including inflight writes. And
// inflight writes may themselves be blocked by unstable writes being
// still present in the journal and not flushed away from it.
// So we must sync specific objects here.
//
// Even more, we have to process "stabilize" request in parts. That is,
// we must stabilize all objects which are already synced. Otherwise
// they may block objects which are NOT synced yet.
return STAB_SPLIT_SYNC;
}
else if (IS_STABLE(dirty_it->second.state))
{
break;
}
// Check previous versions too
if (dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != ov.oid)
{
break;
}
}
return STAB_SPLIT_TODO;
});
if (r != 1)
{
return r;
}
// Check journal space
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, op->len, sizeof(journal_entry_stable), 0))
{
return 0;
}
// There is sufficient space. Check SQEs
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
int s = 0;
auto v = (obj_ver_id*)op->buf;
for (int i = 0; i < op->len; i++, v++)
{
if (!journal.entry_fits(sizeof(journal_entry_stable)) &&
journal.sector_info[journal.cur_sector].dirty)
{
prepare_journal_sector_write(journal.cur_sector, op);
s++;
}
journal_entry_stable *je = (journal_entry_stable*)
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
je->oid = v->oid;
je->version = v->version;
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
}
prepare_journal_sector_write(journal.cur_sector, op);
s++;
assert(s == space_check.sectors_to_write);
PRIV(op)->op_state = 1;
return 1;
}
int blockstore_impl_t::continue_stable(blockstore_op_t *op)
{
if (PRIV(op)->op_state == 2)
goto resume_2;
else if (PRIV(op)->op_state == 4)
goto resume_4;
else
return 1;
resume_2:
if (!disable_journal_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 3;
return 1;
}
resume_4:
// Mark dirty_db entries as stable, acknowledge op completion
obj_ver_id* v;
int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
{
// Mark all dirty_db entries up to op->version as stable
#ifdef BLOCKSTORE_DEBUG
printf("Stabilize %jx:%jx v%ju\n", v->oid.inode, v->oid.stripe, v->version);
#endif
mark_stable(*v);
}
// Acknowledge op
obj_ver_id *v = (obj_ver_id*)op->buf;
auto priv = PRIV(op);
if (priv->op_state == 1) goto resume_1;
else if (priv->op_state == 2) goto resume_2;
else if (priv->op_state == 3) goto resume_3;
else if (priv->op_state == 4) goto resume_4;
assert(!priv->op_state);
// Modify in-memory state and assign contiguous LSNs
priv->stab_pos = 0;
priv->lsn = priv->to_lsn = 0;
op->retval = 0;
while (priv->stab_pos < op->len)
{
uint32_t modified_block = 0;
uint64_t new_lsn = 0;
uint64_t new_to_lsn = 0;
int res = op->opcode == BS_OP_STABLE
? heap->post_stabilize(v[priv->stab_pos].oid, v[priv->stab_pos].version, &modified_block, &new_lsn, &new_to_lsn)
: heap->post_rollback(v[priv->stab_pos].oid, v[priv->stab_pos].version, &new_lsn, &modified_block);
if (res != 0)
{
assert(res == ENOENT || res == EBUSY);
op->retval = -res;
}
if (new_lsn)
{
assert(priv->lsn == 0 || priv->to_lsn == new_lsn-1);
if (!priv->lsn)
priv->lsn = new_lsn;
priv->to_lsn = op->opcode == BS_OP_STABLE ? new_to_lsn : new_lsn;
}
priv->stab_pos++;
}
// Submit metadata writes
priv->stab_pos = 0;
resume_1:
priv->op_state = 1;
while (priv->stab_pos < op->len)
{
uint32_t block_num = 0;
heap_object_t *obj = heap->read_entry(v[priv->stab_pos].oid, &block_num);
if (obj)
{
io_uring_sqe *sqe = get_sqe();
if (!sqe)
{
if (priv->pending_ops > 0)
return 1;
priv->wait_detail = 1;
priv->wait_for = WAIT_SQE;
return 0;
}
prepare_meta_block_write(op, block_num, sqe);
}
priv->stab_pos++;
}
if (priv->pending_ops > 0)
{
priv->op_state = 1;
return 1;
}
// Mark writes as completed to allow compaction
for (uint64_t lsn = priv->lsn; lsn <= priv->to_lsn; lsn++)
{
heap->mark_lsn_completed(lsn);
}
unsynced_meta_write_count++;
// Fsync, just because our semantics imply that commit (stabilize) is immediately fsynced
priv->op_state = 2;
resume_2:
resume_3:
resume_4:
int res = do_sync(op, 2);
if (res != 2)
{
return res;
}
// Done. Don't touch op->retval - if anything resulted in ENOENT, return it as is
FINISH_OP(op);
return 2;
}
void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty)
{
auto dirty_it = dirty_db.find(v);
if (dirty_it != dirty_db.end())
{
if (IS_INSTANT(dirty_it->second.state))
{
// 'Instant' (non-EC) operations may complete and try to become stable out of order. Prevent it.
auto back_it = dirty_it;
while (back_it != dirty_db.begin())
{
back_it--;
if (back_it->first.oid != v.oid)
{
break;
}
if (!IS_STABLE(back_it->second.state))
{
// There are preceding unstable versions, can't flush <v>
return;
}
}
while (true)
{
dirty_it++;
if (dirty_it == dirty_db.end() || dirty_it->first.oid != v.oid ||
!IS_SYNCED(dirty_it->second.state))
{
dirty_it--;
break;
}
v.version = dirty_it->first.version;
}
}
while (1)
{
bool was_stable = IS_STABLE(dirty_it->second.state);
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_SYNCED)
{
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_STABLE;
// Allocations and deletions are counted when they're stabilized
if (IS_BIG_WRITE(dirty_it->second.state))
{
int exists = -1;
if (dirty_it != dirty_db.begin())
{
auto prev_it = dirty_it;
prev_it--;
if (prev_it->first.oid == v.oid)
{
exists = IS_DELETE(prev_it->second.state) ? 0 : 1;
}
}
if (exists == -1)
{
auto & clean_db = clean_db_shard(v.oid);
auto clean_it = clean_db.find(v.oid);
exists = clean_it != clean_db.end() ? 1 : 0;
}
if (!exists)
{
uint64_t space_id = dirty_it->first.oid.inode;
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
inode_space_stats[space_id] += dsk.data_block_size;
used_blocks++;
}
big_to_flush++;
}
else if (IS_DELETE(dirty_it->second.state))
{
uint64_t space_id = dirty_it->first.oid.inode;
if (no_inode_stats[dirty_it->first.oid.inode >> (64-POOL_ID_BITS)])
space_id = space_id & ~(((uint64_t)1 << (64-POOL_ID_BITS)) - 1);
auto & sp = inode_space_stats[space_id];
if (sp > dsk.data_block_size)
sp -= dsk.data_block_size;
else
inode_space_stats.erase(space_id);
used_blocks--;
big_to_flush++;
}
}
else if (IS_IN_FLIGHT(dirty_it->second.state))
{
// mark_stable should never be called for in-flight or submitted writes
printf(
"BUG: Attempt to mark_stable object %jx:%jx v%ju state of which is %x\n",
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version,
dirty_it->second.state
);
exit(1);
}
if (forget_dirty && (IS_BIG_WRITE(dirty_it->second.state) ||
IS_DELETE(dirty_it->second.state)))
{
// Big write overrides all previous dirty entries
auto erase_end = dirty_it;
while (dirty_it != dirty_db.begin())
{
dirty_it--;
if (dirty_it->first.oid != v.oid)
{
dirty_it++;
break;
}
}
auto & clean_db = clean_db_shard(v.oid);
auto clean_it = clean_db.find(v.oid);
uint64_t clean_loc = clean_it != clean_db.end()
? clean_it->second.location : UINT64_MAX;
erase_dirty(dirty_it, erase_end, clean_loc);
break;
}
if (was_stable || dirty_it == dirty_db.begin())
{
break;
}
dirty_it--;
if (dirty_it->first.oid != v.oid)
{
break;
}
}
flusher->enqueue_flush(v);
}
auto unstab_it = unstable_writes.find(v.oid);
if (unstab_it != unstable_writes.end() &&
unstab_it->second <= v.version)
{
unstable_writes.erase(unstab_it);
}
}

View File

@@ -3,231 +3,112 @@
#include "blockstore_impl.h"
#define SYNC_HAS_SMALL 1
#define SYNC_HAS_BIG 2
#define SYNC_DATA_SYNC_SENT 3
#define SYNC_DATA_SYNC_DONE 4
#define SYNC_JOURNAL_WRITE_SENT 5
#define SYNC_JOURNAL_WRITE_DONE 6
#define SYNC_JOURNAL_SYNC_SENT 7
#define SYNC_DONE 8
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
{
if (immediate_commit == IMMEDIATE_ALL)
if (!PRIV(op)->op_state)
{
// We can return immediately because sync is only dequeued after all previous writes
op->retval = 0;
}
int res = do_sync(op, 0);
if (res == 2)
{
FINISH_OP(op);
return 2;
}
if (PRIV(op)->op_state == 0)
{
stop_sync_submitted = false;
unsynced_big_write_count -= unsynced_big_writes.size();
PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
unsynced_big_writes.clear();
unsynced_small_writes.clear();
if (PRIV(op)->sync_big_writes.size() > 0)
PRIV(op)->op_state = SYNC_HAS_BIG;
else if (PRIV(op)->sync_small_writes.size() > 0)
PRIV(op)->op_state = SYNC_HAS_SMALL;
else
PRIV(op)->op_state = SYNC_DONE;
}
if (PRIV(op)->op_state == SYNC_HAS_SMALL)
{
// No big writes, just fsync the journal
if (journal.sector_info[journal.cur_sector].dirty)
{
// Write out the last journal sector if it happens to be dirty
BS_SUBMIT_CHECK_SQES(1);
prepare_journal_sector_write(journal.cur_sector, op);
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
return 1;
}
else
{
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
}
}
if (PRIV(op)->op_state == SYNC_HAS_BIG)
{
// 1st step: fsync data
if (!disable_data_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, dsk.data_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
return 1;
}
else
{
PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
}
}
if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
{
// 2nd step: Data device is synced, prepare & write journal entries
// Check space in the journal and journal memory buffers
blockstore_journal_check_t space_check(this);
if (dsk.csum_block_size)
{
// More complex check because all journal entries have different lengths
int left = PRIV(op)->sync_big_writes.size();
for (auto & sbw: PRIV(op)->sync_big_writes)
{
left--;
auto & dirty_entry = dirty_db.at(sbw);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, 0))
{
return 0;
}
}
}
else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0))
{
return 0;
}
// Check SQEs. Don't bother about merging, submit each journal sector as a separate request
BS_SUBMIT_CHECK_SQES(space_check.sectors_to_write);
// Prepare and submit journal entries
auto it = PRIV(op)->sync_big_writes.begin();
int s = 0;
while (it != PRIV(op)->sync_big_writes.end())
{
auto & dirty_entry = dirty_db.at(*it);
uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len);
if (!journal.entry_fits(sizeof(journal_entry_big_write) + dyn_size) &&
journal.sector_info[journal.cur_sector].dirty)
{
prepare_journal_sector_write(journal.cur_sector, op);
s++;
}
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
sizeof(journal_entry_big_write) + dyn_size
);
auto jsec = dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
assert(journal.next_free >= journal.used_start
? (jsec >= journal.used_start && jsec < journal.next_free)
: (jsec >= journal.used_start || jsec < journal.next_free));
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
printf(
"journal offset %08jx is used by %jx:%jx v%ju (%ju refs)\n",
dirty_entry.journal_sector, it->oid.inode, it->oid.stripe, it->version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
);
#endif
je->oid = it->oid;
je->version = it->version;
je->offset = dirty_entry.offset;
je->len = dirty_entry.len;
je->location = dirty_entry.location;
memcpy((void*)(je+1), (alloc_dyn_data
? (uint8_t*)dirty_entry.dyn_data+sizeof(int) : (uint8_t*)&dirty_entry.dyn_data), dyn_size);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
it++;
}
prepare_journal_sector_write(journal.cur_sector, op);
s++;
assert(s == space_check.sectors_to_write);
PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
return 1;
}
if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_DONE)
{
if (!disable_journal_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
return 1;
}
else
{
PRIV(op)->op_state = SYNC_DONE;
}
}
if (PRIV(op)->op_state == SYNC_DONE)
{
ack_sync(op);
return 2;
}
return 1;
return res;
}
void blockstore_impl_t::ack_sync(blockstore_op_t *op)
bool blockstore_impl_t::submit_fsyncs(int & wait_count)
{
// Handle states
for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
int n = ((unsynced_small_write_count > 0 || unsynced_big_write_count > 0 || unsynced_meta_write_count > 0) && !dsk.disable_meta_fsync) +
(unsynced_small_write_count > 0 && !dsk.disable_journal_fsync && dsk.journal_fd != dsk.meta_fd) +
(unsynced_big_write_count > 0 && !dsk.disable_data_fsync && dsk.data_fd != dsk.meta_fd && dsk.data_fd != dsk.journal_fd);
if (ringloop->space_left() < n)
{
#ifdef BLOCKSTORE_DEBUG
printf("Ack sync big %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
#endif
auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab;
auto dirty_it = dirty_db.find(*it);
dirty_it->second.state = ((dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED);
if (dirty_it->second.state & BS_ST_INSTANT)
{
mark_stable(dirty_it->first);
}
else
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == it->oid)
{
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
{
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_IN_FLIGHT;
}
dirty_it++;
}
return false;
}
for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
if (!n)
{
#ifdef BLOCKSTORE_DEBUG
printf("Ack sync small %jx:%jx v%ju\n", it->oid.inode, it->oid.stripe, it->version);
#endif
auto & unstab = unstable_writes[it->oid];
unstab = unstab < it->version ? it->version : unstab;
if (dirty_db[*it].state == (BS_ST_DELETE | BS_ST_WRITTEN))
{
dirty_db[*it].state = (BS_ST_DELETE | BS_ST_SYNCED);
// Deletions are treated as immediately stable
mark_stable(*it);
}
else /* (BS_ST_INSTANT?) | BS_ST_SMALL_WRITE | BS_ST_WRITTEN */
{
dirty_db[*it].state = (dirty_db[*it].state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SYNCED;
if (dirty_db[*it].state & BS_ST_INSTANT)
{
mark_stable(*it);
}
else
{
unstable_unsynced--;
assert(unstable_unsynced >= 0);
}
}
return true;
}
op->retval = 0;
FINISH_OP(op);
auto cb = [this, & wait_count](ring_data_t *data)
{
if (data->res != 0)
disk_error_abort("sync meta", data->res, 0);
wait_count--;
assert(wait_count >= 0);
if (!wait_count)
ringloop->wakeup();
};
if ((unsynced_small_write_count > 0 || unsynced_big_write_count > 0 || unsynced_meta_write_count > 0) && !dsk.disable_meta_fsync)
{
// fsync meta
io_uring_sqe *sqe = get_sqe();
assert(sqe);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
io_uring_prep_fsync(sqe, dsk.meta_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = cb;
wait_count++;
}
if (unsynced_small_write_count > 0 && !dsk.disable_journal_fsync && dsk.meta_fd != dsk.journal_fd)
{
// fsync buffer
io_uring_sqe *sqe = get_sqe();
assert(sqe);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
io_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = cb;
wait_count++;
}
if (unsynced_big_write_count > 0 && !dsk.disable_data_fsync && dsk.data_fd != dsk.meta_fd && dsk.data_fd != dsk.journal_fd)
{
// fsync data
io_uring_sqe *sqe = get_sqe();
assert(sqe);
ring_data_t *data = ((ring_data_t*)sqe->user_data);
io_uring_prep_fsync(sqe, dsk.data_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = cb;
wait_count++;
}
unsynced_big_write_count = 0;
unsynced_small_write_count = 0;
unsynced_meta_write_count = 0;
return true;
}
int blockstore_impl_t::do_sync(blockstore_op_t *op, int base_state)
{
int op_state = PRIV(op)->op_state - base_state;
if (op_state == 1) goto resume_1;
if (op_state == 2) goto resume_2;
assert(!op_state);
if (flusher->get_syncing_buffer())
{
// Wait for flusher-initiated sync
return 0;
}
if (dsk.disable_journal_fsync && dsk.disable_meta_fsync && dsk.disable_data_fsync || !unsynced_big_write_count && !unsynced_small_write_count)
{
// We can return immediately because sync only syncs previous writes
unsynced_big_write_count = unsynced_small_write_count = unsynced_meta_write_count = 0;
return 2;
}
PRIV(op)->lsn = heap->get_completed_lsn();
if (!submit_fsyncs(PRIV(op)->pending_ops))
{
PRIV(op)->wait_detail = 1;
PRIV(op)->wait_for = WAIT_SQE;
return 0;
}
resume_1:
if (PRIV(op)->pending_ops > 0)
{
PRIV(op)->op_state = base_state+1;
return 1;
}
resume_2:
heap->mark_lsn_fsynced(PRIV(op)->lsn);
return 2;
}

File diff suppressed because it is too large Load Diff

View File

@@ -5,14 +5,14 @@
//
// Initialize storage for tests:
//
// dd if=/dev/zero of=test_data.bin bs=1024 count=1048576
// dd if=/dev/zero of=test_meta.bin bs=1024 count=256
// dd if=/dev/zero of=test_journal.bin bs=1024 count=4096
// dd if=/dev/zero of=test_data.bin bs=1M count=1024
//
// Random write:
//
// fio -thread -ioengine=./libfio_blockstore.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
// -bs_config='{"data_device":"./test_data.bin"}' -size=1000M
// [LD_PRELOAD=libasan.so.8] \
// fio -name=test -thread -ioengine=../build/src/blockstore/libfio_vitastor_blk.so \
// -bs=4k -direct=1 -rw=randwrite -iodepth=16 -size=900M -loops=10 \
// -bs_config='{"data_device":"./test_data.bin","meta_offset":0,"journal_offset":16777216,"data_offset":33554432,"disable_data_fsync":true,"meta_format":3,"immediate_commit":"all","log_level":100,"journal_no_same_sector_overwrites":true,"journal_sector_buffer_count":1024}'
//
// Linear write:
//
@@ -38,12 +38,14 @@ struct bs_data
std::vector<io_u*> completed;
int op_n = 0, inflight = 0;
bool last_sync = false;
bool trace = false;
};
struct bs_options
{
int __pad;
char *json_config = NULL;
int trace = 0;
};
static struct fio_option options[] = {
@@ -56,6 +58,16 @@ static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = "bs_trace",
.lname = "trace",
.type = FIO_OPT_BOOL,
.off1 = offsetof(struct bs_options, trace),
.help = "Trace operations",
.def = "0",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_FILENAME,
},
{
.name = NULL,
},
@@ -63,6 +75,7 @@ static struct fio_option options[] = {
static int bs_setup(struct thread_data *td)
{
bs_options *o = (bs_options*)td->eo;
bs_data *bsd;
//fio_file *f;
//int r;
@@ -83,6 +96,8 @@ static int bs_setup(struct thread_data *td)
td->o.open_files++;
}
bsd->trace = o->trace ? true : false;
//f = td->files[0];
//f->real_file_size = size;
return 0;
@@ -168,7 +183,7 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
{
case DDIR_READ:
op->opcode = BS_OP_READ;
op->buf = io->xfer_buf;
op->buf = (uint8_t*)io->xfer_buf;
op->oid = {
.inode = 1,
.stripe = io->offset / bsd->bs->get_block_size(),
@@ -176,21 +191,20 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
op->version = UINT64_MAX; // last unstable
op->offset = io->offset % bsd->bs->get_block_size();
op->len = io->xfer_buflen;
op->callback = [io](blockstore_op_t *op)
op->callback = [io, n = bsd->op_n](blockstore_op_t *op)
{
io->error = op->retval < 0 ? -op->retval : 0;
bs_data *bsd = (bs_data*)io->engine_data;
bsd->inflight--;
bsd->completed.push_back(io);
#ifdef BLOCKSTORE_DEBUG
printf("--- OP_READ %llx n=%d retval=%d\n", io, n, op->retval);
#endif
if (bsd->trace)
printf("--- OP_READ %zx n=%d retval=%d\n", (size_t)op, n, op->retval);
delete op;
};
break;
case DDIR_WRITE:
op->opcode = BS_OP_WRITE;
op->buf = io->xfer_buf;
op->opcode = BS_OP_WRITE_STABLE;
op->buf = (uint8_t*)io->xfer_buf;
op->oid = {
.inode = 1,
.stripe = io->offset / bsd->bs->get_block_size(),
@@ -198,30 +212,28 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
op->version = 0; // assign automatically
op->offset = io->offset % bsd->bs->get_block_size();
op->len = io->xfer_buflen;
op->callback = [io](blockstore_op_t *op)
op->callback = [io, n = bsd->op_n](blockstore_op_t *op)
{
io->error = op->retval < 0 ? -op->retval : 0;
bs_data *bsd = (bs_data*)io->engine_data;
bsd->inflight--;
bsd->completed.push_back(io);
#ifdef BLOCKSTORE_DEBUG
printf("--- OP_WRITE %llx n=%d retval=%d\n", io, n, op->retval);
#endif
if (bsd->trace)
printf("--- OP_WRITE %zx n=%d retval=%d\n", (size_t)op, n, op->retval);
delete op;
};
bsd->last_sync = false;
break;
case DDIR_SYNC:
op->opcode = BS_OP_SYNC_STAB_ALL;
op->callback = [io](blockstore_op_t *op)
op->opcode = BS_OP_SYNC;
op->callback = [io, n = bsd->op_n](blockstore_op_t *op)
{
bs_data *bsd = (bs_data*)io->engine_data;
io->error = op->retval < 0 ? -op->retval : 0;
bsd->completed.push_back(io);
bsd->inflight--;
#ifdef BLOCKSTORE_DEBUG
printf("--- OP_SYNC %llx n=%d retval=%d\n", io, n, op->retval);
#endif
if (bsd->trace)
printf("--- OP_SYNC %zx n=%d retval=%d\n", (size_t)op, n, op->retval);
delete op;
};
bsd->last_sync = true;
@@ -232,9 +244,8 @@ static enum fio_q_status bs_queue(struct thread_data *td, struct io_u *io)
return FIO_Q_COMPLETED;
}
#ifdef BLOCKSTORE_DEBUG
printf("+++ %s %llx n=%d\n", op->opcode == OP_READ ? "OP_READ" : (op->opcode == OP_WRITE ? "OP_WRITE" : "OP_SYNC"), io, n);
#endif
if (bsd->trace)
printf("+++ %s %zx n=%d\n", op->opcode == BS_OP_READ ? "OP_READ" : (op->opcode == BS_OP_WRITE_STABLE ? "OP_WRITE" : "OP_SYNC"), (size_t)op, bsd->op_n);
io->error = 0;
bsd->inflight++;
bsd->bs->enqueue_op(op);
@@ -290,7 +301,7 @@ static int bs_invalidate(struct thread_data *td, struct fio_file *f)
return 0;
}
struct ioengine_ops ioengine = {
struct ioengine_ops __attribute__((visibility("default"))) ioengine = {
.name = "vitastor_blockstore",
.version = FIO_IOOPS_VERSION,
.flags = FIO_MEMALIGN | FIO_DISKLESSIO | FIO_NOEXTEND,

View File

@@ -1,12 +1,11 @@
// Old metadata format on-disk structures
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include "crc32c.h"
#include <set>
#define MIN_JOURNAL_SIZE 4*1024*1024
#define JOURNAL_MAGIC 0x4A33
#define JOURNAL_VERSION_V1 1
#define JOURNAL_VERSION_V2 2
@@ -145,77 +144,14 @@ inline uint32_t je_crc32(journal_entry *je)
return crc32c(0x48674bc7, ((uint8_t*)je)+4, je->size-4);
}
struct journal_sector_info_t
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
// per "clean" entry on disk with fixed metadata tables
struct __attribute__((__packed__)) clean_disk_entry
{
uint64_t offset;
uint64_t flush_count;
bool written;
bool dirty;
uint64_t submit_id;
object_id oid;
uint64_t version;
uint8_t bitmap[];
// Two more fields come after bitmap in metadata version 2:
// uint32_t data_csum[];
// uint32_t entry_csum;
};
struct pending_journaling_t
{
int pending;
int sector;
blockstore_op_t *op;
};
struct journal_t
{
int fd;
bool inmemory = false;
bool flush_journal = false;
void *buffer = NULL;
uint64_t block_size;
uint64_t offset, len;
// Next free block offset
uint64_t next_free = 0;
// First occupied block offset
uint64_t used_start = 0;
// End of the last block not used for writing anymore
uint64_t dirty_start = 0;
uint32_t crc32_last = 0;
// Current sector(s) used for writing
void *sector_buf = NULL;
journal_sector_info_t *sector_info = NULL;
uint64_t sector_count;
bool no_same_sector_overwrites = false;
int cur_sector = 0;
int in_sector_pos = 0;
std::vector<int> submitting_sectors;
std::multimap<uint64_t, pending_journaling_t> flushing_ops;
uint64_t submit_id = 0;
// Used sector map
// May use ~ 80 MB per 1 GB of used journal space in the worst case
std::map<uint64_t, uint64_t> used_sectors;
~journal_t();
bool trim();
uint64_t get_trim_pos();
void dump_diagnostics();
inline bool entry_fits(int size)
{
return !(block_size - in_sector_pos < size ||
no_same_sector_overwrites && sector_info[cur_sector].written);
}
};
struct blockstore_journal_check_t
{
blockstore_impl_t *bs;
uint64_t next_pos, next_sector, next_in_pos;
int sectors_to_write, first_sector;
bool right_dir; // writing to the end or the beginning of the ring buffer
blockstore_journal_check_t(blockstore_impl_t *bs);
int check_available(blockstore_op_t *op, int required, int size, int data_after);
};
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
uint32_t crc32c_pad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);
uint32_t crc32c_nopad(uint32_t prev_crc, const void *buf, size_t len, size_t left_pad, size_t right_pad);

View File

@@ -0,0 +1,338 @@
// Variable-length O(1) disk space allocator
// Copyright (c) Vitaliy Filippov, 2025+
// License: VNPL-1.1 (see README.md for details)
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <set>
#include "multilist.h"
multilist_alloc_t::multilist_alloc_t(uint32_t count, uint32_t maxn):
count(count), maxn(maxn)
{
// not-so-memory-efficient: 16 MB memory per 1 GB buffer space, but buffer spaces are small, so OK
assert(count > 1 && count < 0x80000000);
sizes.resize(count);
nexts.resize(count); // nexts[i] = 0 -> area is used; nexts[i] = 1 -> no next; nexts[i] >= 2 -> next item
prevs.resize(count);
heads.resize(maxn); // heads[i] = 0 -> empty list; heads[i] >= 1 -> list head
sizes[0] = count;
sizes[count-1] = -count; // end
nexts[0] = 1;
heads[maxn-1] = 1;
#ifdef MULTILIST_TRACE
print();
#endif
}
bool multilist_alloc_t::is_free(uint32_t pos)
{
assert(pos < count);
if (sizes[pos] < 0)
pos += sizes[pos]+1;
while (pos > 0 && !sizes[pos])
pos--;
return nexts[pos] > 0;
}
uint32_t multilist_alloc_t::find(uint32_t size)
{
assert(size > 0);
assert(size <= maxn);
for (uint32_t i = size-1; i < maxn; i++)
{
if (heads[i])
{
return heads[i]-1;
}
}
return UINT32_MAX;
}
void multilist_alloc_t::verify()
{
std::set<uint32_t> reachable;
for (int i = 0; i < maxn; i++)
{
uint32_t cur = heads[i];
while (cur)
{
if (!nexts[cur-1])
{
fprintf(stderr, "ERROR: item %d from freelist %d is not free\n", cur-1, i);
print();
abort();
}
if (nexts[cur-1] >= count+2)
{
fprintf(stderr, "ERROR: next out of range at %d: %d\n", cur-1, nexts[cur-1]);
print();
abort();
}
if (!(i < maxn-1 ? sizes[cur-1] == i+1 : (sizes[cur-1] >= i+1)))
{
fprintf(stderr, "ERROR: item %d is in wrong freelist: expected size %d, but actual size is %d\n", cur-1, i+1, sizes[cur-1]);
print();
abort();
}
if (reachable.find(cur-1) != reachable.end())
{
fprintf(stderr, "ERROR: doubly-claimed item %d\n", cur-1);
print();
abort();
}
reachable.insert(cur-1);
cur = nexts[cur-1]-1;
}
}
for (int i = 0; i < count; )
{
if (sizes[i])
{
assert(i+sizes[i] <= count);
if (sizes[i] > 1 && sizes[i+sizes[i]-1] != -sizes[i])
{
fprintf(stderr, "ERROR: start/end mismatch at %d: sizes[%d] should be %d, but is %d\n", i, i+sizes[i]-1, -sizes[i], sizes[i+sizes[i]-1]);
print();
abort();
}
for (int j = i+1; j < i+sizes[i]-1; j++)
{
if (sizes[j])
{
fprintf(stderr, "ERROR: internal non-zero at %d: %d\n", j, sizes[j]);
print();
abort();
}
}
if (nexts[i] && reachable.find(i) == reachable.end())
{
fprintf(stderr, "ERROR: %d is unreachable from heads\n", i);
print();
abort();
}
if (nexts[i] >= 2)
{
if (nexts[i] >= 2+count)
{
fprintf(stderr, "ERROR: next out of range at %d: %d\n", i, nexts[i]);
print();
abort();
}
if (prevs[nexts[i]-2] != i+1)
{
fprintf(stderr, "ERROR: prev[next] (%d) != this (%d) at %d", prevs[nexts[i]-2], i+1, i);
print();
abort();
}
}
i += (sizes[i] > 1 ? sizes[i] : 1);
}
else
i++;
}
}
void multilist_alloc_t::print()
{
printf("heads:");
for (int i = 0; i < maxn; i++)
if (heads[i])
printf(" %u=%u", i, heads[i]);
printf("\n");
printf("sizes:");
for (int i = 0; i < count; i++)
if (sizes[i])
printf(" %d=%d", i, sizes[i]);
printf("\n");
printf("prevs:");
for (int i = 0; i < count; i++)
if (prevs[i])
printf(" %d=%d", i, prevs[i]);
printf("\n");
printf("nexts:");
for (int i = 0; i < count; i++)
if (nexts[i])
printf(" %d=%d", i, nexts[i]);
printf("\n");
printf("items:");
for (int i = 0; i < count; )
{
if (sizes[i])
{
printf(" %u=(s:%d,n:%u,p:%u)", i, sizes[i], nexts[i], prevs[i]);
assert(i+sizes[i] <= count);
i += (sizes[i] > 1 ? sizes[i] : 1);
}
else
i++;
}
printf("\n");
}
void multilist_alloc_t::use(uint32_t pos, uint32_t size)
{
assert(pos+size <= count && size > 0);
if (sizes[pos] <= 0)
{
uint32_t start = pos;
if (sizes[start] < 0)
start += sizes[start]+1;
else
while (start > 0 && !sizes[start])
start--;
assert(sizes[start] >= size);
use_full(start);
uint32_t full = sizes[start];
sizes[pos-1] = -pos+start;
sizes[start] = pos-start;
free(start);
sizes[pos+size-1] = -size;
sizes[pos] = size;
if (pos+size < start+full)
{
sizes[start+full-1] = -(start+full-pos-size);
sizes[pos+size] = start+full-pos-size;
free(pos+size);
}
}
else
{
assert(sizes[pos] >= size);
use_full(pos);
if (sizes[pos] > size)
{
uint32_t full = sizes[pos];
sizes[pos+size-1] = -size;
sizes[pos] = size;
sizes[pos+full-1] = -full+size;
sizes[pos+size] = full-size;
free(pos+size);
}
}
#ifdef MULTILIST_TRACE
print();
#endif
}
void multilist_alloc_t::use_full(uint32_t pos)
{
uint32_t prevsize = sizes[pos];
assert(prevsize);
assert(nexts[pos]);
uint32_t pi = (prevsize < maxn ? prevsize : maxn)-1;
if (heads[pi] == pos+1)
heads[pi] = nexts[pos]-1;
if (prevs[pos])
nexts[prevs[pos]-1] = nexts[pos];
if (nexts[pos] >= 2)
prevs[nexts[pos]-2] = prevs[pos];
prevs[pos] = 0;
nexts[pos] = 0;
}
void multilist_alloc_t::free(uint32_t pos)
{
do_free(pos);
#ifdef MULTILIST_TRACE
print();
#endif
}
void multilist_alloc_t::do_free(uint32_t pos)
{
assert(!nexts[pos]);
uint32_t size = sizes[pos];
assert(size > 0);
// merge with previous?
if (pos > 0 && nexts[pos+(sizes[pos-1] == 1 ? -1 : sizes[pos-1])] > 0)
{
assert(sizes[pos-1] < 0 || sizes[pos-1] == 1);
uint32_t prevsize = sizes[pos-1] < 0 ? -sizes[pos-1] : 1;
use_full(pos-prevsize);
sizes[pos] = 0;
sizes[pos-1] = 0;
size += prevsize;
pos -= prevsize;
sizes[pos+size-1] = -size;
sizes[pos] = size;
}
// merge with next?
if (pos+size < count && nexts[pos+size] >= 1)
{
uint32_t nextsize = sizes[pos+size];
use_full(pos+size);
sizes[pos+size] = 0;
sizes[pos+size-1] = 0;
size += nextsize;
sizes[pos+size-1] = -size;
sizes[pos] = size;
}
uint32_t ni = (size < maxn ? size : maxn)-1;
nexts[pos] = heads[ni]+1;
prevs[pos] = 0;
if (heads[ni])
prevs[heads[ni]-1] = pos+1;
heads[ni] = pos+1;
}
multilist_index_t::multilist_index_t(uint32_t count, uint32_t max_used, uint32_t init_used):
count(count), max_used(max_used)
{
assert(init_used < max_used);
nexts.resize(count, UINT32_MAX);
prevs.resize(count, UINT32_MAX);
heads.resize(max_used, UINT32_MAX);
for (size_t i = 0; i < count-1; i++)
{
nexts[i] = i+1;
prevs[i+1] = i;
}
prevs[0] = UINT32_MAX;
nexts[count-1] = UINT32_MAX;
heads[init_used] = 0;
}
uint32_t multilist_index_t::find(uint32_t wanted_used)
{
assert(wanted_used < max_used);
return heads[wanted_used];
}
void multilist_index_t::change(uint32_t pos, uint32_t old_used, uint32_t new_used)
{
if (new_used == old_used)
return;
assert(old_used < max_used && new_used < max_used);
if (prevs[pos] != UINT32_MAX)
nexts[prevs[pos]] = nexts[pos];
if (nexts[pos] != UINT32_MAX)
prevs[nexts[pos]] = prevs[pos];
if (heads[old_used] == pos)
heads[old_used] = nexts[pos];
prevs[pos] = UINT32_MAX;
if (heads[new_used] != UINT32_MAX)
prevs[heads[new_used]] = pos;
nexts[pos] = heads[new_used];
heads[new_used] = pos;
}
void multilist_index_t::print()
{
printf("heads:");
for (int i = 0; i < max_used; i++)
if (heads[i] != UINT32_MAX)
printf(" %u=%u", i, heads[i]);
printf("\n");
printf("prevs:");
for (int i = 0; i < count; i++)
if (prevs[i] != UINT32_MAX)
printf(" %d=%d", i, prevs[i]);
printf("\n");
printf("nexts:");
for (int i = 0; i < count; i++)
if (nexts[i] != UINT32_MAX)
printf(" %d=%d", i, nexts[i]);
printf("\n");
}

View File

@@ -0,0 +1,37 @@
// Variable-length O(1) disk space allocator
// Copyright (c) Vitaliy Filippov, 2025+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include <stdint.h>
#include <vector>
struct multilist_alloc_t
{
const uint32_t count, maxn;
std::vector<int32_t> sizes;
std::vector<uint32_t> nexts, prevs, heads;
multilist_alloc_t(uint32_t count, uint32_t maxn);
bool is_free(uint32_t pos);
uint32_t find(uint32_t size);
void use_full(uint32_t pos);
void use(uint32_t pos, uint32_t size);
void do_free(uint32_t pos);
void free(uint32_t pos);
void verify();
void print();
};
struct multilist_index_t
{
const uint32_t count, max_used;
std::vector<uint32_t> nexts, prevs, heads;
// used should be always < max_used
multilist_index_t(uint32_t count, uint32_t max_used, uint32_t init_used);
uint32_t find(uint32_t wanted_used);
void change(uint32_t pos, uint32_t old_used, uint32_t new_used);
void print();
};

Some files were not shown because too many files have changed in this diff Show More