Compare commits

..

21 Commits

Author SHA1 Message Date
88c9f9bd6d Test etcd schizophrenia
Some checks failed
Test / test_write_xor (push) Successful in 36s
Test / test_write_iothreads (push) Successful in 38s
Test / test_write_no_same (push) Successful in 9s
Test / test_rebalance_verify_ec_imm (push) Successful in 1m41s
Test / test_heal_pg_size_2 (push) Successful in 2m19s
Test / test_heal_local_read (push) Successful in 2m18s
Test / test_heal_ec (push) Successful in 2m20s
Test / test_etcd_fail (push) Failing after 10m5s
Test / test_heal_antietcd (push) Successful in 2m18s
Test / test_heal_csum_32k_dmj (push) Successful in 2m19s
Test / test_heal_csum_32k_dj (push) Successful in 2m19s
Test / test_heal_csum_32k (push) Successful in 2m19s
Test / test_resize (push) Successful in 14s
Test / test_resize_auto (push) Successful in 9s
Test / test_heal_csum_4k_dmj (push) Successful in 2m19s
Test / test_heal_csum_4k_dj (push) Successful in 2m19s
Test / test_heal_csum_4k (push) Successful in 2m19s
Test / test_osd_tags (push) Successful in 9s
Test / test_snapshot_pool2 (push) Successful in 15s
Test / test_enospc (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 12s
Test / test_enospc_xor (push) Successful in 14s
Test / test_enospc_imm_xor (push) Successful in 13s
Test / test_scrub (push) Successful in 15s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 14s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 17s
Test / test_nfs (push) Successful in 12s
Test / test_scrub_ec (push) Successful in 17s
2025-07-02 18:38:04 +03:00
f515fcce62 Fix volume_size_info in PVE VitastorPlugin 2025-06-18 12:38:09 +03:00
97bb809b54 Release 2.2.2
Some checks reported warnings
Test / test_osd_tags (push) Blocked by required conditions
Test / test_enospc (push) Blocked by required conditions
Test / test_enospc_xor (push) Blocked by required conditions
Test / test_enospc_imm (push) Blocked by required conditions
Test / test_enospc_imm_xor (push) Blocked by required conditions
Test / test_scrub (push) Blocked by required conditions
Test / test_scrub_zero_osd_2 (push) Blocked by required conditions
Test / test_scrub_xor (push) Blocked by required conditions
Test / test_scrub_pg_size_3 (push) Blocked by required conditions
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Blocked by required conditions
Test / test_scrub_ec (push) Blocked by required conditions
Test / test_nfs (push) Blocked by required conditions
Test / buildenv (push) Successful in 10s
Test / make_test (push) Has been cancelled
Test / npm_lint (push) Has been cancelled
Test / test_add_osd (push) Has been cancelled
Test / test_cas (push) Has been cancelled
Test / build (push) Has been cancelled
Test / test_change_pg_count (push) Has been cancelled
Test / test_change_pg_count_ec (push) Has been cancelled
Test / test_change_pg_size (push) Has been cancelled
Test / test_create_nomaxid (push) Has been cancelled
Test / test_etcd_fail (push) Has been cancelled
Test / test_etcd_fail_antietcd (push) Has been cancelled
Test / test_interrupted_rebalance (push) Has been cancelled
Test / test_interrupted_rebalance_imm (push) Has been cancelled
Test / test_interrupted_rebalance_ec (push) Has been cancelled
Test / test_interrupted_rebalance_ec_imm (push) Has been cancelled
Test / test_create_halfhost (push) Has been cancelled
Test / test_failure_domain (push) Has been cancelled
- Fix a bug introduced in 2.2.0 - pg_locks weren't correctly disabled for pools without
  local_reads, which could lead to inactive pools during various operations
- Fix an old bug where OSDs could send sub-operations to incorrect peer OSDs when their
  connections were stopped and reestablished quickly, in 2.2.0 it was usually leading
  to "sequencing broken" messages in OSD logs
- Fix debug use_sync_send_recv mode
2025-06-07 12:56:48 +03:00
6022a61329 Decouple break_pg_locks from outbound OSD disconnections
All checks were successful
Test / test_switch_primary (push) Successful in 34s
Test / test_write (push) Successful in 33s
Test / test_write_xor (push) Successful in 36s
Test / test_write_no_same (push) Successful in 7s
Test / test_write_iothreads (push) Successful in 41s
Test / test_heal_pg_size_2 (push) Successful in 2m17s
Test / test_heal_local_read (push) Successful in 2m20s
Test / test_heal_ec (push) Successful in 2m20s
Test / test_heal_antietcd (push) Successful in 2m17s
Test / test_heal_csum_32k_dmj (push) Successful in 2m21s
Test / test_heal_csum_32k_dj (push) Successful in 2m21s
Test / test_heal_csum_32k (push) Successful in 2m27s
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s
Test / test_resize (push) Successful in 13s
Test / test_resize_auto (push) Successful in 8s
Test / test_osd_tags (push) Successful in 7s
Test / test_snapshot_pool2 (push) Successful in 14s
Test / test_enospc (push) Successful in 11s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 11s
Test / test_enospc_imm_xor (push) Successful in 15s
Test / test_scrub (push) Successful in 14s
Test / test_scrub_zero_osd_2 (push) Successful in 13s
Test / test_scrub_xor (push) Successful in 14s
Test / test_scrub_pg_size_3 (push) Successful in 17s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s
Test / test_scrub_ec (push) Successful in 16s
Test / test_nfs (push) Successful in 13s
Test / test_heal_csum_4k_dj (push) Successful in 2m20s
Test / test_heal_csum_4k (push) Successful in 2m21s
2025-06-05 02:48:54 +03:00
a3c1996101 Do not accidentally clear incorrect osd_peer_fds entries
Some checks failed
Test / test_switch_primary (push) Successful in 33s
Test / test_write (push) Successful in 32s
Test / test_write_xor (push) Successful in 34s
Test / test_write_no_same (push) Successful in 8s
Test / test_write_iothreads (push) Successful in 39s
Test / test_heal_pg_size_2 (push) Successful in 2m18s
Test / test_heal_ec (push) Successful in 2m20s
Test / test_heal_antietcd (push) Successful in 2m18s
Test / test_heal_csum_32k_dmj (push) Successful in 2m21s
Test / test_heal_csum_32k_dj (push) Successful in 2m18s
Test / test_heal_csum_32k (push) Successful in 2m21s
Test / test_heal_csum_4k_dmj (push) Successful in 2m25s
Test / test_heal_csum_4k_dj (push) Successful in 2m22s
Test / test_resize_auto (push) Successful in 9s
Test / test_resize (push) Successful in 13s
Test / test_heal_csum_4k (push) Successful in 2m18s
Test / test_osd_tags (push) Successful in 8s
Test / test_snapshot_pool2 (push) Successful in 16s
Test / test_enospc (push) Successful in 12s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 10s
Test / test_enospc_imm_xor (push) Successful in 13s
Test / test_scrub (push) Successful in 13s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 15s
Test / test_scrub_pg_size_3 (push) Successful in 15s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 18s
Test / test_nfs (push) Successful in 11s
Test / test_scrub_ec (push) Successful in 16s
Test / test_heal_local_read (push) Failing after 10m10s
2025-06-05 02:22:13 +03:00
8d2a1f0297 Fix PG lock auto-enabling/auto-disabling in the default configuration 2025-06-05 02:22:01 +03:00
91cbc313c2 Change "on osd -123" logging to "on peer 123" for unknown connections 2025-06-05 02:22:01 +03:00
f0a025428e Postpone read/write handlers using timerfd in the debug use_sync_send_recv mode 2025-06-05 02:22:01 +03:00
67071158bd Cancel outbound operations only in the osd_client_t destructor
This is required to prevent disconnected peers from sometimes receiving messages
suited for other peers - stop_client was freeing the operations even though they
were still referenced by the io_uring requests in progress. This was leading to
OSDs sometimes receiving garbage and "broken sequencing" errors in logs as the
memory was usually already reallocated for other operations
2025-06-05 02:09:41 +03:00
cd028612c8 Use a separate osd_client_t::in_osd_num for inbound OSD connections 2025-06-05 02:09:41 +03:00
f390e73dae Log broken sequence numbers in "sequencing" errors 2025-06-05 02:09:41 +03:00
de2539c491 Correct Proxmox version 2025-06-03 01:56:09 +03:00
957a4fce7e Release 2.2.1
All checks were successful
Test / test_rebalance_verify_ec_imm (push) Successful in 1m47s
Test / test_write_no_same (push) Successful in 9s
Test / test_write_xor (push) Successful in 36s
Test / test_write_iothreads (push) Successful in 36s
Test / test_heal_pg_size_2 (push) Successful in 2m15s
Test / test_heal_local_read (push) Successful in 2m21s
Test / test_heal_ec (push) Successful in 2m16s
Test / test_heal_antietcd (push) Successful in 2m18s
Test / test_heal_csum_32k_dmj (push) Successful in 2m29s
Test / test_heal_csum_32k_dj (push) Successful in 2m24s
Test / test_heal_csum_32k (push) Successful in 2m29s
Test / test_heal_csum_4k_dmj (push) Successful in 2m27s
Test / test_resize_auto (push) Successful in 11s
Test / test_resize (push) Successful in 18s
Test / test_osd_tags (push) Successful in 10s
Test / test_snapshot_pool2 (push) Successful in 18s
Test / test_enospc (push) Successful in 12s
Test / test_enospc_xor (push) Successful in 14s
Test / test_enospc_imm (push) Successful in 11s
Test / test_enospc_imm_xor (push) Successful in 15s
Test / test_scrub (push) Successful in 12s
Test / test_scrub_zero_osd_2 (push) Successful in 14s
Test / test_scrub_xor (push) Successful in 16s
Test / test_scrub_pg_size_3 (push) Successful in 13s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s
Test / test_scrub_ec (push) Successful in 17s
Test / test_nfs (push) Successful in 11s
Test / test_heal_csum_4k_dj (push) Successful in 2m24s
Test / test_heal_csum_4k (push) Successful in 2m27s
Test / test_rebalance_verify (push) Successful in 1m51s
- Fix vitastor-disk purge broken after adding the "OSD is still running" check
- Fix iothreads hanging after adding zero-copy send support
- Fix enabling localized reads online (without restarting OSDs) in the default PG lock mode
2025-05-25 01:04:48 +03:00
f201ecdd51 Fix missing mutex unlock with zero-copy and iothreads O_o
All checks were successful
Test / test_switch_primary (push) Successful in 32s
Test / test_write (push) Successful in 33s
Test / test_write_xor (push) Successful in 36s
Test / test_write_no_same (push) Successful in 8s
Test / test_write_iothreads (push) Successful in 37s
Test / test_heal_pg_size_2 (push) Successful in 2m17s
Test / test_heal_local_read (push) Successful in 2m18s
Test / test_heal_ec (push) Successful in 2m18s
Test / test_heal_antietcd (push) Successful in 2m17s
Test / test_heal_csum_32k_dmj (push) Successful in 2m21s
Test / test_heal_csum_32k_dj (push) Successful in 2m26s
Test / test_heal_csum_32k (push) Successful in 2m30s
Test / test_heal_csum_4k_dmj (push) Successful in 2m18s
Test / test_resize_auto (push) Successful in 9s
Test / test_resize (push) Successful in 13s
Test / test_osd_tags (push) Successful in 8s
Test / test_snapshot_pool2 (push) Successful in 14s
Test / test_enospc (push) Successful in 12s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 11s
Test / test_enospc_imm_xor (push) Successful in 13s
Test / test_scrub (push) Successful in 14s
Test / test_scrub_zero_osd_2 (push) Successful in 15s
Test / test_scrub_xor (push) Successful in 16s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 16s
Test / test_scrub_ec (push) Successful in 15s
Test / test_nfs (push) Successful in 11s
Test / test_heal_csum_4k_dj (push) Successful in 2m19s
Test / test_heal_csum_4k (push) Successful in 2m19s
2025-05-24 00:56:31 +03:00
4afb617f59 Also zero-init sqe
Some checks failed
Test / test_rebalance_verify_ec_imm (push) Successful in 1m43s
Test / test_write_no_same (push) Successful in 8s
Test / test_write_xor (push) Successful in 36s
Test / test_heal_pg_size_2 (push) Successful in 2m17s
Test / test_heal_local_read (push) Successful in 2m17s
Test / test_heal_ec (push) Successful in 2m18s
Test / test_heal_antietcd (push) Successful in 2m17s
Test / test_heal_csum_32k_dmj (push) Successful in 2m25s
Test / test_heal_csum_32k_dj (push) Successful in 2m19s
Test / test_heal_csum_32k (push) Successful in 2m28s
Test / test_resize (push) Successful in 14s
Test / test_resize_auto (push) Successful in 8s
Test / test_snapshot_pool2 (push) Successful in 14s
Test / test_osd_tags (push) Successful in 7s
Test / test_enospc (push) Successful in 10s
Test / test_enospc_xor (push) Successful in 13s
Test / test_enospc_imm (push) Successful in 11s
Test / test_heal_csum_4k_dmj (push) Successful in 2m20s
Test / test_enospc_imm_xor (push) Successful in 13s
Test / test_heal_csum_4k_dj (push) Successful in 2m20s
Test / test_scrub (push) Successful in 13s
Test / test_scrub_zero_osd_2 (push) Successful in 15s
Test / test_heal_csum_4k (push) Successful in 2m19s
Test / test_scrub_xor (push) Successful in 15s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_nfs (push) Successful in 14s
Test / test_scrub_ec (push) Successful in 16s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 19s
Test / test_rebalance_verify (push) Successful in 1m55s
Test / test_write_iothreads (push) Failing after 3m5s
2025-05-23 21:18:37 +03:00
d3fde0569f Add a test with enabled iothreads
Some checks failed
Test / test_switch_primary (push) Successful in 34s
Test / test_write (push) Successful in 32s
Test / test_write_no_same (push) Successful in 8s
Test / test_write_xor (push) Successful in 34s
Test / test_heal_pg_size_2 (push) Successful in 2m16s
Test / test_heal_local_read (push) Successful in 2m18s
Test / test_heal_ec (push) Successful in 2m18s
Test / test_write_iothreads (push) Failing after 3m5s
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_resize (push) Has been cancelled
Test / test_resize_auto (push) Has been cancelled
Test / test_snapshot_pool2 (push) Has been cancelled
Test / test_osd_tags (push) Has been cancelled
Test / test_enospc (push) Has been cancelled
Test / test_enospc_xor (push) Has been cancelled
Test / test_enospc_imm (push) Has been cancelled
Test / test_enospc_imm_xor (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled
Test / test_scrub_ec (push) Has been cancelled
Test / test_nfs (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_antietcd (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
2025-05-23 21:05:18 +03:00
438b64f6c3 Allow to enable PG locks online when changing local_reads in pool configuration
Some checks reported warnings
Test / test_switch_primary (push) Has been cancelled
Test / test_write (push) Has been cancelled
Test / test_write_xor (push) Has been cancelled
Test / test_write_no_same (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_heal_local_read (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_antietcd (push) Has been cancelled
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_resize (push) Has been cancelled
Test / test_resize_auto (push) Has been cancelled
Test / test_snapshot_pool2 (push) Has been cancelled
Test / test_osd_tags (push) Has been cancelled
Test / test_enospc (push) Has been cancelled
Test / test_enospc_xor (push) Has been cancelled
Test / test_enospc_imm (push) Has been cancelled
Test / test_enospc_imm_xor (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
Test / test_scrub_pg_size_3 (push) Has been cancelled
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled
Test / test_scrub_ec (push) Has been cancelled
Test / test_nfs (push) Has been cancelled
Test / make_test (push) Has been cancelled
2025-05-23 20:54:47 +03:00
2b0a802ea1 Fix iothreads sometimes hanging after adding zerocopy support
Some checks reported warnings
Test / test_rebalance_verify_imm (push) Successful in 1m39s
Test / test_dd (push) Successful in 11s
Test / test_rebalance_verify_ec (push) Successful in 1m44s
Test / test_root_node (push) Successful in 7s
Test / test_rebalance_verify_ec_imm (push) Successful in 1m44s
Test / test_write_no_same (push) Successful in 8s
Test / test_switch_primary (push) Successful in 32s
Test / test_write (push) Successful in 31s
Test / test_write_xor (push) Successful in 35s
Test / test_heal_csum_32k_dmj (push) Has been cancelled
Test / test_heal_csum_32k_dj (push) Has been cancelled
Test / test_heal_csum_32k (push) Has been cancelled
Test / test_heal_csum_4k_dmj (push) Has been cancelled
Test / test_heal_csum_4k_dj (push) Has been cancelled
Test / test_heal_ec (push) Has been cancelled
Test / test_heal_csum_4k (push) Has been cancelled
Test / test_resize (push) Has been cancelled
Test / test_heal_pg_size_2 (push) Has been cancelled
Test / test_resize_auto (push) Has been cancelled
Test / test_heal_local_read (push) Has been cancelled
Test / test_snapshot_pool2 (push) Has been cancelled
Test / test_heal_antietcd (push) Has been cancelled
Test / test_osd_tags (push) Has been cancelled
Test / test_enospc (push) Has been cancelled
Test / test_enospc_xor (push) Has been cancelled
Test / test_enospc_imm (push) Has been cancelled
Test / test_enospc_imm_xor (push) Has been cancelled
Test / test_scrub (push) Has been cancelled
Test / test_scrub_zero_osd_2 (push) Has been cancelled
Test / test_scrub_xor (push) Has been cancelled
2025-05-23 20:54:03 +03:00
0dd49c1d67 Followup to "allow to purge running OSDs again"
All checks were successful
Test / test_write_no_same (push) Successful in 7s
Test / test_rebalance_verify_ec_imm (push) Successful in 1m44s
Test / test_write (push) Successful in 34s
Test / test_write_xor (push) Successful in 35s
Test / test_heal_pg_size_2 (push) Successful in 2m16s
Test / test_heal_local_read (push) Successful in 2m19s
Test / test_heal_ec (push) Successful in 2m17s
Test / test_heal_antietcd (push) Successful in 2m19s
Test / test_heal_csum_32k_dmj (push) Successful in 2m16s
Test / test_heal_csum_32k_dj (push) Successful in 2m18s
Test / test_heal_csum_32k (push) Successful in 2m19s
Test / test_heal_csum_4k_dmj (push) Successful in 2m17s
Test / test_resize_auto (push) Successful in 8s
Test / test_resize (push) Successful in 13s
Test / test_osd_tags (push) Successful in 9s
Test / test_snapshot_pool2 (push) Successful in 13s
Test / test_enospc (push) Successful in 10s
Test / test_enospc_xor (push) Successful in 12s
Test / test_enospc_imm (push) Successful in 9s
Test / test_enospc_imm_xor (push) Successful in 14s
Test / test_scrub (push) Successful in 14s
Test / test_scrub_zero_osd_2 (push) Successful in 15s
Test / test_scrub_xor (push) Successful in 14s
Test / test_scrub_pg_size_3 (push) Successful in 14s
Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 15s
Test / test_scrub_ec (push) Successful in 14s
Test / test_nfs (push) Successful in 11s
Test / test_heal_csum_4k_dj (push) Successful in 2m17s
Test / test_heal_csum_4k (push) Successful in 2m18s
Test / test_rebalance_verify (push) Successful in 1m37s
2025-05-22 01:10:05 +03:00
410170db96 Add notes about VNPL in English 2025-05-20 02:12:49 +03:00
7d8523e0e5 Add more notes about VNPL in Russian 2025-05-19 02:41:34 +03:00
52 changed files with 437 additions and 537 deletions

View File

@@ -684,6 +684,24 @@ jobs:
echo ""
done
test_write_iothreads:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
timeout-minutes: 3
run: TEST_NAME=iothreads GLOBAL_CONFIG=',"client_iothread_count":4' /root/vitastor/tests/test_write.sh
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_write_no_same:
runs-on: ubuntu-latest
needs: build

View File

@@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)
project(vitastor)
set(VITASTOR_VERSION "2.2.0")
set(VITASTOR_VERSION "2.2.2")
add_subdirectory(src)

View File

@@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v2.2.0
VITASTOR_VERSION ?= v2.2.2
all: build push

View File

@@ -49,7 +49,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
allowPrivilegeEscalation: true
image: vitalif/vitastor-csi:v2.2.0
image: vitalif/vitastor-csi:v2.2.2
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -121,7 +121,7 @@ spec:
privileged: true
capabilities:
add: ["SYS_ADMIN"]
image: vitalif/vitastor-csi:v2.2.0
image: vitalif/vitastor-csi:v2.2.2
args:
- "--node=$(NODE_ID)"
- "--endpoint=$(CSI_ENDPOINT)"

View File

@@ -5,7 +5,7 @@ package vitastor
const (
vitastorCSIDriverName = "csi.vitastor.io"
vitastorCSIDriverVersion = "2.2.0"
vitastorCSIDriverVersion = "2.2.2"
)
// Config struct fills the parameters of request or user input

2
debian/changelog vendored
View File

@@ -1,4 +1,4 @@
vitastor (2.2.0-1) unstable; urgency=medium
vitastor (2.2.2-1) unstable; urgency=medium
* Bugfixes

View File

@@ -1,4 +1,4 @@
VITASTOR_VERSION ?= v2.2.0
VITASTOR_VERSION ?= v2.2.2
all: build push

View File

@@ -4,7 +4,7 @@
#
# Desired Vitastor version
VITASTOR_VERSION=v2.2.0
VITASTOR_VERSION=v2.2.2
# Additional arguments for all containers
# For example, you may want to specify a custom logging driver here

View File

@@ -26,9 +26,9 @@ at Vitastor Kubernetes operator: https://github.com/Antilles7227/vitastor-operat
The instruction is very simple.
1. Download a Docker image of the desired version: \
`docker pull vitastor:v2.2.0`
`docker pull vitastor:v2.2.2`
2. Install scripts to the host system: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.0 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.2 install.sh`
3. Reload udev rules: \
`udevadm control --reload-rules`

View File

@@ -25,9 +25,9 @@ Vitastor можно установить в Docker/Podman. При этом etcd,
Инструкция по установке максимально простая.
1. Скачайте Docker-образ желаемой версии: \
`docker pull vitastor:v2.2.0`
`docker pull vitastor:v2.2.2`
2. Установите скрипты в хост-систему командой: \
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.0 install.sh`
`docker run --rm -it -v /etc:/host-etc -v /usr/bin:/host-bin vitastor:v2.2.2 install.sh`
3. Перезагрузите правила udev: \
`udevadm control --reload-rules`

View File

@@ -6,10 +6,10 @@
# Proxmox VE
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.1 are supported):
To enable Vitastor support in Proxmox Virtual Environment (6.4-8.x are supported):
- Add the corresponding Vitastor Debian repository into sources.list on Proxmox hosts:
bookworm for 8.1, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
bookworm for 8.1+, pve8.0 for 8.0, bullseye for 7.4, pve7.3 for 7.3, pve7.2 for 7.2, pve7.1 for 7.1, buster for 6.4
- Install vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* or see note) packages from Vitastor repository
- Define storage in `/etc/pve/storage.cfg` (see below)
- Block network access from VMs to Vitastor network (to OSDs and etcd),

View File

@@ -6,10 +6,10 @@
# Proxmox VE
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.1):
Чтобы подключить Vitastor к Proxmox Virtual Environment (поддерживаются версии 6.4-8.x):
- Добавьте соответствующий Debian-репозиторий Vitastor в sources.list на хостах Proxmox:
bookworm для 8.1, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
bookworm для 8.1+, pve8.0 для 8.0, bullseye для 7.4, pve7.3 для 7.3, pve7.2 для 7.2, pve7.1 для 7.1, buster для 6.4
- Установите пакеты vitastor-client, pve-qemu-kvm, pve-storage-vitastor (* или см. сноску) из репозитория Vitastor
- Определите тип хранилища в `/etc/pve/storage.cfg` (см. ниже)
- Обязательно заблокируйте доступ от виртуальных машин к сети Vitastor (OSD и etcd), т.к. Vitastor (пока) не поддерживает аутентификацию

View File

@@ -10,8 +10,17 @@ Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+
Join Vitastor Telegram Chat: https://t.me/vitastor
All server-side code (OSD, Monitor and so on) is licensed under the terms of
Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on
License: VNPL 1.1 for server-side code and dual VNPL 1.1 + GPL 2.0+ for client tools.
Server-side code is licensed only under the terms of VNPL.
Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.
## VNPL
Vitastor Network Public License 1.1 (VNPL 1.1) is a copyleft license based on
GNU GPLv3.0 with the additional "Network Interaction" clause which requires
opensourcing all programs directly or indirectly interacting with Vitastor
through a computer network and expressly designed to be used in conjunction
@@ -20,18 +29,83 @@ the terms of the same license, but also under the terms of any GPL-Compatible
Free Software License, as listed by the Free Software Foundation.
This is a stricter copyleft license than the Affero GPL.
Please note that VNPL doesn't require you to open the code of proprietary
software running inside a VM if it's not specially designed to be used with
Vitastor.
The idea of VNPL is, in addition to modules linked to Vitastor code in a single
binary file, to extend copyleft action to micro-service modules only interacting
with it over the network.
Basically, you can't use the software in a proprietary environment to provide
its functionality to users without opensourcing all intermediary components
standing between the user and Vitastor or purchasing a commercial license
from the author 😀.
Client libraries (cluster_client and so on) are dual-licensed under the same
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed
software like QEMU and fio.
At the same time, VNPL doesn't impose any restrictions on software *not specially designed*
to be used with Vitastor, for example, on Windows running inside a VM with a Vitastor disk.
You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](../../VNPL-1.1.txt).
GPL 2.0 is also included in this repository as [GPL-2.0.txt](../../GPL-2.0.txt).
## Explanation
Network copyleft is governed by the clause **13. Remote Network Interaction** of VNPL.
A program is considered to be a "Proxy Program" if it meets both conditions:
- It is specially designed to be used with Vitastor. Basically, it means that the program
has any functionality specific to Vitastor and thus "knows" that it works with Vitastor,
not with something random.
- It interacts with Vitastor directly or indirectly through any programming interface,
including API, CLI, network or any wrapper (also considered a Proxy Program itself).
If, in addition to that:
- You give any user an opportunity to interact with Vitastor directly or indirectly through
any computer interface including the network or any number of wrappers (Proxy Programs).
Then VNPL requires you to publish the code of all above Proxy Programs to all above users
under the terms of any GPL-compatible license - that is, GPL, LGPL, MIT/BSD or Apache 2,
because "GPL compatibility" is treated as an ability to legally include licensed code in
a GPL application.
So, if you have a "Proxy Program", but it's not open to the user who directly or indirectly
interacts with Vitastor - you are forbidden to use Vitastor under the terms of VNPL and you
need a commercial license which doesn't contain open-source requirements.
## Examples
- Vitastor Kubernetes CSI driver which creates PersistentVolumes by calling `vitastor-cli create`.
- Yes, it interacts with Vitastor through vitastor-cli.
- Yes, it is designed specially for use with Vitastor (it has no sense otherwise).
- So, CSI driver **definitely IS** a Proxy Program and must be published under the terms of
a free software license.
- Windows, installed in a VM with the system disk on Vitastor storage.
- Yes, it interacts with Vitastor indirectly - it reads and writes data through the block
device interface, emulated by QEMU.
- No, it definitely isn't designed specially for use with Vitastor - Windows was created long
ago before Vitastor and doesn't know anything about it.
- So, Windows **definitely IS NOT** a Proxy Program and VNPL doesn't require to open it.
- Cloud control panel which makes requests to Vitastor Kubernetes CSI driver.
- Yes, it interacts with Vitastor indirectly through the CSI driver, which is a Proxy Program.
- May or may not be designed specially for use with Vitastor. How to determine exactly?
Imagine that Vitastor is replaced with any other storage (for example, with a proprietary).
Do control panel functions change in any way? If they do (for example, if snapshots stop working),
then the panel contains specific functionality and thus is designed specially for use with Vitastor.
Otherwise, the panel is universal and isn't designed specially for Vitastor.
- So, whether you are required to open-source the panel also **depends** on whether it
contains specific functionality or not.
## Why?
Because I believe into the spirit of copyleft (Linux wouldn't become so popular without GPL!)
and, at the same time, I want to have a way to monetize the product.
Existing licenses including AGPL are useless for it with an SDS - SDS is a very deeply
internal software which is almost definitely invisible to the user and thus AGPL doesn't
require anyone to open the code even if they make a proprietary fork.
And, in fact, the current situation in the world, where GPL is thought to only restrict direct
linking of programs into a single executable file, isn't quite correct. Nowadays, programs
are more often linked with network API calls, not with /usr/bin/ld, and a software product
may consist of dozens of microservices interacting with each other over the network.
That's why we need VNPL to keep the license sufficiently copyleft.
## License Texts
- VNPL 1.1 in English: [VNPL-1.1.txt](../../VNPL-1.1.txt)
- VNPL 1.1 in Russian: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)

View File

@@ -12,6 +12,14 @@
Лицензия: VNPL 1.1 на серверный код и двойная VNPL 1.1 + GPL 2.0+ на клиентский.
Серверные компоненты распространяются только на условиях VNPL.
Клиентские библиотеки распространяются на условиях двойной лицензии VNPL 1.1
и также на условиях GNU GPL 2.0 или более поздней версии. Так сделано в целях
совместимости с таким ПО, как QEMU и fio.
## VNPL
VNPL - "сетевой копилефт", собственная свободная копилефт-лицензия
Vitastor Network Public License 1.1, основанная на GNU GPL 3.0 с дополнительным
условием "Сетевого взаимодействия", требующим распространять все программы,
@@ -29,9 +37,70 @@ Vitastor Network Public License 1.1, основанная на GNU GPL 3.0 с д
На Windows и любое другое ПО, не разработанное *специально* для использования
вместе с Vitastor, никакие ограничения не накладываются.
Клиентские библиотеки распространяются на условиях двойной лицензии VNPL 1.1
и также на условиях GNU GPL 2.0 или более поздней версии. Так сделано в целях
совместимости с таким ПО, как QEMU и fio.
## Пояснение
Вы можете найти полный текст VNPL 1.1 на английском языке в файле [VNPL-1.1.txt](../../VNPL-1.1.txt),
VNPL 1.1 на русском языке в файле [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt), а GPL 2.0 в файле [GPL-2.0.txt](../../GPL-2.0.txt).
Сетевой копилефт регулируется пунктом лицензии **13. Удалённое сетевое взаимодействие**.
Программа считается "прокси-программой", если верны оба условия:
- Она создана специально для работы вместе с Vitastor. По сути это означает, что программа
должна иметь специфичный для Vitastor функционал, то есть, "знать", что она взаимодействует
именно с Vitastor.
- Она прямо или косвенно взаимодействует с Vitastor через абсолютно любой программный
интерфейс, включая любые способы вызова: API, CLI, сеть или через какую-то обёртку (в
свою очередь тоже являющуюся прокси-программой).
Если в дополнение к этому также:
- Вы предоставляете любому пользователю возможность взаимодействовать с Vitastor по сети,
опять-таки, через любой интерфейс или любую серию "обёрток" (прокси-программ)
То, согласно VNPL, вы должны открыть код "прокси-программ" **таким пользователям** на условиях
любой GPL-совместимой лицензии - то есть, GPL, LGPL, MIT/BSD или Apache 2 - "совместимость с GPL"
понимается как возможность включать лицензируемый код в GPL-приложение.
Соответственно, если у вас есть "прокси-программа", но её код не открыт пользователю,
который прямо или косвенно взаимодействует с Vitastor - вам запрещено использовать Vitastor
на условиях VNPL и вам нужна коммерческая лицензия, не содержащая требований об открытии кода.
## Примеры
- Kubernetes CSI-драйвер Vitastor, создающий PersistentVolume с помощью вызова `vitastor-cli create`.
- Да, взаимодействует с Vitastor через vitastor-cli.
- Да, создавался специально для работы с Vitastor (иначе в чём же ещё его смысл).
- Значит, CSI-драйвер **точно считается** "прокси-программой" и должен быть открыт под свободной
лицензией.
- Windows, установленный в виртуальную машину на диске Vitastor.
- Да, взаимодействует с Vitastor "прямо или косвенно" - пишет и читает данные через интерфейс
блочного устройства, эмулируемый QEMU.
- Нет, точно не создан *специально для работы с Vitastor* - когда его создавали, никакого
Vitastor ещё и в помине не было.
- Значит, Windows **точно не считается** "прокси-программой" и на него требования VNPL не распространяются.
- Панель управления облака, делающая запросы к Kubernetes CSI-драйверу Vitastor.
- Да, взаимодействует с Vitastor косвенно через CSI-драйвер, являющийся "прокси-программой".
- Сходу не известно, создавалась ли конкретно для работы с Vitastor. Как понять, да или нет?
Представьте, что Vitastor заменён на любую другую систему хранения (например, на проприетарную).
Работа панели управления изменится? Если да (например, перестанут работать снапшоты) - значит,
панель содержит специфичный функционал и "создана специально для работы с Vitastor".
Если нет - значит, специфичного функционала панель не содержит и в принципе она универсальна.
- Нужно ли открывать панель - **зависит** от того, содержит она специфичный функционал или нет.
## Почему так?
Потому что я одновременно верю в дух копилефт-лицензий (Linux не стал бы так популярен,
если бы не GPL!) и хочу иметь возможность монетизации продукта.
При этом использовать даже AGPL для программной СХД бессмысленно - это глубоко внутреннее
ПО, которое пользователь почти наверняка не увидит вообще, поэтому и открывать код никому
никогда не придётся, даже при создании производного продукта.
Да и в целом сложившаяся в мире ситуация, при которой действие GPL ограничивается только
прямым связыванием в один исполняемый файл, не очень корректна. В настоящее время программы
гораздо чаще интегрируются посредством сетевых вызовов, а не с помощью /usr/bin/ld, и общий программный
продукт может состоять из нескольких десятков микросервисов, взаимодействующих по сети.
Поэтому для сохранения достаточной "копилефтности" и придумана VNPL.
## Тексты лицензий
- VNPL 1.1 на английском языке: [VNPL-1.1.txt](../../VNPL-1.1.txt)
- VNPL 1.1 на русском языке: [VNPL-1.1-RU.txt](../../VNPL-1.1-RU.txt)
- GPL 2.0: [GPL-2.0.txt](../../GPL-2.0.txt)

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor-mon",
"version": "2.2.0",
"version": "2.2.2",
"description": "Vitastor SDS monitor service",
"main": "mon-main.js",
"scripts": {

View File

@@ -1,6 +1,6 @@
{
"name": "vitastor",
"version": "2.2.0",
"version": "2.2.2",
"description": "Low-level native bindings to Vitastor client library",
"main": "index.js",
"keywords": [

View File

@@ -410,8 +410,8 @@ sub volume_size_info
my $prefix = defined $scfg->{vitastor_prefix} ? $scfg->{vitastor_prefix} : 'pve/';
my ($vtype, $name, $vmid) = $class->parse_volname($volname);
my $info = _process_list($scfg, $storeid, run_cli($scfg, [ 'ls', $prefix.$name ]))->[0];
#return wantarray ? ($size, $format, $used, $parent, $st->ctime) : $size;
return $info->{size};
# (size, format, used, parent, ctime)
return wantarray ? ($info->{size}, $info->{format}, $info->{size}, $info->{parent}, 0) : $info->{size};
}
sub volume_resize

View File

@@ -50,7 +50,7 @@ from cinder.volume import configuration
from cinder.volume import driver
from cinder.volume import volume_utils
VITASTOR_VERSION = '2.2.0'
VITASTOR_VERSION = '2.2.2'
LOG = logging.getLogger(__name__)

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 2.2.0
Version: 2.2.2
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.0.el7.tar.gz
Source0: vitastor-2.2.2.el7.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 2.2.0
Version: 2.2.2
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.0.el8.tar.gz
Source0: vitastor-2.2.2.el8.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -1,11 +1,11 @@
Name: vitastor
Version: 2.2.0
Version: 2.2.2
Release: 1%{?dist}
Summary: Vitastor, a fast software-defined clustered block storage
License: Vitastor Network Public License 1.1
URL: https://vitastor.io/
Source0: vitastor-2.2.0.el9.tar.gz
Source0: vitastor-2.2.2.el9.tar.gz
BuildRequires: liburing-devel >= 0.6
BuildRequires: gperftools-devel

View File

@@ -19,7 +19,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
endif()
add_definitions(-DVITASTOR_VERSION="2.2.0")
add_definitions(-DVITASTOR_VERSION="2.2.2")
add_definitions(-D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
add_link_options(-fno-omit-frame-pointer)
if (${WITH_ASAN})

View File

@@ -31,7 +31,6 @@
#define DEFAULT_DATA_BLOCK_ORDER 17
#define MIN_DATA_BLOCK_SIZE 4*1024
#define MAX_DATA_BLOCK_SIZE 128*1024*1024
#define MAX_META_BLOCK_SIZE 64*1024
#define DEFAULT_BITMAP_GRANULARITY 4096
#define BS_OP_MIN 1

View File

@@ -127,9 +127,9 @@ void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config
{
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
else if (meta_block_size > MAX_META_BLOCK_SIZE)
else if (meta_block_size > MAX_DATA_BLOCK_SIZE)
{
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_META_BLOCK_SIZE));
throw std::runtime_error("meta_block_size must not exceed "+std::to_string(MAX_DATA_BLOCK_SIZE));
}
if (data_offset % disk_alignment)
{

View File

@@ -427,6 +427,13 @@ stop_flusher:
printf("Flushing %jx:%jx v%ju\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif
flusher->active_flushers++;
// Find it in clean_db
{
auto & clean_db = bs->clean_db_shard(cur.oid);
auto clean_it = clean_db.find(cur.oid);
old_clean_ver = (clean_it != clean_db.end() ? clean_it->second.version : 0);
old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
}
// Scan dirty versions of the object to determine what we need to read
scan_dirty();
// Writes and deletes shouldn't happen at the same time
@@ -531,7 +538,7 @@ resume_2:
{
// zero out old metadata entry
{
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos);
clean_disk_entry *old_entry = (clean_disk_entry*)((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size);
if (old_entry->oid.inode != 0 && old_entry->oid != cur.oid)
{
printf("Fatal error (metadata corruption or bug): tried to wipe metadata entry %ju (%jx:%jx v%ju) as old location of %jx:%jx\n",
@@ -540,7 +547,7 @@ resume_2:
exit(1);
}
}
memset((uint8_t*)meta_old.buf + meta_old.pos, 0, bs->dsk.clean_entry_size);
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
resume_20:
if (meta_old.sector != meta_new.sector && !write_meta_block(meta_old, 20))
return false;
@@ -601,7 +608,7 @@ resume_2:
void journal_flusher_co::update_metadata_entry()
{
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{
printf(
@@ -616,7 +623,7 @@ void journal_flusher_co::update_metadata_entry()
if (has_delete)
{
// Zero out the new metadata entry
memset((uint8_t*)meta_new.buf + meta_new.pos, 0, bs->dsk.clean_entry_size);
memset((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
}
else
{
@@ -798,7 +805,7 @@ bool journal_flusher_co::clear_incomplete_csum_block_bits(int wait_base)
}
}
{
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
if (new_entry->oid != cur.oid)
{
printf(
@@ -905,12 +912,6 @@ void journal_flusher_co::calc_block_checksums(uint32_t *new_data_csums, bool ski
void journal_flusher_co::scan_dirty()
{
// Find it in clean_db
auto & clean_db = bs->clean_db_shard(cur.oid);
auto clean_it = clean_db.find(cur.oid);
old_clean_ver = (clean_it != clean_db.end() ? clean_it->second.version : 0);
old_clean_loc = (clean_it != clean_db.end() ? clean_it->second.location : UINT64_MAX);
auto old_clean_bitmap = (clean_it != clean_db.end() ? bs->get_clean_entry_bitmap(clean_it, 0) : NULL);
dirty_it = dirty_start = dirty_end;
v.clear();
copy_count = 0;
@@ -1036,12 +1037,13 @@ void journal_flusher_co::scan_dirty()
read_to_fill_incomplete = 0;
return;
}
uint8_t *bmp_ptr = bs->get_clean_entry_bitmap(old_clean_loc, 0);
uint64_t fulfilled = 0;
int last = v.size()-1;
while (last >= 0 && (v[last].copy_flags & COPY_BUF_CSUM_FILL))
last--;
read_to_fill_incomplete = bs->fill_partial_checksum_blocks(
v, fulfilled, old_clean_bitmap, NULL, false, NULL, v[0].offset/bs->dsk.csum_block_size * bs->dsk.csum_block_size,
v, fulfilled, bmp_ptr, NULL, false, NULL, v[0].offset/bs->dsk.csum_block_size * bs->dsk.csum_block_size,
((v[last].offset+v[last].len-1) / bs->dsk.csum_block_size + 1) * bs->dsk.csum_block_size
);
}
@@ -1137,7 +1139,7 @@ bool journal_flusher_co::modify_meta_do_reads(int wait_base)
resume_0:
if (!modify_meta_read(clean_loc, meta_new, wait_base+0))
return false;
new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos + sizeof(clean_disk_entry);
new_clean_bitmap = (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry);
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
{
resume_1:
@@ -1191,7 +1193,7 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
// so I'll avoid it as long as I can.
wr.submitted = false;
wr.sector = ((meta_loc >> bs->dsk.block_order) / (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.clean_entry_size;
wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size));
if (bs->inmemory_meta)
{
wr.buf = (uint8_t*)bs->metadata_buffer + wr.sector;

View File

@@ -42,8 +42,6 @@ blockstore_impl_t::~blockstore_impl_t()
free(metadata_buffer);
if (clean_bitmaps)
free(clean_bitmaps);
if (heap_meta.blocks)
delete[] heap_meta.blocks;
}
bool blockstore_impl_t::is_started()
@@ -433,29 +431,13 @@ blockstore_clean_db_t& blockstore_impl_t::clean_db_shard(object_id oid)
{
uint64_t pg_num = 0;
uint64_t pool_id = (oid.inode >> (64-POOL_ID_BITS));
auto sett_it = clean_db_settings.find(pool_id);
if (sett_it != clean_db_settings.end())
auto sh_it = clean_db_settings.find(pool_id);
if (sh_it != clean_db_settings.end())
{
// like map_to_pg()
pg_num = (oid.stripe / sett_it->second.pg_stripe_size) % sett_it->second.pg_count + 1;
pg_num = (oid.stripe / sh_it->second.pg_stripe_size) % sh_it->second.pg_count + 1;
}
auto shard_id = (pool_id << (64-POOL_ID_BITS)) | pg_num;
if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
auto sh_it = clean_db_shards.find(shard_id);
if (sh_it == clean_db_shards.end())
{
// clean_db_t stores larger entries with heap_meta, but we disguise it as smaller clean_entry :)
// patched cpp-btree with extra_data
clean_db_shards[shard_id] = blockstore_clean_db_t(
sizeof(clean_entry_heap_t) - sizeof(clean_entry)
+ (inmemory_meta ? dsk.clean_dyn_size : 2*dsk.clean_entry_bitmap_size)
);
return clean_db_shards[shard_id];
}
return sh_it->second;
}
return clean_db_shards[shard_id];
return clean_db_shards[(pool_id << (64-POOL_ID_BITS)) | pg_num];
}
void blockstore_impl_t::reshard_clean_db(pool_id_t pool, uint32_t pg_count, uint32_t pg_stripe_size)

View File

@@ -96,9 +96,6 @@
#define BLOCKSTORE_META_MAGIC_V1 0x726F747341544956l
#define BLOCKSTORE_META_FORMAT_V1 1
#define BLOCKSTORE_META_FORMAT_V2 2
#define BLOCKSTORE_META_FORMAT_HEAP 3
#define BLOCKSTORE_META_HEADER_V1_SIZE 36
#define BLOCKSTORE_META_HEADER_V2_SIZE 48
// metadata header (superblock)
struct __attribute__((__packed__)) blockstore_meta_header_v1_t
@@ -122,7 +119,6 @@ struct __attribute__((__packed__)) blockstore_meta_header_v2_t
uint32_t data_csum_type;
uint32_t csum_block_size;
uint32_t header_csum;
uint32_t block_id_bits; // 32 by default in heap meta
};
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
@@ -144,62 +140,6 @@ struct __attribute__((__packed__)) clean_entry
uint64_t location;
};
typedef uint32_t heap_block_num_t;
// 50 = 16 (key=object_id) + 26 (value) + 8 (bitmap) + N (checksum) bytes per "clean" entry in memory
struct __attribute__((__packed__)) clean_entry_heap_t
{
uint64_t version;
uint64_t location; // UINT64_MAX = deleted
// previous versions invalidated by this version
heap_block_num_t prev_versions;
// metadata block number
heap_block_num_t meta_block;
// offset within block
uint16_t block_offset;
uint8_t bitmap[];
};
struct __attribute__((__packed__)) heap_meta_block_header_t
{
uint64_t magic;
uint64_t seq_num;
uint32_t invalidates_blocks;
};
// 48+checksums = (40+bitmap)+checksums bytes per on-disk "heap" entry
// for 128 KB block without checksums, it's 48 bytes - 84 entries per 4 kb metadata block
// for 128 KB block with 4k checksums, it's 176 bytes - 22 entries per 4 kb metadata block
// for 1 MB block without checksums, it's 80 bytes - 50 entries per 4 kb metadata block
// for 1 MB block with 4k checksums, it's 1104 bytes O_o - only 3 entries per 4 kb metadata block
// for 1 MB block with 32k checksums, it's 176 bytes again
struct __attribute__((__packed__)) heap_meta_entry_t
{
object_id oid;
uint64_t version;
uint64_t location; // UINT64_MAX = deleted
uint64_t reserved;
uint8_t bitmap[];
};
struct heap_meta_block_t
{
heap_block_num_t offset = 0;
uint64_t seq_num = 0;
uint32_t used_space = 0;
std::vector<uint64_t> invalidates_blocks;
};
struct heap_meta_t
{
heap_block_num_t block_count = 0;
heap_meta_block_t *blocks = NULL;
// used space => block number
std::multimap<uint32_t, heap_block_num_t> used_space_map;
heap_block_num_t cur_written_block = 0;
uint8_t *written_block_buf = NULL;
};
// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry). Plus checksums
struct __attribute__((__packed__)) dirty_entry
{
@@ -332,8 +272,6 @@ class blockstore_impl_t
struct ring_consumer_t ring_consumer;
heap_meta_t heap_meta;
std::map<pool_id_t, pool_shard_settings_t> clean_db_settings;
std::map<pool_pg_id_t, blockstore_clean_db_t> clean_db_shards;
std::map<uint64_t, int> no_inode_stats;
@@ -379,7 +317,7 @@ class blockstore_impl_t
void open_data();
void open_meta();
void open_journal();
uint8_t* get_clean_entry_bitmap(blockstore_clean_db_t::iterator clean_it, int offset);
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
blockstore_clean_db_t& clean_db_shard(object_id oid);
void reshard_clean_db(pool_id_t pool_id, uint32_t pg_count, uint32_t pg_stripe_size);
@@ -407,9 +345,9 @@ class blockstore_impl_t
uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location,
uint64_t journal_sector, uint8_t *csum, int *dyn_data);
bool fulfill_clean_read_journal(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
bool fulfill_clean_read_meta(blockstore_op_t *read_op, uint64_t & fulfilled, blockstore_clean_db_t::iterator clean_it);
bool fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data,
uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver);
int fill_partial_checksum_blocks(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, bool from_journal, uint8_t *read_buf, uint64_t read_offset, uint64_t read_end);
int pad_journal_read(std::vector<copy_buffer_t> & rv, copy_buffer_t & cp,
@@ -418,7 +356,7 @@ class blockstore_impl_t
bool read_range_fulfilled(std::vector<copy_buffer_t> & rv, uint64_t & fulfilled, uint8_t *read_buf,
uint8_t *clean_entry_bitmap, uint32_t item_start, uint32_t item_end);
bool read_checksum_block(blockstore_op_t *op, int rv_pos, uint64_t &fulfilled, uint64_t clean_loc);
uint8_t* read_clean_meta_block(blockstore_op_t *op, blockstore_clean_db_t::iterator clean_it, int rv_pos);
uint8_t* read_clean_meta_block(blockstore_op_t *read_op, uint64_t clean_loc, int rv_pos);
bool verify_padded_checksums(uint8_t *clean_entry_bitmap, uint8_t *csum_buf, uint32_t offset,
iovec *iov, int n_iov, std::function<void(uint32_t, uint32_t, uint32_t)> bad_block_cb);
bool verify_journal_checksums(uint8_t *csums, uint32_t offset,

View File

@@ -54,7 +54,6 @@ int blockstore_init_meta::loop()
else if (wait_state == 4) goto resume_4;
else if (wait_state == 5) goto resume_5;
else if (wait_state == 6) goto resume_6;
else if (wait_state == 7) goto resume_7;
printf("Reading blockstore metadata\n");
if (bs->inmemory_meta)
metadata_buffer = bs->metadata_buffer;
@@ -79,7 +78,6 @@ resume_1:
if (iszero((uint64_t*)metadata_buffer, bs->dsk.meta_block_size / sizeof(uint64_t)))
{
{
memset(metadata_buffer, 0, bs->dsk.meta_block_size);
blockstore_meta_header_v2_t *hdr = (blockstore_meta_header_v2_t *)metadata_buffer;
hdr->zero = 0;
hdr->magic = BLOCKSTORE_META_MAGIC_V1;
@@ -87,19 +85,12 @@ resume_1:
hdr->meta_block_size = bs->dsk.meta_block_size;
hdr->data_block_size = bs->dsk.data_block_size;
hdr->bitmap_granularity = bs->dsk.bitmap_granularity;
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_HEAP)
{
hdr->block_id_bits = sizeof(heap_block_num_t);
}
if (bs->dsk.meta_format >= BLOCKSTORE_META_FORMAT_V2)
{
hdr->data_csum_type = bs->dsk.data_csum_type;
hdr->csum_block_size = bs->dsk.csum_block_size;
hdr->header_csum = 0;
hdr->header_csum = crc32c(0, hdr,
bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_V2
? BLOCKSTORE_META_HEADER_V2_SIZE
: sizeof(*hdr));
hdr->header_csum = crc32c(0, hdr, sizeof(*hdr));
}
}
if (bs->readonly)
@@ -137,7 +128,7 @@ resume_1:
);
exit(1);
}
if (hdr->version == BLOCKSTORE_META_FORMAT_HEAP)
if (hdr->version == BLOCKSTORE_META_FORMAT_V2)
{
uint32_t csum = hdr->header_csum;
hdr->header_csum = 0;
@@ -147,23 +138,6 @@ resume_1:
exit(1);
}
hdr->header_csum = csum;
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_HEAP;
if (hdr->block_id_bits != sizeof(heap_block_num_t))
{
printf("Heap metadata block ID size (%u) is not supported by this build\n", hdr->block_id_bits);
exit(1);
}
}
else if (hdr->version == BLOCKSTORE_META_FORMAT_V2)
{
uint32_t csum = hdr->header_csum;
hdr->header_csum = 0;
if (crc32c(0, hdr, BLOCKSTORE_META_HEADER_V2_SIZE) != csum)
{
printf("Metadata header is corrupt (checksum mismatch).\n");
exit(1);
}
hdr->header_csum = csum;
if (bs->dsk.meta_format != BLOCKSTORE_META_FORMAT_V2)
{
bs->dsk.meta_format = BLOCKSTORE_META_FORMAT_V2;
@@ -186,11 +160,11 @@ resume_1:
printf("Warning: Starting with metadata in the old format without checksums, as stored on disk\n");
}
}
else
else if (hdr->version > BLOCKSTORE_META_FORMAT_V2)
{
printf(
"Metadata format is too new for me (stored version is %ju, max supported %u).\n",
hdr->version, BLOCKSTORE_META_FORMAT_HEAP
hdr->version, BLOCKSTORE_META_FORMAT_V2
);
exit(1);
}
@@ -215,12 +189,7 @@ resume_1:
// Skip superblock
md_offset = bs->dsk.meta_block_size;
next_offset = md_offset;
entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size; // FIXME only array
if (bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
bs->heap_meta.blocks = new heap_meta_block_t[bs->dsk.meta_len / bs->dsk.meta_block_size];
bs->heap_meta.block_count = bs->dsk.meta_len / bs->dsk.meta_block_size;
}
entries_per_block = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
// Read the rest of the metadata
resume_2:
if (next_offset < bs->dsk.meta_len && submitted == 0)
@@ -264,10 +233,9 @@ resume_2:
bool changed = false;
for (uint64_t sector = 0; sector < bufs[i].size; sector += bs->dsk.meta_block_size)
{
auto this_changed = bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP
? handle_heap_meta_block(bufs[i].buf + sector, bufs[i].offset + sector - md_offset)
: handle_array_meta_block(bufs[i].buf + sector, bufs[i].offset + sector - md_offset);
if (this_changed)
// handle <count> entries
if (handle_meta_block(bufs[i].buf + sector, entries_per_block,
((bufs[i].offset + sector - md_offset) / bs->dsk.meta_block_size) * entries_per_block))
changed = true;
}
if (changed && !bs->inmemory_meta && !bs->readonly)
@@ -294,41 +262,6 @@ resume_2:
wait_state = 2;
return 1;
}
if (bs->dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
// build used_space index
for (heap_block_num_t i = 0; i < bs->heap_meta.block_count; i++)
{
bs->heap_meta.used_space_map.emplace(std::pair<uint32_t, heap_block_num_t>(bs->heap_meta.blocks[i].used_space, i));
}
}
if (heap_invalidated_block_seq.size() && !bs->readonly)
{
// zero out invalidated blocks not zeroed during the previous OSD execution
for (auto inv_seq: heap_invalidated_block_seq)
{
auto num_it = heap_block_by_seq.find(inv_seq);
if (num_it != heap_block_by_seq.end())
heap_invalidated_block_nums.push_back(num_it->second);
}
memset(metadata_buffer, 0, bs->dsk.meta_block_size);
for (i = 0; i < heap_invalidated_block_nums.size(); i++)
{
GET_SQE();
last_read_offset = heap_invalidated_block_nums[i]*bs->dsk.meta_block_size;
data->iov = { metadata_buffer, (size_t)bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data, -1); };
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + last_read_offset);
bs->ringloop->submit();
submitted++;
resume_7:
if (submitted > 0)
{
wait_state = 7;
return 1;
}
}
}
if (entries_to_zero.size() && !bs->inmemory_meta && !bs->readonly)
{
std::sort(entries_to_zero.begin(), entries_to_zero.end());
@@ -396,9 +329,8 @@ resume_6:
return 0;
}
bool blockstore_init_meta::handle_array_meta_block(uint8_t *buf, uint64_t block_offset)
bool blockstore_init_meta::handle_meta_block(uint8_t *buf, uint64_t entries_per_block, uint64_t done_cnt)
{
uint64_t done_cnt = (block_offset / bs->dsk.meta_block_size) * entries_per_block;
bool updated = false;
uint64_t max_i = entries_per_block;
if (max_i > bs->dsk.block_count-done_cnt)
@@ -497,132 +429,6 @@ bool blockstore_init_meta::handle_array_meta_block(uint8_t *buf, uint64_t block_
return updated;
}
static int bitmap_count_ones(uint8_t *bitmap, int size)
{
int n = 0, i = 0;
for (; i <= size-sizeof(unsigned); i += sizeof(unsigned))
{
n += __builtin_popcount(*(unsigned*)(bitmap+i));
}
for (; i < size; i++)
{
n += __builtin_popcount(*(unsigned char*)(bitmap+i));
}
return n;
}
// v3 / heap / "cow" metadata block
bool blockstore_init_meta::handle_heap_meta_block(uint8_t *buf, uint64_t block_offset)
{
if ((block_offset / bs->dsk.meta_block_size) > (heap_block_num_t)-1)
{
fprintf(stderr, "Metadata area too large\n");
exit(1);
}
// Validate block CRC
uint32_t block_crc = *(uint32_t*)(buf + bs->dsk.meta_block_size - 4);
if (crc32c(0, buf, bs->dsk.meta_block_size-4) != block_crc)
{
return false;
}
// Validate header
heap_meta_block_header_t *hdr = (heap_meta_block_header_t*)buf;
if (hdr->magic != BLOCKSTORE_META_MAGIC_V1)
{
return false;
}
if (hdr->invalidates_blocks > (bs->dsk.meta_block_size-4-sizeof(heap_meta_block_header_t))/sizeof(uint64_t))
{
fprintf(stderr, "Metadata block at %jx contains too large invalidates_blocks count: %x\n", block_offset, hdr->invalidates_blocks);
exit(1);
}
if (heap_invalidated_block_seq.find(hdr->seq_num) != heap_invalidated_block_seq.end())
{
// Check if the block is invalidated and handled after the block that invalidates it
return false;
}
uint64_t hdr_size = sizeof(heap_meta_block_header_t) + hdr->invalidates_blocks*8;
heap_meta_block_t & blk = bs->heap_meta.blocks[block_offset/bs->dsk.meta_block_size];
blk.offset = block_offset;
blk.seq_num = hdr->seq_num;
blk.used_space = hdr_size + 4;
uint64_t *hdr_inv = (uint64_t*)(hdr + 1);
for (int i = 0; i < hdr->invalidates_blocks; i++)
{
blk.invalidates_blocks.push_back(hdr_inv[i]);
heap_invalidated_block_seq.insert(hdr_inv[i]);
}
heap_block_by_seq[hdr->seq_num] = block_offset;
// Process sub-blocks
uint64_t heap_entry_size = sizeof(heap_meta_entry_t) + bs->dsk.clean_dyn_size;
for (uint64_t pos = sizeof(heap_meta_block_header_t); pos < bs->dsk.meta_block_size-4; pos += heap_entry_size)
{
heap_meta_entry_t *diskentry = (heap_meta_entry_t*)(buf + pos);
if (!diskentry->oid.inode || !diskentry->version)
{
continue;
}
auto & clean_db = bs->clean_db_shard(diskentry->oid);
auto mementry = (clean_entry_heap_t*)(&clean_db[diskentry->oid]);
bool exists = mementry->version != 0;
if (exists && mementry->version >= diskentry->version)
{
if (mementry->version == diskentry->version)
{
// Voluntarily allow duplicates of in-memory entries with different
// bitmaps to support checksum updates with hole-punching
int old_count = bitmap_count_ones(mementry->bitmap, bs->dsk.clean_entry_bitmap_size);
int new_count = bitmap_count_ones(diskentry->bitmap, bs->dsk.clean_entry_bitmap_size);
if (old_count < new_count)
{
continue;
}
}
else
{
continue;
}
}
blk.used_space += heap_entry_size;
if (exists && mementry->location != UINT64_MAX)
{
// free the previous block
uint64_t old_clean_loc = mementry->location >> bs->dsk.block_order;
#ifdef BLOCKSTORE_DEBUG
printf("Free block %ju from %jx:%jx v%ju\n", 1+old_clean_loc,
diskentry->oid.inode, diskentry->oid.stripe, mementry->version);
#endif
bs->data_alloc->set(old_clean_loc, false);
bs->inode_space_stats[diskentry->oid.inode] -= bs->dsk.data_block_size;
bs->used_blocks--;
bs->heap_meta.blocks[mementry->meta_block].used_space -= heap_entry_size;
}
if (diskentry->location != UINT64_MAX)
{
bs->data_alloc->set(diskentry->location >> bs->dsk.block_order, true);
bs->inode_space_stats[diskentry->oid.inode] += bs->dsk.data_block_size;
bs->used_blocks++;
#ifdef BLOCKSTORE_DEBUG
printf("Allocate block (heap entry) %ju: %jx:%jx v%ju\n", 1 + (diskentry->location >> bs->dsk.block_order),
diskentry->oid.inode, diskentry->oid.stripe, diskentry->version);
#endif
}
mementry->version = diskentry->version;
mementry->location = diskentry->location;
mementry->meta_block = block_offset / bs->dsk.meta_block_size;
mementry->block_offset = block_offset % bs->dsk.meta_block_size;
if (exists)
{
mementry->prev_versions++;
}
// Extra data: 2 bitmaps + checksums or just 2 bitmaps if inmemory_meta is disabled
memcpy(&mementry->bitmap, &diskentry->bitmap, bs->inmemory_meta ? bs->dsk.clean_dyn_size : 2*bs->dsk.clean_entry_bitmap_size);
entries_loaded++;
}
// We have to zero out headers of invalidated blocks, but we'll do it later
return false;
}
blockstore_init_journal::blockstore_init_journal(blockstore_impl_t *bs)
{
this->bs = bs;

View File

@@ -28,13 +28,7 @@ class blockstore_init_meta
unsigned entries_per_block = 0;
int i = 0, j = 0;
std::vector<uint64_t> entries_to_zero;
std::map<uint64_t, heap_block_num_t> heap_block_by_seq;
std::set<uint64_t> heap_invalidated_block_seq;
std::vector<heap_block_num_t> heap_invalidated_block_nums;
bool handle_array_meta_block(uint8_t *buf, uint64_t block_offset);
bool handle_heap_meta_block(uint8_t *buf, uint64_t block_offset);
bool handle_meta_block(uint8_t *buf, uint64_t count, uint64_t done_cnt);
void handle_event(ring_data_t *data, int buf_num);
public:
blockstore_init_meta(blockstore_impl_t *bs);

View File

@@ -111,10 +111,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
{
metadata_buf_size = 4*1024*1024;
}
if (metadata_buf_size % dsk.meta_block_size)
{
metadata_buf_size = ((metadata_buf_size+dsk.meta_block_size-1) / dsk.meta_block_size) * dsk.meta_block_size;
}
if (dsk.meta_device == dsk.data_device)
{
disable_meta_fsync = disable_data_fsync;

View File

@@ -148,14 +148,10 @@ int blockstore_impl_t::fulfill_read(blockstore_op_t *read_op,
return r;
}
uint8_t* blockstore_impl_t::get_clean_entry_bitmap(blockstore_clean_db_t::iterator clean_it, int offset)
uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
{
if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
return ((uint8_t*)&clean_it->second) + sizeof(clean_entry_heap_t) + offset;
}
uint8_t *clean_entry_bitmap;
uint64_t meta_loc = clean_it->second.location >> dsk.block_order;
uint64_t meta_loc = block_loc >> dsk.block_order;
if (inmemory_meta)
{
uint64_t sector = (meta_loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
@@ -163,9 +159,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(blockstore_clean_db_t::iterat
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
}
else
{
clean_entry_bitmap = (uint8_t*)(clean_bitmaps + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
}
return clean_entry_bitmap;
}
@@ -439,7 +433,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!IS_JOURNAL(dirty.state))
{
// Read from data disk, possibly checking checksums
if (!fulfill_clean_read_journal(read_op, fulfilled, bmp_ptr, dyn_data,
if (!fulfill_clean_read(read_op, fulfilled, bmp_ptr, dyn_data,
dirty.offset, dirty.offset+dirty.len, dirty.location, dirty_it->first.version))
{
goto undo_read;
@@ -470,13 +464,14 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
result_version = clean_it->second.version;
if (read_op->bitmap)
{
void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
}
if (fulfilled < read_op->len)
{
if (!fulfill_clean_read_meta(read_op, fulfilled, clean_it))
if (!fulfill_clean_read(read_op, fulfilled, NULL, NULL, 0, dsk.data_block_size,
clean_it->second.location, clean_it->second.version))
{
goto undo_read;
}
@@ -586,22 +581,40 @@ int blockstore_impl_t::pad_journal_read(std::vector<copy_buffer_t> & rv, copy_bu
return 0;
}
bool blockstore_impl_t::fulfill_clean_read_journal(blockstore_op_t *read_op, uint64_t & fulfilled,
bool blockstore_impl_t::fulfill_clean_read(blockstore_op_t *read_op, uint64_t & fulfilled,
uint8_t *clean_entry_bitmap, int *dyn_data, uint32_t item_start, uint32_t item_end, uint64_t clean_loc, uint64_t clean_ver)
{
bool from_journal = clean_entry_bitmap != NULL;
if (!clean_entry_bitmap)
{
// NULL clean_entry_bitmap means we're reading from data, not from the journal,
// and the bitmap location is obvious
clean_entry_bitmap = get_clean_entry_bitmap(clean_loc, 0);
}
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
auto & rv = PRIV(read_op)->read_vec;
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, true,
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, dyn_data, from_journal,
(uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
if (!inmemory_meta && !from_journal && req > 0)
{
// Read checksums from disk
uint8_t *csum_buf = read_clean_meta_block(read_op, clean_loc, rv.size()-req);
for (int i = req; i > 0; i--)
{
rv[rv.size()-i].csum_buf = csum_buf;
}
}
for (int i = req; i > 0; i--)
{
if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
{
return false;
}
}
PRIV(read_op)->clean_block_used = req > 0;
}
else
else if (from_journal)
{
// Don't scan bitmap - journal writes don't have holes (internal bitmap)!
uint8_t *csum = !dsk.csum_block_size ? 0 : (clean_entry_bitmap + dsk.clean_entry_bitmap_size +
@@ -622,43 +635,6 @@ bool blockstore_impl_t::fulfill_clean_read_journal(blockstore_op_t *read_op, uin
assert(fulfill_read(read_op, fulfilled, item_end, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0, 0, NULL, NULL));
}
}
// Increment reference counter if clean data is being read from the disk
if (PRIV(read_op)->clean_block_used)
{
auto & uo = used_clean_objects[clean_loc];
uo.refs++;
if (dsk.csum_block_size && flusher->is_mutated(clean_loc))
uo.was_changed = true;
PRIV(read_op)->clean_block_used = clean_loc;
}
return true;
}
bool blockstore_impl_t::fulfill_clean_read_meta(blockstore_op_t *read_op, uint64_t & fulfilled, blockstore_clean_db_t::iterator clean_it)
{
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it, 0);
uint64_t clean_loc = clean_it->second.location;
if (dsk.csum_block_size > dsk.bitmap_granularity)
{
auto & rv = PRIV(read_op)->read_vec;
int req = fill_partial_checksum_blocks(rv, fulfilled, clean_entry_bitmap, NULL, false,
(uint8_t*)read_op->buf, read_op->offset, read_op->offset+read_op->len);
if (!inmemory_meta && req > 0)
{
// Read checksums from disk
uint8_t *csum_buf = read_clean_meta_block(read_op, clean_it, rv.size()-req);
for (int i = req; i > 0; i--)
{
rv[rv.size()-i].csum_buf = csum_buf;
}
}
for (int i = req; i > 0; i--)
{
if (!read_checksum_block(read_op, i, fulfilled, clean_loc))
return false;
}
PRIV(read_op)->clean_block_used = req > 0;
}
else
{
bool csum_done = !dsk.csum_block_size || inmemory_meta;
@@ -686,13 +662,13 @@ bool blockstore_impl_t::fulfill_clean_read_meta(blockstore_op_t *read_op, uint64
if (!csum_done)
{
// Read checksums from disk
csum_buf = read_clean_meta_block(read_op, clean_it, PRIV(read_op)->read_vec.size());
csum_buf = read_clean_meta_block(read_op, clean_loc, PRIV(read_op)->read_vec.size());
csum_done = true;
}
uint8_t *csum = !dsk.csum_block_size ? 0 : (csum_buf + 2*dsk.clean_entry_bitmap_size + bmp_start*(dsk.data_csum_type & 0xFF));
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, NULL))
clean_loc + bmp_start * dsk.bitmap_granularity, 0, csum, dyn_data))
{
return false;
}
@@ -712,22 +688,11 @@ bool blockstore_impl_t::fulfill_clean_read_meta(blockstore_op_t *read_op, uint64
return true;
}
uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, blockstore_clean_db_t::iterator clean_it, int rv_pos)
uint8_t* blockstore_impl_t::read_clean_meta_block(blockstore_op_t *op, uint64_t clean_loc, int rv_pos)
{
uint64_t sector, pos;
auto & rv = PRIV(op)->read_vec;
if (dsk.meta_format == BLOCKSTORE_META_FORMAT_HEAP)
{
auto clean_heap_entry = (clean_entry_heap_t*)(&clean_it->second);
sector = clean_heap_entry->meta_block * dsk.meta_block_size;
pos = clean_heap_entry->block_offset;
}
else
{
auto clean_loc = clean_it->second.location;
sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
}
auto sector = ((clean_loc >> dsk.block_order) / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
auto pos = ((clean_loc >> dsk.block_order) % (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.clean_entry_size;
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.meta_block_size);
rv.insert(rv.begin()+rv_pos, (copy_buffer_t){
.copy_flags = COPY_BUF_META_BLOCK|COPY_BUF_CSUM_FILL,
@@ -842,6 +807,11 @@ bool blockstore_impl_t::verify_clean_padded_checksums(blockstore_op_t *op, uint6
if (from_journal)
return verify_padded_checksums(dyn_data, dyn_data + dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
clean_loc = (clean_loc >> dsk.block_order) << dsk.block_order;
if (!dyn_data)
{
assert(inmemory_meta);
dyn_data = get_clean_entry_bitmap(clean_loc, 0);
}
return verify_padded_checksums(dyn_data, dyn_data + 2*dsk.clean_entry_bitmap_size, offset, iov, n_iov, bad_block_cb);
}
@@ -899,18 +869,8 @@ void blockstore_impl_t::handle_read_event(ring_data_t *data, blockstore_op_t *op
auto & uo = used_clean_objects.at((rv[i].disk_offset >> dsk.block_order) << dsk.block_order);
if (!uo.was_changed)
{
bool from_journal = (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG);
auto csum_buf = rv[i].csum_buf;
if (!from_journal && !csum_buf)
{
assert(inmemory_meta);
auto & clean_db = clean_db_shard(op->oid);
auto clean_it = clean_db.find(op->oid);
assert(clean_it != clean_db.end());
csum_buf = get_clean_entry_bitmap(clean_it, 0);
}
verify_clean_padded_checksums(
op, rv[i].disk_offset, csum_buf, from_journal, iov, n_iov,
op, rv[i].disk_offset, rv[i].csum_buf, (rv[i].copy_flags & COPY_BUF_JOURNALED_BIG), iov, n_iov,
[&](uint32_t bad_block, uint32_t calc_csum, uint32_t stored_csum)
{
ok = false;
@@ -1059,7 +1019,7 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
*result_version = clean_it->second.version;
if (bitmap)
{
void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
memcpy(bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
return 0;

View File

@@ -57,7 +57,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
version = clean_it->second.version + 1;
if (!is_del)
{
void *bmp_ptr = get_clean_entry_bitmap(clean_it, dsk.clean_entry_bitmap_size);
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
memcpy(dyn_ptr, bmp_ptr, dsk.clean_entry_bitmap_size);
}
}
@@ -341,7 +341,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
cancel_all_writes(op, dirty_it, -ENOSPC);
return 2;
}
if (inmemory_meta && dsk.meta_format != BLOCKSTORE_META_FORMAT_HEAP)
if (inmemory_meta)
{
// Check once more that metadata entry is zeroed (the reverse means a bug or corruption)
uint64_t sector = (loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;

View File

@@ -188,7 +188,7 @@ void osd_messenger_t::init()
auto cl = cl_it->second;
cl_it++;
auto peer_fd = cl->peer_fd;
if (!cl->osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
if (!cl->osd_num && !cl->in_osd_num || cl->peer_state != PEER_CONNECTED && cl->peer_state != PEER_RDMA)
{
// Do not run keepalive on regular clients
continue;
@@ -199,7 +199,7 @@ void osd_messenger_t::init()
if (!cl->ping_time_remaining)
{
// Ping timed out, stop the client
fprintf(stderr, "Ping timed out for OSD %ju (client %d), disconnecting peer\n", cl->osd_num, cl->peer_fd);
fprintf(stderr, "Ping timed out for OSD %ju (client %d), disconnecting peer\n", cl->in_osd_num ? cl->in_osd_num : cl->osd_num, cl->peer_fd);
stop_client(peer_fd, true);
// Restart iterator because it may be invalidated
cl_it = clients.upper_bound(peer_fd);
@@ -230,7 +230,7 @@ void osd_messenger_t::init()
return;
}
int fail_fd = (op->reply.hdr.retval != 0 ? op->peer_fd : -1);
auto fail_osd_num = cl->osd_num;
auto fail_osd_num = cl->in_osd_num ? cl->in_osd_num : cl->osd_num;
cl->ping_time_remaining = 0;
delete op;
if (fail_fd >= 0)

View File

@@ -60,6 +60,7 @@ struct osd_client_t
int ping_time_remaining = 0;
int idle_time_remaining = 0;
osd_num_t osd_num = 0;
osd_num_t in_osd_num = 0;
bool is_incoming = false;
void *in_buf = NULL;
@@ -98,6 +99,7 @@ struct osd_client_t
std::vector<osd_op_t*> zc_free_list;
~osd_client_t();
void cancel_ops();
};
struct osd_wanted_peer_t
@@ -235,6 +237,7 @@ public:
void outbox_push(osd_op_t *cur_op);
std::function<void(osd_op_t*)> exec_op;
std::function<void(osd_num_t)> repeer_pgs;
std::function<void(osd_num_t)> break_pg_locks;
std::function<bool(osd_client_t*, json11::Json)> check_config_hook;
void read_requests();
void send_replies();

View File

@@ -173,6 +173,7 @@ struct osd_op_t
osd_op_buf_list_t iov;
~osd_op_t();
void cancel();
bool is_recovery_related();
};

View File

@@ -510,13 +510,12 @@ void osd_messenger_t::rdmacm_established(rdma_cm_event *ev)
rc->qp = conn->cmid->qp;
// And an osd_client_t
auto cl = new osd_client_t();
cl->is_incoming = true;
cl->peer_addr = conn->parsed_addr;
cl->peer_port = conn->rdmacm_port;
cl->peer_fd = conn->peer_fd;
cl->peer_state = PEER_RDMA;
cl->connect_timeout_id = -1;
cl->osd_num = peer_osd;
cl->in_osd_num = peer_osd;
cl->in_buf = malloc_or_die(receive_buffer_size);
cl->rdma_conn = rc;
clients[conn->peer_fd] = cl;

View File

@@ -8,11 +8,12 @@ void osd_messenger_t::read_requests()
for (int i = 0; i < read_ready_clients.size(); i++)
{
int peer_fd = read_ready_clients[i];
osd_client_t *cl = clients[peer_fd];
if (cl->read_msg.msg_iovlen)
auto cl_it = clients.find(peer_fd);
if (cl_it == clients.end() || !cl_it->second || cl_it->second->read_msg.msg_iovlen)
{
continue;
}
auto cl = cl_it->second;
if (cl->read_remaining < receive_buffer_size)
{
cl->read_iov.iov_base = cl->in_buf;
@@ -33,8 +34,12 @@ void osd_messenger_t::read_requests()
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
io_uring_sqe sqe_local;
ring_data_t data_local;
sqe_local.user_data = (uint64_t)&data_local;
io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
if (iothread)
{
sqe_local = { .user_data = (uint64_t)&data_local };
data_local = {};
}
if (!sqe)
{
cl->read_msg.msg_iovlen = 0;
@@ -56,7 +61,8 @@ void osd_messenger_t::read_requests()
{
result = -errno;
}
handle_read(result, cl);
// like set_immediate
tfd->set_timer_us(0, false, [this, result, cl](int){ handle_read(result, cl); });
}
}
read_ready_clients.clear();
@@ -228,7 +234,7 @@ bool osd_messenger_t::handle_finished_read(osd_client_t *cl)
{
if (cl->read_op->req.hdr.id != cl->read_op_id)
{
fprintf(stderr, "Warning: operation sequencing is broken on client %d, stopping client\n", cl->peer_fd);
fprintf(stderr, "Warning: operation sequencing is broken on client %d: expected num %ju, got %ju, stopping client\n", cl->peer_fd, cl->read_op_id, cl->read_op->req.hdr.id);
stop_client(cl->peer_fd);
return false;
}

View File

@@ -194,12 +194,14 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
auto iothread = iothreads.size() ? iothreads[peer_fd % iothreads.size()] : NULL;
io_uring_sqe sqe_local;
ring_data_t data_local;
sqe_local.user_data = (uint64_t)&data_local;
io_uring_sqe* sqe = (iothread ? &sqe_local : ringloop->get_sqe());
if (!sqe)
if (iothread)
{
return false;
sqe_local = { .user_data = (uint64_t)&data_local };
data_local = {};
}
if (!sqe)
return false;
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size() < IOV_MAX ? cl->send_list.size() : IOV_MAX;
cl->refs++;
@@ -237,7 +239,8 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
{
result = -errno;
}
handle_send(result, false, false, cl);
// like set_immediate
tfd->set_timer_us(0, false, [this, result, cl](int){ handle_send(result, false, false, cl); });
}
return true;
}

View File

@@ -9,38 +9,37 @@
#include "msgr_rdma.h"
#endif
void osd_messenger_t::cancel_osd_ops(osd_client_t *cl)
void osd_client_t::cancel_ops()
{
std::vector<osd_op_t*> cancel_ops;
cancel_ops.resize(cl->sent_ops.size());
cancel_ops.resize(sent_ops.size());
int i = 0;
for (auto p: cl->sent_ops)
for (auto p: sent_ops)
{
cancel_ops[i++] = p.second;
}
cl->sent_ops.clear();
cl->outbox.clear();
sent_ops.clear();
for (auto op: cancel_ops)
{
cancel_op(op);
op->cancel();
}
}
void osd_messenger_t::cancel_op(osd_op_t *op)
void osd_op_t::cancel()
{
if (op->op_type == OSD_OP_OUT)
if (op_type == OSD_OP_OUT && callback)
{
op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
op->reply.hdr.id = op->req.hdr.id;
op->reply.hdr.opcode = op->req.hdr.opcode;
op->reply.hdr.retval = -EPIPE;
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
reply.hdr.id = req.hdr.id;
reply.hdr.opcode = req.hdr.opcode;
reply.hdr.retval = -EPIPE;
// Copy lambda to be unaffected by `delete this`
(std::function<void(osd_op_t*)>(callback))(this);
}
else
{
// This function is only called in stop_client(), so it's fine to destroy the operation
delete op;
delete this;
}
}
@@ -63,6 +62,10 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
{
fprintf(stderr, "[OSD %ju] Stopping client %d (OSD peer %ju)\n", osd_num, peer_fd, cl->osd_num);
}
else if (cl->in_osd_num)
{
fprintf(stderr, "[OSD %ju] Stopping client %d (incoming OSD peer %ju)\n", osd_num, peer_fd, cl->in_osd_num);
}
else
{
fprintf(stderr, "[OSD %ju] Stopping client %d (regular client)\n", osd_num, peer_fd);
@@ -73,8 +76,12 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
cl->peer_state = PEER_STOPPED;
if (cl->osd_num)
{
// ...and forget OSD peer
osd_peer_fds.erase(cl->osd_num);
auto osd_it = osd_peer_fds.find(cl->osd_num);
if (osd_it != osd_peer_fds.end() && osd_it->second == cl->peer_fd)
{
// ...and forget OSD peer
osd_peer_fds.erase(osd_it);
}
}
#ifndef __MOCK__
// Then remove FD from the eventloop so we don't accidentally read something
@@ -101,30 +108,17 @@ void osd_messenger_t::stop_client(int peer_fd, bool force, bool force_delete)
}
}
#endif
if (cl->in_osd_num && break_pg_locks)
{
// Break PG locks
break_pg_locks(cl->in_osd_num);
}
if (cl->osd_num)
{
// Then repeer PGs because cancel_op() callbacks can try to perform
// some actions and we need correct PG states to not do something silly
repeer_pgs(cl->osd_num);
}
// Then cancel all operations
if (cl->read_op)
{
if (!cl->read_op->callback)
{
delete cl->read_op;
}
else
{
cancel_op(cl->read_op);
}
cl->read_op = NULL;
}
if (cl->osd_num)
{
// Cancel outbound operations
cancel_osd_ops(cl);
}
// Find the item again because it can be invalidated at this point
it = clients.find(peer_fd);
if (it != clients.end())
@@ -149,6 +143,17 @@ osd_client_t::~osd_client_t()
close(peer_fd);
peer_fd = -1;
}
// Then cancel all operations
// Operations have to be canceled only after clearing all references to osd_client_t
// because otherwise their buffers may be still present in io_uring asynchronous requests
if (read_op)
{
// read_op may be an incoming op or a continued response for an outbound op
read_op->cancel();
read_op = NULL;
}
// Cancel outbound ops
cancel_ops();
#ifndef __MOCK__
#ifdef WITH_RDMA
if (rdma_conn)

View File

@@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: Vitastor
Description: Vitastor client library
Version: 2.2.0
Version: 2.2.2
Libs: -L${libdir} -lvitastor_client
Cflags: -I${includedir}

View File

@@ -70,7 +70,7 @@ struct rm_osd_t
{
if (parent->cli->st_cli.peer_states.find(osd_id) != parent->cli->st_cli.peer_states.end())
{
is_warning = true;
is_warning = !allow_up;
still_up.push_back(osd_id);
}
}

View File

@@ -85,6 +85,7 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
msgr.ringloop = this->ringloop;
msgr.exec_op = [this](osd_op_t *op) { exec_op(op); };
msgr.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
msgr.break_pg_locks = [this](osd_num_t peer_osd) { break_pg_locks(peer_osd); };
msgr.check_config_hook = [this](osd_client_t *cl, json11::Json conf) { return check_peer_config(cl, conf); };
msgr.init();

View File

@@ -278,6 +278,8 @@ class osd_t
void handle_peers();
bool check_peer_config(osd_client_t *cl, json11::Json conf);
void repeer_pgs(osd_num_t osd_num);
void repeer_pg(pg_t & pg);
void break_pg_locks(osd_num_t osd_num);
void start_pg_peering(pg_t & pg);
void drop_dirty_pg_connections(pool_pg_num_t pg);
void record_pg_lock(pg_t & pg, osd_num_t peer_osd, uint64_t pg_state);

View File

@@ -432,9 +432,16 @@ void osd_t::apply_pg_locks_localize_only()
}
auto & pool_cfg = pool_it->second;
auto & pg = pp.second;
auto old_disable_pg_locks = pg.disable_pg_locks;
pg.disable_pg_locks = pg_locks_localize_only &&
pool_cfg.scheme == POOL_SCHEME_REPLICATED &&
pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY;
(pool_cfg.scheme != POOL_SCHEME_REPLICATED ||
pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY);
if (!pg.disable_pg_locks && old_disable_pg_locks)
{
// Relock PG
printf("[PG %u/%u] Repeer to enable PG locks\n", pg.pool_id, pg.pg_num);
repeer_pg(pg);
}
}
}
@@ -877,8 +884,8 @@ void osd_t::apply_pg_config()
pg.next_scrub = pg_cfg.next_scrub;
pg.target_set = pg_cfg.target_set;
pg.disable_pg_locks = pg_locks_localize_only &&
pool_item.second.scheme == POOL_SCHEME_REPLICATED &&
pool_item.second.local_reads == POOL_LOCAL_READ_PRIMARY;
(pool_item.second.scheme != POOL_SCHEME_REPLICATED ||
pool_item.second.local_reads == POOL_LOCAL_READ_PRIMARY);
if (pg.scheme == POOL_SCHEME_EC)
{
use_ec(pg.pg_size, pg.pg_data_size, true);
@@ -1044,8 +1051,15 @@ void osd_t::report_pg_states()
etcd_reporting_pg_state = true;
st_cli.etcd_txn(json11::Json::object {
{ "compare", checks }, { "success", success }, { "failure", failure }
}, st_cli.etcd_quick_timeout, 0, 0, [this, reporting_pgs](std::string err, json11::Json data)
}, st_cli.etcd_quick_timeout, 0, 0, [this, reporting_pgs, success_count = success.size(), failure_count = failure.size()](std::string err, json11::Json data)
{
int expected_count = (data["succeeded"].bool_value() ? success_count : failure_count);
if (expected_count != data["responses"].array_items().size())
{
printf("Unexpected response from etcd - 'responses' count (%u) isn't equal to expected (%u), stopping\n",
data["responses"].array_items().size(), expected_count);
force_stop(1);
}
etcd_reporting_pg_state = false;
if (!data["succeeded"].bool_value())
{

View File

@@ -73,18 +73,25 @@ void osd_t::handle_peers()
}
}
void osd_t::break_pg_locks(osd_num_t peer_osd)
{
for (auto lock_it = pg_locks.begin(); lock_it != pg_locks.end(); )
{
if (lock_it->second.primary_osd == peer_osd)
{
if (log_level > 3)
{
printf("Break PG %u/%u lock on disconnection of OSD %ju\n", lock_it->first.pool_id, lock_it->first.pg_num, peer_osd);
}
pg_locks.erase(lock_it++);
}
else
lock_it++;
}
}
void osd_t::repeer_pgs(osd_num_t peer_osd)
{
if (msgr.osd_peer_fds.find(peer_osd) == msgr.osd_peer_fds.end())
{
for (auto lock_it = pg_locks.begin(); lock_it != pg_locks.end(); )
{
if (lock_it->second.primary_osd == peer_osd)
pg_locks.erase(lock_it++);
else
lock_it++;
}
}
// Re-peer affected PGs
for (auto & p: pgs)
{
@@ -104,21 +111,26 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
{
// Repeer this pg
printf("[PG %u/%u] Repeer because of OSD %ju\n", pg.pool_id, pg.pg_num, peer_osd);
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.can_repeer())
{
start_pg_peering(pg);
}
else
{
// Stop accepting new operations, wait for current ones to finish or fail
pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
report_pg_state(pg);
}
repeer_pg(pg);
}
}
}
}
void osd_t::repeer_pg(pg_t & pg)
{
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.can_repeer())
{
start_pg_peering(pg);
}
else
{
// Stop accepting new operations, wait for current ones to finish or fail
pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
report_pg_state(pg);
}
}
// Reset PG state (when peering or stopping)
void osd_t::reset_pg(pg_t & pg)
{
@@ -466,6 +478,7 @@ void osd_t::relock_pg(pg_t & pg)
auto pg_it = pgs.find(pg_id);
if (pg_it == pgs.end())
{
printf("Warning: PG %u/%u is gone during lock attempt\n", pg_id.pool_id, pg_id.pg_num);
return;
}
auto & pg = pg_it->second;

View File

@@ -417,15 +417,17 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
if (retval != expected)
{
int64_t peer_osd = (msgr.clients.find(subop->peer_fd) != msgr.clients.end()
? msgr.clients[subop->peer_fd]->osd_num : -subop->peer_fd);
? msgr.clients[subop->peer_fd]->osd_num : 0);
if (opcode == OSD_OP_SEC_READ || opcode == OSD_OP_SEC_WRITE || opcode == OSD_OP_SEC_WRITE_STABLE)
{
printf(
subop->peer_fd >= 0
? "%1$s subop to %2$jx:%3$jx v%4$ju failed on osd %7$jd: retval = %5$d (expected %6$d)\n"
? (peer_osd > 0
? "%1$s subop to %2$jx:%3$jx v%4$ju failed on osd %7$ju: retval = %5$d (expected %6$d)\n"
: "%1$s subop to %2$jx:%3$jx v%4$ju failed on peer %8$d: retval = %5$d (expected %6$d)\n")
: "%1$s subop to %2$jx:%3$jx v%4$ju failed locally: retval = %5$d (expected %6$d)\n",
osd_op_names[opcode], subop->req.sec_rw.oid.inode, subop->req.sec_rw.oid.stripe, subop->req.sec_rw.version,
retval, expected, peer_osd
retval, expected, peer_osd, subop->peer_fd
);
}
else if (opcode == OSD_OP_SEC_DELETE)

View File

@@ -91,16 +91,17 @@ bool osd_t::sec_check_pg_lock(osd_num_t primary_osd, const object_id &oid)
{
return false;
}
auto & pool_cfg = pool_cfg_it->second;
if (pg_locks_localize_only && (pool_cfg.scheme != POOL_SCHEME_REPLICATED || pool_cfg.local_reads == POOL_LOCAL_READ_PRIMARY))
{
return true;
}
auto ppg = (pool_pg_num_t){ .pool_id = pool_id, .pg_num = map_to_pg(oid, pool_cfg_it->second.pg_stripe_size) };
auto pg_it = pgs.find(ppg);
if (pg_it != pgs.end() && pg_it->second.state != PG_OFFLINE)
{
return false;
}
if (pg_it->second.disable_pg_locks)
{
return true;
}
auto lock_it = pg_locks.find(ppg);
return lock_it != pg_locks.end() && lock_it->second.primary_osd == primary_osd;
}
@@ -140,7 +141,7 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
{
if (!(cur_op->req.sec_rw.flags & OSD_OP_IGNORE_PG_LOCK) &&
!sec_check_pg_lock(cl->osd_num, cur_op->req.sec_rw.oid))
!sec_check_pg_lock(cl->in_osd_num, cur_op->req.sec_rw.oid))
{
cur_op->bs_op->retval = -EPIPE;
secondary_op_callback(cur_op);
@@ -169,7 +170,7 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_DELETE)
{
if (!(cur_op->req.sec_del.flags & OSD_OP_IGNORE_PG_LOCK) &&
!sec_check_pg_lock(cl->osd_num, cur_op->req.sec_del.oid))
!sec_check_pg_lock(cl->in_osd_num, cur_op->req.sec_del.oid))
{
cur_op->bs_op->retval = -EPIPE;
secondary_op_callback(cur_op);
@@ -193,7 +194,7 @@ void osd_t::exec_secondary_real(osd_op_t *cur_op)
{
for (int i = 0; i < cur_op->bs_op->len; i++)
{
if (!sec_check_pg_lock(cl->osd_num, ((obj_ver_id*)cur_op->buf)[i].oid))
if (!sec_check_pg_lock(cl->in_osd_num, ((obj_ver_id*)cur_op->buf)[i].oid))
{
cur_op->bs_op->retval = -EPIPE;
secondary_op_callback(cur_op);
@@ -247,7 +248,7 @@ void osd_t::exec_sec_read_bmp(osd_op_t *cur_op)
void *cur_buf = reply_buf;
for (int i = 0; i < n; i++)
{
if (!sec_check_pg_lock(cl->osd_num, ov[i].oid) &&
if (!sec_check_pg_lock(cl->in_osd_num, ov[i].oid) &&
!(cur_op->req.sec_read_bmp.flags & OSD_OP_IGNORE_PG_LOCK))
{
free(reply_buf);
@@ -269,7 +270,7 @@ void osd_t::exec_sec_lock(osd_op_t *cur_op)
{
cur_op->reply.sec_lock.cur_primary = 0;
auto cl = msgr.clients.at(cur_op->peer_fd);
if (!cl->osd_num ||
if (!cl->in_osd_num ||
cur_op->req.sec_lock.flags != OSD_SEC_LOCK_PG &&
cur_op->req.sec_lock.flags != OSD_SEC_UNLOCK_PG ||
cur_op->req.sec_lock.pool_id > ((uint64_t)1<<POOL_ID_BITS) ||
@@ -290,7 +291,7 @@ void osd_t::exec_sec_lock(osd_op_t *cur_op)
auto lock_it = pg_locks.find(ppg);
if (cur_op->req.sec_lock.flags == OSD_SEC_LOCK_PG)
{
if (lock_it != pg_locks.end() && lock_it->second.primary_osd != cl->osd_num)
if (lock_it != pg_locks.end() && lock_it->second.primary_osd != cl->in_osd_num)
{
cur_op->reply.sec_lock.cur_primary = lock_it->second.primary_osd;
finish_op(cur_op, -EBUSY);
@@ -303,13 +304,21 @@ void osd_t::exec_sec_lock(osd_op_t *cur_op)
finish_op(cur_op, -EBUSY);
return;
}
if (log_level > 3)
{
printf("Lock PG %u/%u for OSD %ju\n", ppg.pool_id, ppg.pg_num, cl->in_osd_num);
}
pg_locks[ppg] = (osd_pg_lock_t){
.primary_osd = cl->osd_num,
.primary_osd = cl->in_osd_num,
.state = cur_op->req.sec_lock.pg_state,
};
}
else if (lock_it != pg_locks.end() && lock_it->second.primary_osd == cl->osd_num)
else if (lock_it != pg_locks.end() && lock_it->second.primary_osd == cl->in_osd_num)
{
if (log_level > 3)
{
printf("Unlock PG %u/%u by OSD %ju\n", ppg.pool_id, ppg.pg_num, cl->in_osd_num);
}
pg_locks.erase(lock_it);
}
finish_op(cur_op, 0);
@@ -323,7 +332,7 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
: json11::Json();
auto peer_osd_num = req_json["osd_num"].uint64_value();
auto cl = msgr.clients.at(cur_op->peer_fd);
cl->osd_num = peer_osd_num;
cl->in_osd_num = peer_osd_num;
if (req_json["features"]["check_sequencing"].bool_value())
{
cl->check_sequencing = true;

View File

@@ -121,6 +121,7 @@ void pretend_connected(cluster_client_t *cli, osd_num_t osd_num)
cli->msgr.osd_peer_fds[osd_num] = peer_fd;
cli->msgr.clients[peer_fd] = new osd_client_t();
cli->msgr.clients[peer_fd]->osd_num = osd_num;
cli->msgr.clients[peer_fd]->peer_fd = peer_fd;
cli->msgr.clients[peer_fd]->peer_state = PEER_CONNECTED;
cli->msgr.wanted_peers.erase(osd_num);
cli->msgr.repeer_pgs(osd_num);

View File

@@ -125,6 +125,8 @@ void ring_loop_t::loop()
if (cqe->flags & IORING_CQE_F_MORE)
{
// There will be a second notification
if (mt)
mu.unlock();
d->res = cqe->res;
d->more = true;
if (d->callback)

View File

@@ -59,6 +59,7 @@ SCHEME=ec IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
./test_write.sh
SCHEME=xor ./test_write.sh
TEST_NAME=iothreads GLOBAL_CONFIG=',"client_iothread_count":4' ./test_write.sh
./test_write_no_same.sh