From 5d9d6f32a05c1027a06865701a67a109fd2720ec Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 13 Jan 2024 01:29:22 +0300 Subject: [PATCH 01/33] Fix common realloc memory leak mistakes found by cppcheck --- src/blockstore_impl.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/blockstore_impl.cpp b/src/blockstore_impl.cpp index d8ccff9c..2178395d 100644 --- a/src/blockstore_impl.cpp +++ b/src/blockstore_impl.cpp @@ -558,13 +558,14 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) if (stable_count >= stable_alloc) { stable_alloc *= 2; - stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); - if (!stable) + obj_ver_id* nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); + if (!nst) { op->retval = -ENOMEM; FINISH_OP(op); return; } + stable = nst; } stable[stable_count++] = { .oid = clean_it->first, @@ -642,8 +643,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) if (stable_count >= stable_alloc) { stable_alloc += 32768; - stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); - if (!stable) + obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); + if (!nst) { if (unstable) free(unstable); @@ -651,6 +652,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) FINISH_OP(op); return; } + stable = nst; } stable[stable_count++] = dirty_it->first; } @@ -666,8 +668,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) if (unstable_count >= unstable_alloc) { unstable_alloc += 32768; - unstable = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc); - if (!unstable) + obj_ver_id *nst = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc); + if (!nst) { if (stable) free(stable); @@ -675,6 +677,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) FINISH_OP(op); return; } + unstable = nst; } unstable[unstable_count++] = dirty_it->first; } @@ -694,8 +697,8 @@ 
void blockstore_impl_t::process_list(blockstore_op_t *op) if (stable_count+unstable_count > stable_alloc) { stable_alloc = stable_count+unstable_count; - stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); - if (!stable) + obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); + if (!nst) { if (unstable) free(unstable); @@ -703,6 +706,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) FINISH_OP(op); return; } + stable = nst; } // Copy unstable entries for (int i = 0; i < unstable_count; i++) -- 2.30.2 From d00d4dbac079384c91561be6c652f79c2d57df1a Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 13 Jan 2024 01:29:55 +0300 Subject: [PATCH 02/33] Initialize mod_revision field in etcd_state_client --- src/etcd_state_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/etcd_state_client.h b/src/etcd_state_client.h index bf7b18e5..b081d848 100644 --- a/src/etcd_state_client.h +++ b/src/etcd_state_client.h @@ -28,7 +28,7 @@ struct etcd_kv_t { std::string key; json11::Json value; - uint64_t mod_revision; + uint64_t mod_revision = 0; }; struct pg_config_t -- 2.30.2 From 5935640a4a64b2bf6903d71b43fc2da91c1da4e6 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 14 Jan 2024 16:45:18 +0300 Subject: [PATCH 03/33] Add CLA PR form --- CLA-en.md | 4 ++-- pull_request_template.yml | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 pull_request_template.yml diff --git a/CLA-en.md b/CLA-en.md index 0407b56e..b912f06f 100644 --- a/CLA-en.md +++ b/CLA-en.md @@ -38,7 +38,7 @@ in the offer. on behalf of third parties, including on behalf of his employer. 2. Subject of the Agreement. \ - 2.1. Subject of the Agreement shall be the Contributions sent to the Author by Contributors. + 2.1. Subject of the Agreement shall be the Contributions sent to the Author by Contributors. \ 2.2. 
The Contributor grants to the Author the right to use Contributions at his own discretion and without any necessity to get a prior approval from Contributor or any other third party in any way, under a simple (non-exclusive), royalty-free, @@ -86,7 +86,7 @@ in the offer. of their provision to the Author. \ 5.2. The Contributor represents and warrants that he legally owns exclusive intellectual property rights to the Contributions. \ - 5.3. The Contributor represents and warrants that any further use of \ + 5.3. The Contributor represents and warrants that any further use of Contributions by the Author as provided by Contributor under the terms of the Agreement does not infringe on intellectual and other rights and legitimate interests of third parties. \ diff --git a/pull_request_template.yml b/pull_request_template.yml new file mode 100644 index 00000000..bc25a0fe --- /dev/null +++ b/pull_request_template.yml @@ -0,0 +1,28 @@ +name: Pull Request +about: Submit a pull request +body: + - type: textarea + id: description + attributes: + label: Description + description: Describe your pull request + placeholder: "" + value: "" + validations: + required: true + - type: input + id: author + attributes: + label: Contributor Name + description: Contributor Name or Company Details if the Contributor is a company + placeholder: "" + validations: + required: false + - type: checkboxes + id: terms + attributes: + label: CLA + description: By submitting this pull request, I accept [Vitastor CLA](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md) + options: + - label: "I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md" + required: true -- 2.30.2 From 2cf649eba6d2ec837eb9dca7c91fc86f7712bec8 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Mon, 15 Jan 2024 03:04:33 +0300 Subject: [PATCH 04/33] Fix "partly outside array bounds" warnings for GCC 12 in cpp-btree --- cpp-btree | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/cpp-btree b/cpp-btree index 45e6d1f1..8de8b467 160000 --- a/cpp-btree +++ b/cpp-btree @@ -1 +1 @@ -Subproject commit 45e6d1f13196a0824e2089a586c53b9de0283f17 +Subproject commit 8de8b467acbca50cfd8835c20e0e379110f3b32b -- 2.30.2 From 3ca3b8a8d8efd00d9ebe010d53024f9f41c34f2f Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Tue, 16 Jan 2024 23:49:21 +0300 Subject: [PATCH 05/33] Fix recheck_pgs bug introduced in 1.4.0 --- mon/mon.js | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mon/mon.js b/mon/mon.js index 56e6a573..0e852c30 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -1414,7 +1414,14 @@ class Mon } if (changed) { - await this.save_pg_config(new_config_pgs); + const ok = await this.save_pg_config(new_config_pgs); + if (ok) + console.log('PG configuration successfully changed'); + else + { + console.log('Someone changed PG configuration while we also tried to change it. Retrying in '+this.config.mon_change_timeout+' ms'); + this.schedule_recheck(); + } } } this.recheck_pgs_active = false; @@ -1495,6 +1502,11 @@ class Mon this.save_new_pgs_txn(new_config_pgs, etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, pool_res.pgs, pg_history); } new_config_pgs.hash = tree_hash; + return await this.save_pg_config(new_config_pgs, etcd_request); + } + + async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] }) + { etcd_request.compare.push( { key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id }, { key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' }, -- 2.30.2 From 2aa5aa7ab6a845e8b498df1d01cc8f0373f45278 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Wed, 17 Jan 2024 00:16:56 +0300 Subject: [PATCH 06/33] Add a test for simple master switching without PG reconfiguration Also use osd_out_time:1 only in select tests and restart mon in tests only on connection errors 
--- .gitea/workflows/test.yml | 18 ++++++++++++++++++ mon/mon.js | 13 +++++++------ tests/run_3osds.sh | 7 ++++--- tests/run_tests.sh | 2 ++ tests/test_add_osd.sh | 2 +- tests/test_heal.sh | 1 + tests/test_minsize_1.sh | 1 + tests/test_splitbrain.sh | 1 + tests/test_switch_primary.sh | 18 ++++++++++++++++++ 9 files changed, 53 insertions(+), 10 deletions(-) create mode 100755 tests/test_switch_primary.sh diff --git a/.gitea/workflows/test.yml b/.gitea/workflows/test.yml index 421089ed..9bb0816a 100644 --- a/.gitea/workflows/test.yml +++ b/.gitea/workflows/test.yml @@ -532,6 +532,24 @@ jobs: echo "" done + test_switch_primary: + runs-on: ubuntu-latest + needs: build + container: ${{env.TEST_IMAGE}}:${{github.sha}} + steps: + - name: Run test + id: test + timeout-minutes: 3 + run: /root/vitastor/tests/test_switch_primary.sh + - name: Print logs + if: always() && steps.test.outcome == 'failure' + run: | + for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do + echo "-------- $i --------" + cat $i + echo "" + done + test_write: runs-on: ubuntu-latest needs: build diff --git a/mon/mon.js b/mon/mon.js index 0e852c30..4eb7a763 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -390,7 +390,8 @@ class Mon { constructor(config) { - this.die = (e) => this._die(e); + this.failconnect = (e) => this._die(e, 2); + this.die = (e) => this._die(e, 1); if (fs.existsSync(config.config_path||'/etc/vitastor/vitastor.conf')) { config = { @@ -604,7 +605,7 @@ class Mon } if (!this.ws) { - this.die('Failed to open etcd watch websocket'); + this.failconnect('Failed to open etcd watch websocket'); } const cur_addr = this.selected_etcd_url; this.ws_alive = true; @@ -791,7 +792,7 @@ class Mon const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries); if (!res.result.TTL) { - this.die('Lease expired'); + this.failconnect('Lease expired'); } }, this.config.etcd_mon_timeout); if (!this.signals_set) @@ 
-1997,14 +1998,14 @@ class Mon return res.json; } } - this.die(); + this.failconnect(); } - _die(err) + _die(err, code) { // In fact we can just try to rejoin console.error(new Error(err || 'Cluster connection failed')); - process.exit(1); + process.exit(code || 2); } local_ips(all) diff --git a/tests/run_3osds.sh b/tests/run_3osds.sh index 5d54676e..2f6073eb 100644 --- a/tests/run_3osds.sh +++ b/tests/run_3osds.sh @@ -10,6 +10,7 @@ SCHEME=${SCHEME:-replicated} # OFFSET_ARGS # PG_SIZE # PG_MINSIZE +# GLOBAL_CONFIG if [ "$SCHEME" = "ec" ]; then OSD_COUNT=${OSD_COUNT:-5} @@ -19,10 +20,10 @@ fi if [ "$IMMEDIATE_COMMIT" != "" ]; then NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10 --etcd_stats_interval 5" - $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"osd_out_time":1,"immediate_commit":"all","client_enable_writeback":true}' + $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"immediate_commit":"all","client_enable_writeback":true,"client_max_writeback_iodepth":32'$GLOBAL_CONFIG'}' else NO_SAME="--journal_sector_buffer_count 1024 --log_level 10 --etcd_stats_interval 5" - $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"osd_out_time":1,"client_enable_writeback":true}' + $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"client_enable_writeback":true,"client_max_writeback_iodepth":32'$GLOBAL_CONFIG'}' fi start_osd_on() @@ -53,7 +54,7 @@ for i in $(seq 1 $OSD_COUNT); do start_osd $i done -(while true; do node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 || true; done) >>./testdata/mon.log 2>&1 & +(while true; do set +e; node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1; if [[ $? 
-ne 2 ]]; then break; fi; done) >>./testdata/mon.log 2>&1 & MON_PID=$! if [ "$SCHEME" = "ec" ]; then diff --git a/tests/run_tests.sh b/tests/run_tests.sh index c93058bb..fcf079ec 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -45,6 +45,8 @@ IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh SCHEME=ec ./test_rebalance_verify.sh SCHEME=ec IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh +./test_switch_primary.sh + ./test_write.sh SCHEME=xor ./test_write.sh diff --git a/tests/test_add_osd.sh b/tests/test_add_osd.sh index de75ed68..42fa996c 100755 --- a/tests/test_add_osd.sh +++ b/tests/test_add_osd.sh @@ -1,7 +1,7 @@ #!/bin/bash -ex PG_COUNT=2048 - +GLOBAL_CONFIG=',"osd_out_time":1' . `dirname $0`/run_3osds.sh LD_PRELOAD="build/src/libfio_vitastor.so" \ diff --git a/tests/test_heal.sh b/tests/test_heal.sh index cf4efd1b..3604b295 100755 --- a/tests/test_heal.sh +++ b/tests/test_heal.sh @@ -9,6 +9,7 @@ if [[ "$SCHEME" = "ec" ]]; then fi OSD_COUNT=${OSD_COUNT:-7} PG_COUNT=32 +GLOBAL_CONFIG=',"osd_out_time":1' . `dirname $0`/run_3osds.sh check_qemu diff --git a/tests/test_minsize_1.sh b/tests/test_minsize_1.sh index 2569bd64..51ef2ab9 100755 --- a/tests/test_minsize_1.sh +++ b/tests/test_minsize_1.sh @@ -2,6 +2,7 @@ PG_MINSIZE=1 SCHEME=replicated +GLOBAL_CONFIG=',"osd_out_time":1' . `dirname $0`/run_3osds.sh diff --git a/tests/test_splitbrain.sh b/tests/test_splitbrain.sh index 9c637d2b..19cf0f49 100755 --- a/tests/test_splitbrain.sh +++ b/tests/test_splitbrain.sh @@ -4,6 +4,7 @@ OSD_COUNT=2 PG_SIZE=2 PG_MINSIZE=1 SCHEME=replicated +GLOBAL_CONFIG=',"osd_out_time":1' . `dirname $0`/run_3osds.sh diff --git a/tests/test_switch_primary.sh b/tests/test_switch_primary.sh new file mode 100755 index 00000000..78dcde78 --- /dev/null +++ b/tests/test_switch_primary.sh @@ -0,0 +1,18 @@ +#!/bin/bash -ex + +. 
`dirname $0`/run_3osds.sh + +primary=$($ETCDCTL get --print-value-only /vitastor/config/pgs | jq -r '.items["1"]["1"].primary') +primary_pid=OSD${primary}_PID +kill -9 ${!primary_pid} + +sleep 15 +wait_condition 10 "$ETCDCTL get --print-value-only /vitastor/config/pgs | jq -s -e '.[0].items[\"1\"][\"1\"].primary != \"$primary\"'" + +newprim=$($ETCDCTL get --print-value-only /vitastor/config/pgs | jq -r '.items["1"]["1"].primary') + +if [ "$newprim" = "$primary" ]; then + format_error Primary not switched +fi + +format_green OK -- 2.30.2 From 80aac39513f8392388492f3330095fe6a26d6526 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 18 Jan 2024 00:36:32 +0300 Subject: [PATCH 07/33] Add detailed formula for theoretical EC N+K random write performance --- docs/performance/theoretical.en.md | 21 ++++++++++++++------- docs/performance/theoretical.ru.md | 21 ++++++++++++++------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/docs/performance/theoretical.en.md b/docs/performance/theoretical.en.md index 043bd7ea..b85d0a43 100644 --- a/docs/performance/theoretical.en.md +++ b/docs/performance/theoretical.en.md @@ -11,19 +11,26 @@ Replicated setups: - Single-threaded write+fsync latency: - With immediate commit: 2 network roundtrips + 1 disk write. - With lazy commit: 4 network roundtrips + 1 disk write + 1 disk flush. -- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)). -- Saturated parallel write iops: min(network bandwidth, sum(disk write iops / number of replicas / write amplification)). +- Linear read: `min(total network bandwidth, sum(disk read MB/s))`. +- Linear write: `min(total network bandwidth, sum(disk write MB/s / number of replicas))`. +- Saturated parallel read iops: `min(total network bandwidth, sum(disk read iops))`. +- Saturated parallel write iops: `min(total network bandwidth / number of replicas, sum(disk write iops / number of replicas / (write amplification = 4)))`. 
-EC/XOR setups: +EC/XOR setups (EC N+K): - Single-threaded (T1Q1) read latency: 1.5 network roundtrips + 1 disk read. - Single-threaded write+fsync latency: - With immediate commit: 3.5 network roundtrips + 1 disk read + 2 disk writes. - With lazy commit: 5.5 network roundtrips + 1 disk read + 2 disk writes + 2 disk fsyncs. - - 0.5 in actually (k-1)/k which means that an additional roundtrip doesn't happen when + - 0.5 is actually `(N-1)/N` which means that an additional roundtrip doesn't happen when the read sub-operation can be served locally. -- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)). -- Saturated parallel write iops: min(network bandwidth, sum(disk write iops * number of data drives / (number of data + parity drives) / write amplification)). - In fact, you should put disk write iops under the condition of ~10% reads / ~90% writes in this formula. +- Linear read: `min(total network bandwidth, sum(disk read MB/s))`. +- Linear write: `min(total network bandwidth, sum(disk write MB/s * N/(N+K)))`. +- Saturated parallel read iops: `min(total network bandwidth, sum(disk read iops))`. +- Saturated parallel write iops: roughly `total iops / (N+K) / WA`. More exactly, + `min(total network bandwidth * N/(N+K), sum(disk randrw iops / (N*4 + K*5 + 1)))` with + random read/write mix corresponding to `(N-1)/(N*4 + K*5 + 1)*100 % reads`. + - For example, with EC 2+1 it is: `(7% randrw iops) / 14`. + - With EC 6+3 it is: `(12.5% randrw iops) / 40`. Write amplification for 4 KB blocks is usually 3-5 in Vitastor: 1. Journal block write diff --git a/docs/performance/theoretical.ru.md b/docs/performance/theoretical.ru.md index e32c27d3..d258545a 100644 --- a/docs/performance/theoretical.ru.md +++ b/docs/performance/theoretical.ru.md @@ -11,20 +11,27 @@ - Запись+fsync в 1 поток: - С мгновенным сбросом: 2 RTT + 1 запись. - С отложенным ("ленивым") сбросом: 4 RTT + 1 запись + 1 fsync. 
-- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше. -- Параллельная запись: сумма IOPS всех дисков / число реплик / WA либо производительность сети, если в сеть упрётся раньше. +- Линейное чтение: сумма МБ/с чтения всех дисков, либо общая производительность сети (сумма пропускной способности сети всех нод), если в сеть упрётся раньше. +- Линейная запись: сумма МБ/с записи всех дисков / число реплик, либо производительность сети / число реплик, если в сеть упрётся раньше. +- Параллельное случайное мелкое чтение: сумма IOPS чтения всех дисков, либо производительность сети, если в сеть упрётся раньше. +- Параллельная случайная мелкая запись: сумма IOPS записи всех дисков / число реплик / WA, либо производительность сети / число реплик, если в сеть упрётся раньше. -При использовании кодов коррекции ошибок (EC): +При использовании кодов коррекции ошибок (EC N+K): - Задержка чтения в 1 поток (T1Q1): 1.5 RTT + 1 чтение. - Запись+fsync в 1 поток: - С мгновенным сбросом: 3.5 RTT + 1 чтение + 2 записи. - С отложенным ("ленивым") сбросом: 5.5 RTT + 1 чтение + 2 записи + 2 fsync. -- Под 0.5 на самом деле подразумевается (k-1)/k, где k - число дисков данных, +- Под 0.5 на самом деле подразумевается (N-1)/N, где N - число дисков данных, что означает, что дополнительное обращение по сети не нужно, когда операция чтения обслуживается локально. -- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше. -- Параллельная запись: сумма IOPS всех дисков / общее число дисков данных и чётности / WA либо производительность сети, если в сеть упрётся раньше. - Примечание: IOPS дисков в данном случае надо брать в смешанном режиме чтения/записи в пропорции, аналогичной формулам выше. +- Линейное чтение: сумма МБ/с чтения всех дисков, либо общая производительность сети, если в сеть упрётся раньше. 
+- Линейная запись: сумма МБ/с записи всех дисков * N/(N+K), либо производительность сети * N / (N+K), если в сеть упрётся раньше. +- Параллельное случайное мелкое чтение: сумма IOPS чтения всех дисков либо производительность сети, если в сеть упрётся раньше. +- Параллельная случайная мелкая запись: грубо `(сумма IOPS / (N+K) / WA)`. Если точнее, то: + сумма смешанного IOPS всех дисков при `(N-1)/(N*4 + K*5 + 1)*100 %` чтения, делённая на `(N*4 + K*5 + 1)`. + Либо, производительность сети * N/(N+K), если в сеть упрётся раньше. + - Например, при EC 2+1 это: `(сумма IOPS при 7% чтения) / 14`. + - При EC 6+3 это: `(сумма IOPS при 12.5% чтения) / 40`. WA (мультипликатор записи) для 4 КБ блоков в Vitastor обычно составляет 3-5: 1. Запись метаданных в журнал -- 2.30.2 From ba55f91409bc5de7946dee2321e114600b9b15d6 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 18 Jan 2024 02:31:42 +0300 Subject: [PATCH 08/33] Release 1.4.1 - Fix a monitor crash on primary OSD switching introduced in 1.4.0 - Fix "partly outside array bounds" warnings for GCC 12 in cpp-btree - Fix a realloc memory leak in theory possible with too large listings (OSD_OP_LIST) --- CMakeLists.txt | 2 +- csi/Makefile | 2 +- csi/deploy/004-csi-nodeplugin.yaml | 2 +- csi/deploy/007-csi-provisioner.yaml | 2 +- csi/src/config.go | 2 +- debian/changelog | 2 +- debian/vitastor.Dockerfile | 8 ++++---- mon/package.json | 2 +- patches/cinder-vitastor.py | 2 +- rpm/build-tarball.sh | 2 +- rpm/vitastor-el7.Dockerfile | 2 +- rpm/vitastor-el7.spec | 4 ++-- rpm/vitastor-el8.Dockerfile | 2 +- rpm/vitastor-el8.spec | 4 ++-- rpm/vitastor-el9.Dockerfile | 2 +- rpm/vitastor-el9.spec | 4 ++-- src/CMakeLists.txt | 2 +- src/vitastor.pc.in | 2 +- 18 files changed, 24 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fa27825..71ab5180 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12) project(vitastor) -set(VERSION "1.4.0") +set(VERSION 
"1.4.1") add_subdirectory(src) diff --git a/csi/Makefile b/csi/Makefile index 86b563e4..82210c11 100644 --- a/csi/Makefile +++ b/csi/Makefile @@ -1,4 +1,4 @@ -VERSION ?= v1.4.0 +VERSION ?= v1.4.1 all: build push diff --git a/csi/deploy/004-csi-nodeplugin.yaml b/csi/deploy/004-csi-nodeplugin.yaml index b71c18c0..1addcd4c 100644 --- a/csi/deploy/004-csi-nodeplugin.yaml +++ b/csi/deploy/004-csi-nodeplugin.yaml @@ -49,7 +49,7 @@ spec: capabilities: add: ["SYS_ADMIN"] allowPrivilegeEscalation: true - image: vitalif/vitastor-csi:v1.4.0 + image: vitalif/vitastor-csi:v1.4.1 args: - "--node=$(NODE_ID)" - "--endpoint=$(CSI_ENDPOINT)" diff --git a/csi/deploy/007-csi-provisioner.yaml b/csi/deploy/007-csi-provisioner.yaml index c62791c3..e341722c 100644 --- a/csi/deploy/007-csi-provisioner.yaml +++ b/csi/deploy/007-csi-provisioner.yaml @@ -121,7 +121,7 @@ spec: privileged: true capabilities: add: ["SYS_ADMIN"] - image: vitalif/vitastor-csi:v1.4.0 + image: vitalif/vitastor-csi:v1.4.1 args: - "--node=$(NODE_ID)" - "--endpoint=$(CSI_ENDPOINT)" diff --git a/csi/src/config.go b/csi/src/config.go index a3ceeb20..c1a29711 100644 --- a/csi/src/config.go +++ b/csi/src/config.go @@ -5,7 +5,7 @@ package vitastor const ( vitastorCSIDriverName = "csi.vitastor.io" - vitastorCSIDriverVersion = "1.4.0" + vitastorCSIDriverVersion = "1.4.1" ) // Config struct fills the parameters of request or user input diff --git a/debian/changelog b/debian/changelog index 80884c55..b7216bb9 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -vitastor (1.4.0-1) unstable; urgency=medium +vitastor (1.4.1-1) unstable; urgency=medium * Bugfixes diff --git a/debian/vitastor.Dockerfile b/debian/vitastor.Dockerfile index 1d64949b..b1035604 100644 --- a/debian/vitastor.Dockerfile +++ b/debian/vitastor.Dockerfile @@ -35,8 +35,8 @@ RUN set -e -x; \ mkdir -p /root/packages/vitastor-$REL; \ rm -rf /root/packages/vitastor-$REL/*; \ cd /root/packages/vitastor-$REL; \ - cp -r /root/vitastor vitastor-1.4.0; 
\ - cd vitastor-1.4.0; \ + cp -r /root/vitastor vitastor-1.4.1; \ + cd vitastor-1.4.1; \ ln -s /root/fio-build/fio-*/ ./fio; \ FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \ @@ -49,8 +49,8 @@ RUN set -e -x; \ rm -rf a b; \ echo "dep:fio=$FIO" > debian/fio_version; \ cd /root/packages/vitastor-$REL; \ - tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.0.orig.tar.xz vitastor-1.4.0; \ - cd vitastor-1.4.0; \ + tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.1.orig.tar.xz vitastor-1.4.1; \ + cd vitastor-1.4.1; \ V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ DEBFULLNAME="Vitaliy Filippov " dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \ DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \ diff --git a/mon/package.json b/mon/package.json index eeeb36bc..73f44427 100644 --- a/mon/package.json +++ b/mon/package.json @@ -1,6 +1,6 @@ { "name": "vitastor-mon", - "version": "1.4.0", + "version": "1.4.1", "description": "Vitastor SDS monitor service", "main": "mon-main.js", "scripts": { diff --git a/patches/cinder-vitastor.py b/patches/cinder-vitastor.py index f69d0f85..30525d59 100644 --- a/patches/cinder-vitastor.py +++ b/patches/cinder-vitastor.py @@ -50,7 +50,7 @@ from cinder.volume import configuration from cinder.volume import driver from cinder.volume import volume_utils -VERSION = '1.4.0' +VERSION = '1.4.1' LOG = logging.getLogger(__name__) diff --git a/rpm/build-tarball.sh b/rpm/build-tarball.sh index 2068d9ca..3e97e2f1 100755 --- a/rpm/build-tarball.sh +++ b/rpm/build-tarball.sh @@ -24,4 +24,4 @@ rm fio mv fio-copy fio FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'` perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' 
$VITASTOR/rpm/vitastor-el$EL.spec -tar --transform 's#^#vitastor-1.4.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.0$(rpm --eval '%dist').tar.gz * +tar --transform 's#^#vitastor-1.4.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.1$(rpm --eval '%dist').tar.gz * diff --git a/rpm/vitastor-el7.Dockerfile b/rpm/vitastor-el7.Dockerfile index 957282aa..081eebea 100644 --- a/rpm/vitastor-el7.Dockerfile +++ b/rpm/vitastor-el7.Dockerfile @@ -36,7 +36,7 @@ ADD . /root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.0.el7.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.1.el7.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el7.spec b/rpm/vitastor-el7.spec index 594e5573..abf1240b 100644 --- a/rpm/vitastor-el7.spec +++ b/rpm/vitastor-el7.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.0 +Version: 1.4.1 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.0.el7.tar.gz +Source0: vitastor-1.4.1.el7.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/rpm/vitastor-el8.Dockerfile b/rpm/vitastor-el8.Dockerfile index c2db92dd..a03be469 100644 --- a/rpm/vitastor-el8.Dockerfile +++ b/rpm/vitastor-el8.Dockerfile @@ -35,7 +35,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.0.el8.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.1.el8.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el8.spec b/rpm/vitastor-el8.spec index d9ea9155..c11b841b 100644 --- a/rpm/vitastor-el8.spec +++ b/rpm/vitastor-el8.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.0 +Version: 1.4.1 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.0.el8.tar.gz +Source0: vitastor-1.4.1.el8.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/rpm/vitastor-el9.Dockerfile b/rpm/vitastor-el9.Dockerfile index f18620b9..d02ae827 100644 --- a/rpm/vitastor-el9.Dockerfile +++ b/rpm/vitastor-el9.Dockerfile @@ -18,7 +18,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.0.el9.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.1.el9.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el9.spec b/rpm/vitastor-el9.spec index f06301bb..96751661 100644 --- a/rpm/vitastor-el9.spec +++ b/rpm/vitastor-el9.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.0 +Version: 1.4.1 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.0.el9.tar.gz +Source0: vitastor-1.4.1.el9.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 69e286e4..4e1d7f52 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$") set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") endif() -add_definitions(-DVERSION="1.4.0") +add_definitions(-DVERSION="1.4.1") add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src) add_link_options(-fno-omit-frame-pointer) if (${WITH_ASAN}) diff --git a/src/vitastor.pc.in b/src/vitastor.pc.in index bc3d847d..e8ff5e3b 100644 --- a/src/vitastor.pc.in +++ b/src/vitastor.pc.in @@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ Name: Vitastor Description: Vitastor client library -Version: 1.4.0 +Version: 1.4.1 Libs: -L${libdir} -lvitastor_client Cflags: -I${includedir} -- 2.30.2 From d27524f441db475b78a939e86d7f330ad36ca338 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 25 Jan 2024 01:09:12 +0300 Subject: [PATCH 09/33] Add patch for libvirt 9.10 --- patches/libvirt-9.10-vitastor.diff | 643 
+++++++++++++++++++++++++++++ 1 file changed, 643 insertions(+) create mode 100644 patches/libvirt-9.10-vitastor.diff diff --git a/patches/libvirt-9.10-vitastor.diff b/patches/libvirt-9.10-vitastor.diff new file mode 100644 index 00000000..e4fbcdc0 --- /dev/null +++ b/patches/libvirt-9.10-vitastor.diff @@ -0,0 +1,643 @@ +commit c1cd026e211e94b120028e7c98a6e4ce5afe9846 +Author: Vitaliy Filippov +Date: Wed Jan 24 22:04:50 2024 +0300 + + Add Vitastor support + +diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h +index aaad4a3da1..5f5daa8341 100644 +--- a/include/libvirt/libvirt-storage.h ++++ b/include/libvirt/libvirt-storage.h +@@ -326,6 +326,7 @@ typedef enum { + VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */ + VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */ + VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */ ++ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */ + } virConnectListAllStoragePoolsFlags; + + int virConnectListAllStoragePools(virConnectPtr conn, +diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c +index 22ad43e1d7..56c81d6852 100644 +--- a/src/conf/domain_conf.c ++++ b/src/conf/domain_conf.c +@@ -7185,7 +7185,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node, + src->configFile = virXPathString("string(./config/@file)", ctxt); + + if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP || +- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS) ++ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS || ++ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR) + src->query = virXMLPropString(node, "query"); + + if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0) +@@ -30618,6 +30619,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src, + + case VIR_STORAGE_POOL_MPATH: + case VIR_STORAGE_POOL_RBD: ++ case VIR_STORAGE_POOL_VITASTOR: + case VIR_STORAGE_POOL_SHEEPDOG: + case 
VIR_STORAGE_POOL_GLUSTER: + case VIR_STORAGE_POOL_LAST: +diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c +index c72108886e..c739ed6c43 100644 +--- a/src/conf/domain_validate.c ++++ b/src/conf/domain_validate.c +@@ -495,6 +495,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src) + case VIR_STORAGE_NET_PROTOCOL_RBD: + break; + ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_NBD: + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_GLUSTER: +@@ -541,7 +542,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src) + } + } + +- /* internal snapshots and config files are currently supported only with rbd: */ ++ /* internal snapshots are currently supported only with rbd: */ + if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK && + src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) { + if (src->snapshot) { +@@ -549,10 +550,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src) + _(" element is currently supported only with 'rbd' disks")); + return -1; + } ++ } + ++ /* config files are currently supported only with rbd and vitastor: */ ++ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK && ++ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD && ++ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) { + if (src->configFile) { + virReportError(VIR_ERR_XML_ERROR, "%s", +- _(" element is currently supported only with 'rbd' disks")); ++ _(" element is currently supported only with 'rbd' and 'vitastor' disks")); + return -1; + } + } +diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng +index b98a2ae602..7d7a872e01 100644 +--- a/src/conf/schemas/domaincommon.rng ++++ b/src/conf/schemas/domaincommon.rng +@@ -1997,6 +1997,35 @@ + + + ++ ++ ++ ++ ++ vitastor ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -2347,6 +2376,7 @@ + + + ++ + + + +diff --git 
a/src/conf/storage_conf.c b/src/conf/storage_conf.c +index 68842004b7..1d69a788b6 100644 +--- a/src/conf/storage_conf.c ++++ b/src/conf/storage_conf.c +@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool, + "logical", "disk", "iscsi", + "iscsi-direct", "scsi", "mpath", + "rbd", "sheepdog", "gluster", +- "zfs", "vstorage", ++ "zfs", "vstorage", "vitastor", + ); + + VIR_ENUM_IMPL(virStoragePoolFormatFileSystem, +@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = { + .formatToString = virStorageFileFormatTypeToString, + } + }, ++ {.poolType = VIR_STORAGE_POOL_VITASTOR, ++ .poolOptions = { ++ .flags = (VIR_STORAGE_POOL_SOURCE_HOST | ++ VIR_STORAGE_POOL_SOURCE_NETWORK | ++ VIR_STORAGE_POOL_SOURCE_NAME), ++ }, ++ .volOptions = { ++ .defaultFormat = VIR_STORAGE_FILE_RAW, ++ .formatFromString = virStorageVolumeFormatFromString, ++ .formatToString = virStorageFileFormatTypeToString, ++ } ++ }, + {.poolType = VIR_STORAGE_POOL_SHEEPDOG, + .poolOptions = { + .flags = (VIR_STORAGE_POOL_SOURCE_HOST | +@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt, + _("element 'name' is mandatory for RBD pool")); + return -1; + } ++ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) { ++ virReportError(VIR_ERR_XML_ERROR, "%s", ++ _("element 'name' is mandatory for Vitastor pool")); ++ return -1; ++ } + + if (options->formatFromString) { + g_autofree char *format = NULL; +@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf, + /* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor + * files, so they don't have a target */ + if (def->type != VIR_STORAGE_POOL_RBD && ++ def->type != VIR_STORAGE_POOL_VITASTOR && + def->type != VIR_STORAGE_POOL_SHEEPDOG && + def->type != VIR_STORAGE_POOL_GLUSTER && + def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) { +diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h +index fc67957cfe..720c07ef74 100644 +--- a/src/conf/storage_conf.h ++++ b/src/conf/storage_conf.h +@@ 
-103,6 +103,7 @@ typedef enum { + VIR_STORAGE_POOL_GLUSTER, /* Gluster device */ + VIR_STORAGE_POOL_ZFS, /* ZFS */ + VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */ ++ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */ + + VIR_STORAGE_POOL_LAST, + } virStoragePoolType; +@@ -454,6 +455,7 @@ VIR_ENUM_DECL(virStoragePartedFs); + VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \ + VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \ + VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \ ++ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \ + VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \ + VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \ + VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \ +diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c +index f974a521b1..cd394d0a9f 100644 +--- a/src/conf/storage_source_conf.c ++++ b/src/conf/storage_source_conf.c +@@ -88,6 +88,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol, + "ssh", + "vxhs", + "nfs", ++ "vitastor", + ); + + +@@ -1301,6 +1302,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol) + case VIR_STORAGE_NET_PROTOCOL_GLUSTER: + return 24007; + ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_RBD: + /* we don't provide a default for RBD */ + return 0; +diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h +index 5e7d127453..283709eeb3 100644 +--- a/src/conf/storage_source_conf.h ++++ b/src/conf/storage_source_conf.h +@@ -129,6 +129,7 @@ typedef enum { + VIR_STORAGE_NET_PROTOCOL_SSH, + VIR_STORAGE_NET_PROTOCOL_VXHS, + VIR_STORAGE_NET_PROTOCOL_NFS, ++ VIR_STORAGE_NET_PROTOCOL_VITASTOR, + + VIR_STORAGE_NET_PROTOCOL_LAST + } virStorageNetProtocol; +diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c +index 59fa5da372..4739167f5f 100644 +--- a/src/conf/virstorageobj.c ++++ b/src/conf/virstorageobj.c +@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload, + return 1; + break; + ++ case VIR_STORAGE_POOL_VITASTOR: + case VIR_STORAGE_POOL_ISCSI_DIRECT: + case 
VIR_STORAGE_POOL_RBD: + case VIR_STORAGE_POOL_LAST: +@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj, + (obj->def->type == VIR_STORAGE_POOL_MPATH)) || + (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) && + (obj->def->type == VIR_STORAGE_POOL_RBD)) || ++ (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) && ++ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) || + (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) && + (obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) || + (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) && +diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c +index db7660aac4..561df34709 100644 +--- a/src/libvirt-storage.c ++++ b/src/libvirt-storage.c +@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool) + * VIR_CONNECT_LIST_STORAGE_POOLS_SCSI + * VIR_CONNECT_LIST_STORAGE_POOLS_MPATH + * VIR_CONNECT_LIST_STORAGE_POOLS_RBD ++ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR + * VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG + * VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER + * VIR_CONNECT_LIST_STORAGE_POOLS_ZFS +diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c +index 62e1be6672..71a1d42896 100644 +--- a/src/libxl/libxl_conf.c ++++ b/src/libxl/libxl_conf.c +@@ -979,6 +979,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src, + case VIR_STORAGE_NET_PROTOCOL_SSH: + case VIR_STORAGE_NET_PROTOCOL_VXHS: + case VIR_STORAGE_NET_PROTOCOL_NFS: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_LAST: + case VIR_STORAGE_NET_PROTOCOL_NONE: + virReportError(VIR_ERR_NO_SUPPORT, +diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c +index f175359307..8efcf4c329 100644 +--- a/src/libxl/xen_xl.c ++++ b/src/libxl/xen_xl.c +@@ -1456,6 +1456,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src) + case VIR_STORAGE_NET_PROTOCOL_SSH: + case VIR_STORAGE_NET_PROTOCOL_VXHS: + case VIR_STORAGE_NET_PROTOCOL_NFS: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_LAST: + case VIR_STORAGE_NET_PROTOCOL_NONE: + 
virReportError(VIR_ERR_NO_SUPPORT, +diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c +index 7e9daf0bdc..825b4a3006 100644 +--- a/src/qemu/qemu_block.c ++++ b/src/qemu/qemu_block.c +@@ -758,6 +758,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src, + } + + ++static virJSONValue * ++qemuBlockStorageSourceGetVitastorProps(virStorageSource *src) ++{ ++ virJSONValue *ret = NULL; ++ virStorageNetHostDef *host; ++ size_t i; ++ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER; ++ g_autofree char *etcd = NULL; ++ ++ for (i = 0; i < src->nhosts; i++) { ++ host = src->hosts + i; ++ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) { ++ return NULL; ++ } ++ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port); ++ } ++ if (src->nhosts > 0) { ++ etcd = virBufferContentAndReset(&buf); ++ } ++ ++ if (virJSONValueObjectAdd(&ret, ++ "S:etcd-host", etcd, ++ "S:etcd-prefix", src->query, ++ "S:config-path", src->configFile, ++ "s:image", src->path, ++ NULL) < 0) ++ return NULL; ++ ++ return ret; ++} ++ ++ + static virJSONValue * + qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src) + { +@@ -1140,6 +1172,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src, + return NULL; + break; + ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: ++ driver = "vitastor"; ++ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src))) ++ return NULL; ++ break; ++ + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + driver = "sheepdog"; + if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src))) +@@ -2032,6 +2070,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src, + + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_RBD: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_VXHS: + case VIR_STORAGE_NET_PROTOCOL_NFS: + case VIR_STORAGE_NET_PROTOCOL_SSH: +@@ -2415,6 +2454,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src, + return -1; + break; + 
++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: ++ driver = "vitastor"; ++ if (!(location = qemuBlockStorageSourceGetVitastorProps(src))) ++ return -1; ++ break; ++ + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + driver = "sheepdog"; + if (!(location = qemuBlockStorageSourceGetSheepdogProps(src))) +diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c +index 953808fcfe..62860283d8 100644 +--- a/src/qemu/qemu_domain.c ++++ b/src/qemu/qemu_domain.c +@@ -5215,7 +5215,8 @@ qemuDomainValidateStorageSource(virStorageSource *src, + if (src->query && + (actualType != VIR_STORAGE_TYPE_NETWORK || + (src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS && +- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) { ++ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP && ++ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("query is supported only with HTTP(S) protocols")); + return -1; +@@ -10340,6 +10341,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src, + break; + + case VIR_STORAGE_NET_PROTOCOL_RBD: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_GLUSTER: + case VIR_STORAGE_NET_PROTOCOL_ISCSI: +diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c +index 73ff533827..e9c799ca8f 100644 +--- a/src/qemu/qemu_snapshot.c ++++ b/src/qemu/qemu_snapshot.c +@@ -423,6 +423,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk, + case VIR_STORAGE_NET_PROTOCOL_NONE: + case VIR_STORAGE_NET_PROTOCOL_NBD: + case VIR_STORAGE_NET_PROTOCOL_RBD: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_GLUSTER: + case VIR_STORAGE_NET_PROTOCOL_ISCSI: +@@ -648,6 +649,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk, + case VIR_STORAGE_NET_PROTOCOL_NONE: + case VIR_STORAGE_NET_PROTOCOL_NBD: + case VIR_STORAGE_NET_PROTOCOL_RBD: ++ case 
VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_GLUSTER: + case VIR_STORAGE_NET_PROTOCOL_ISCSI: +diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c +index 314fe930e0..fb615a8b4e 100644 +--- a/src/storage/storage_driver.c ++++ b/src/storage/storage_driver.c +@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj, + + case VIR_STORAGE_POOL_GLUSTER: + case VIR_STORAGE_POOL_RBD: ++ case VIR_STORAGE_POOL_VITASTOR: + case VIR_STORAGE_POOL_SHEEPDOG: + case VIR_STORAGE_POOL_ZFS: + case VIR_STORAGE_POOL_LAST: +diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c +index 80681924ea..8a3ade9ec0 100644 +--- a/src/storage_file/storage_source_backingstore.c ++++ b/src/storage_file/storage_source_backingstore.c +@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr, + } + + ++static int ++virStorageSourceParseVitastorColonString(const char *colonstr, ++ virStorageSource *src) ++{ ++ char *p, *e, *next; ++ g_autofree char *options = NULL; ++ ++ /* optionally skip the "vitastor:" prefix if provided */ ++ if (STRPREFIX(colonstr, "vitastor:")) ++ colonstr += strlen("vitastor:"); ++ ++ options = g_strdup(colonstr); ++ ++ p = options; ++ while (*p) { ++ /* find : delimiter or end of string */ ++ for (e = p; *e && *e != ':'; ++e) { ++ if (*e == '\\') { ++ e++; ++ if (*e == '\0') ++ break; ++ } ++ } ++ if (*e == '\0') { ++ next = e; /* last kv pair */ ++ } else { ++ next = e + 1; ++ *e = '\0'; ++ } ++ ++ if (STRPREFIX(p, "image=")) { ++ src->path = g_strdup(p + strlen("image=")); ++ } else if (STRPREFIX(p, "etcd-prefix=")) { ++ src->query = g_strdup(p + strlen("etcd-prefix=")); ++ } else if (STRPREFIX(p, "config-path=")) { ++ src->configFile = g_strdup(p + strlen("config-path=")); ++ } else if (STRPREFIX(p, "etcd-host=")) { ++ char *h, *sep; ++ ++ h = p + strlen("etcd-host="); ++ while (h < e) { ++ for (sep = 
h; sep < e; ++sep) { ++ if (*sep == '\\' && (sep[1] == ',' || ++ sep[1] == ';' || ++ sep[1] == ' ')) { ++ *sep = '\0'; ++ sep += 2; ++ break; ++ } ++ } ++ ++ if (virStorageSourceRBDAddHost(src, h) < 0) ++ return -1; ++ ++ h = sep; ++ } ++ } ++ ++ p = next; ++ } ++ ++ if (!src->path) { ++ return -1; ++ } ++ ++ return 0; ++} ++ ++ + static int + virStorageSourceParseNBDColonString(const char *nbdstr, + virStorageSource *src) +@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src, + return -1; + break; + ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: ++ if (virStorageSourceParseVitastorColonString(path, src) < 0) ++ return -1; ++ break; ++ + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_LAST: + case VIR_STORAGE_NET_PROTOCOL_NONE: +@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src, + return 0; + } + ++static int ++virStorageSourceParseBackingJSONVitastor(virStorageSource *src, ++ virJSONValue *json, ++ const char *jsonstr G_GNUC_UNUSED, ++ int opaque G_GNUC_UNUSED) ++{ ++ const char *filename; ++ const char *image = virJSONValueObjectGetString(json, "image"); ++ const char *conf = virJSONValueObjectGetString(json, "config-path"); ++ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix"); ++ virJSONValue *servers = virJSONValueObjectGetArray(json, "server"); ++ size_t nservers; ++ size_t i; ++ ++ src->type = VIR_STORAGE_TYPE_NETWORK; ++ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR; ++ ++ /* legacy syntax passed via 'filename' option */ ++ if ((filename = virJSONValueObjectGetString(json, "filename"))) ++ return virStorageSourceParseVitastorColonString(filename, src); ++ ++ if (!image) { ++ virReportError(VIR_ERR_INVALID_ARG, "%s", ++ _("missing image name in Vitastor backing volume " ++ "JSON specification")); ++ return -1; ++ } ++ ++ src->path = g_strdup(image); ++ src->configFile = g_strdup(conf); ++ src->query = g_strdup(etcd_prefix); ++ ++ if (servers) { ++ nservers = 
virJSONValueArraySize(servers); ++ ++ src->hosts = g_new0(virStorageNetHostDef, nservers); ++ src->nhosts = nservers; ++ ++ for (i = 0; i < nservers; i++) { ++ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i, ++ virJSONValueArrayGet(servers, i)) < 0) ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ + static int + virStorageSourceParseBackingJSONRaw(virStorageSource *src, + virJSONValue *json, +@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = { + {"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0}, + {"ssh", false, virStorageSourceParseBackingJSONSSH, 0}, + {"rbd", false, virStorageSourceParseBackingJSONRBD, 0}, ++ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0}, + {"raw", true, virStorageSourceParseBackingJSONRaw, 0}, + {"nfs", false, virStorageSourceParseBackingJSONNFS, 0}, + {"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0}, +diff --git a/src/test/test_driver.c b/src/test/test_driver.c +index e87d7cfd44..ccc05d7aae 100644 +--- a/src/test/test_driver.c ++++ b/src/test/test_driver.c +@@ -7335,6 +7335,7 @@ testStorageVolumeTypeForPool(int pooltype) + case VIR_STORAGE_POOL_ISCSI_DIRECT: + case VIR_STORAGE_POOL_GLUSTER: + case VIR_STORAGE_POOL_RBD: ++ case VIR_STORAGE_POOL_VITASTOR: + return VIR_STORAGE_VOL_NETWORK; + case VIR_STORAGE_POOL_LOGICAL: + case VIR_STORAGE_POOL_DISK: +diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml +index eee75af746..8bd0a57bdd 100644 +--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml ++++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml +@@ -204,4 +204,11 @@ + + + ++ ++ ++ ++ ++ ++ ++ + +diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml +index 805950a937..852df0de16 100644 +--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml ++++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml +@@ -204,4 +204,11 @@ + + + 
++ ++ ++ ++ ++ ++ ++ + +diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c +index e8e40d695e..db55fe5f3a 100644 +--- a/tests/storagepoolxml2argvtest.c ++++ b/tests/storagepoolxml2argvtest.c +@@ -65,6 +65,7 @@ testCompareXMLToArgvFiles(bool shouldFail, + case VIR_STORAGE_POOL_GLUSTER: + case VIR_STORAGE_POOL_ZFS: + case VIR_STORAGE_POOL_VSTORAGE: ++ case VIR_STORAGE_POOL_VITASTOR: + case VIR_STORAGE_POOL_LAST: + default: + VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr); +diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c +index 36f00cf643..5f5bd3464e 100644 +--- a/tools/virsh-pool.c ++++ b/tools/virsh-pool.c +@@ -1223,6 +1223,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED) + case VIR_STORAGE_POOL_VSTORAGE: + flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE; + break; ++ case VIR_STORAGE_POOL_VITASTOR: ++ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR; ++ break; + case VIR_STORAGE_POOL_LAST: + break; + } -- 2.30.2 From 1c322b33edc6aa312ac88fe63dd1a346a5a7a631 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Fri, 26 Jan 2024 01:50:54 +0300 Subject: [PATCH 10/33] Change default up_wait_retry_interval to 50 ms --- docs/config/src/network.yml | 4 ++-- mon/mon.js | 2 +- src/cluster_client.cpp | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/config/src/network.yml b/docs/config/src/network.yml index b95f48e2..5bd2c808 100644 --- a/docs/config/src/network.yml +++ b/docs/config/src/network.yml @@ -245,8 +245,8 @@ повторная попытка соединения. - name: up_wait_retry_interval type: ms - min: 50 - default: 500 + min: 10 + default: 50 online: true info: | OSDs respond to clients with a special error code when they receive I/O diff --git a/mon/mon.js b/mon/mon.js index 4eb7a763..f0d52261 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -92,7 +92,7 @@ const etcd_tree = { peer_connect_timeout: 5, // seconds. min: 1 osd_idle_timeout: 5, // seconds. 
min: 1 osd_ping_timeout: 5, // seconds. min: 1 - up_wait_retry_interval: 500, // ms. min: 50 + up_wait_retry_interval: 50, // ms. min: 10 max_etcd_attempts: 5, etcd_quick_timeout: 1000, // ms etcd_slow_timeout: 5000, // ms diff --git a/src/cluster_client.cpp b/src/cluster_client.cpp index efed244a..1f270889 100644 --- a/src/cluster_client.cpp +++ b/src/cluster_client.cpp @@ -352,13 +352,13 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co // up_wait_retry_interval up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value(); if (!up_wait_retry_interval) - { - up_wait_retry_interval = 500; - } - else if (up_wait_retry_interval < 50) { up_wait_retry_interval = 50; } + else if (up_wait_retry_interval < 10) + { + up_wait_retry_interval = 10; + } // log_level log_level = config["log_level"].uint64_value(); msgr.parse_config(config); -- 2.30.2 From 1cec62d25dd1b7aef6d71cd5719d9679ff0f4094 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 31 Dec 2023 00:41:16 +0300 Subject: [PATCH 11/33] Sync only completed writes Should be a final remaining fix to EC + non-capacitor (non-immediate-commit) write hangs :). First it was breaking non-EC ("instantly stable") writes because they sometimes complete out of order which was leading to the following error: terminate called after throwing an instance of 'std::runtime_error' what(): BUG: Unexpected dirty_entry 1000000000001:29480000 v65540 unstable state during flush: 0x151 But it is easily fixed by scanning previous and next dirty_entries in mark_stable. 
--- src/blockstore_impl.cpp | 16 ++++------------ src/blockstore_impl.h | 4 +++- src/blockstore_rollback.cpp | 1 + src/blockstore_stable.cpp | 32 +++++++++++++++++++++++++++++++- src/blockstore_sync.cpp | 3 +-- src/blockstore_write.cpp | 2 +- 6 files changed, 41 insertions(+), 17 deletions(-) diff --git a/src/blockstore_impl.cpp b/src/blockstore_impl.cpp index 2178395d..55d00960 100644 --- a/src/blockstore_impl.cpp +++ b/src/blockstore_impl.cpp @@ -163,20 +163,10 @@ void blockstore_impl_t::loop() } else if (op->opcode == BS_OP_SYNC) { - // wait for all small writes to be submitted - // wait for all big writes to complete, submit data device fsync + // sync only completed writes? // wait for the data device fsync to complete, then submit journal writes for big writes // then submit an fsync operation - if (has_writes) - { - // Can't submit SYNC before previous writes - continue; - } wr_st = continue_sync(op); - if (wr_st != 2) - { - has_writes = wr_st > 0 ? 1 : 2; - } } else if (op->opcode == BS_OP_STABLE) { @@ -283,7 +273,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op) } else if (PRIV(op)->wait_for == WAIT_JOURNAL) { - if (journal.used_start == PRIV(op)->wait_detail) + if (journal.used_start == PRIV(op)->wait_detail && !unstable_count_changed) { // do not submit #ifdef BLOCKSTORE_DEBUG @@ -291,6 +281,7 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op) #endif return; } + unstable_count_changed = false; flusher->release_trim(); PRIV(op)->wait_for = 0; } @@ -362,6 +353,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op) }; } unstable_writes.clear(); + unstable_count_changed = true; op->callback = [old_callback](blockstore_op_t *op) { obj_ver_id *vers = (obj_ver_id*)op->buf; diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h index e20e956c..99f01cef 100644 --- a/src/blockstore_impl.h +++ b/src/blockstore_impl.h @@ -55,6 +55,7 @@ #define IS_JOURNAL(st) (((st) & 0x0F) == BS_ST_SMALL_WRITE) #define IS_BIG_WRITE(st) (((st) & 0x0F) == 
BS_ST_BIG_WRITE) #define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE) +#define IS_INSTANT(st) (((st) & BS_ST_TYPE_MASK) == BS_ST_DELETE || ((st) & BS_ST_INSTANT)) #define BS_SUBMIT_CHECK_SQES(n) \ if (ringloop->sqes_left() < (n))\ @@ -275,6 +276,7 @@ class blockstore_impl_t std::vector submit_queue; std::vector unsynced_big_writes, unsynced_small_writes; int unsynced_big_write_count = 0, unstable_unsynced = 0; + bool unstable_count_changed = false; int unsynced_queued_ops = 0; allocator *data_alloc = NULL; uint64_t used_blocks = 0; @@ -377,7 +379,7 @@ class blockstore_impl_t // Stabilize int dequeue_stable(blockstore_op_t *op); int continue_stable(blockstore_op_t *op); - void mark_stable(const obj_ver_id & ov, bool forget_dirty = false); + void mark_stable(obj_ver_id ov, bool forget_dirty = false); void stabilize_object(object_id oid, uint64_t max_ver); blockstore_op_t* selective_sync(blockstore_op_t *op); int split_stab_op(blockstore_op_t *op, std::function decider); diff --git a/src/blockstore_rollback.cpp b/src/blockstore_rollback.cpp index 50b6eb88..cc686112 100644 --- a/src/blockstore_rollback.cpp +++ b/src/blockstore_rollback.cpp @@ -162,6 +162,7 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov) unstable_writes.erase(unstab_it); else unstab_it->second = max_unstable; + unstable_count_changed = true; } } } diff --git a/src/blockstore_stable.cpp b/src/blockstore_stable.cpp index 10648ad9..f3d4dc27 100644 --- a/src/blockstore_stable.cpp +++ b/src/blockstore_stable.cpp @@ -412,11 +412,40 @@ resume_4: return 2; } -void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty) +void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty) { auto dirty_it = dirty_db.find(v); if (dirty_it != dirty_db.end()) { + if (IS_INSTANT(dirty_it->second.state)) + { + // 'Instant' (non-EC) operations may complete and try to become stable out of order. Prevent it. 
+ auto back_it = dirty_it; + while (back_it != dirty_db.begin()) + { + back_it--; + if (back_it->first.oid != v.oid) + { + break; + } + if (!IS_STABLE(back_it->second.state)) + { + // There are preceding unstable versions, can't flush + return; + } + } + while (true) + { + dirty_it++; + if (dirty_it == dirty_db.end() || dirty_it->first.oid != v.oid || + !IS_SYNCED(dirty_it->second.state)) + { + dirty_it--; + break; + } + v.version = dirty_it->first.version; + } + } while (1) { bool was_stable = IS_STABLE(dirty_it->second.state); @@ -508,5 +537,6 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty) unstab_it->second <= v.version) { unstable_writes.erase(unstab_it); + unstable_count_changed = true; } } diff --git a/src/blockstore_sync.cpp b/src/blockstore_sync.cpp index 1d64a4f6..50891a13 100644 --- a/src/blockstore_sync.cpp +++ b/src/blockstore_sync.cpp @@ -85,8 +85,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op) left--; auto & dirty_entry = dirty_db.at(sbw); uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len); - if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, - (unstable_writes.size()+unstable_unsynced)*journal.block_size)) + if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, 0)) { return 0; } diff --git a/src/blockstore_write.cpp b/src/blockstore_write.cpp index 36828abc..355e4f38 100644 --- a/src/blockstore_write.cpp +++ b/src/blockstore_write.cpp @@ -593,7 +593,7 @@ resume_4: #endif bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE; bool imm = is_big ? 
(immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE); - bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)); + bool is_instant = IS_INSTANT(dirty_it->second.state); if (imm) { auto & unstab = unstable_writes[op->oid]; -- 2.30.2 From cc76e6876bfbcbd83b004322a2c747afb710b7b1 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 28 Jan 2024 14:59:33 +0300 Subject: [PATCH 12/33] Fix flapping "scrub" test --- tests/test_scrub.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_scrub.sh b/tests/test_scrub.sh index 9da65a0c..8252e0bd 100755 --- a/tests/test_scrub.sh +++ b/tests/test_scrub.sh @@ -20,6 +20,9 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \ fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \ -mirror_file=./testdata/mirror.bin -end_fsync=1 -rw=write -etcd=$ETCD_URL -image=testimg +# Save PG primary +primary=$($ETCDCTL get --print-value-only /vitastor/config/pgs | jq -r '.items["1"]["1"].primary') + # Intentionally corrupt OSD data and restart it zero_osd_pid=OSD${ZERO_OSD}_PID kill ${!zero_osd_pid} @@ -34,6 +37,9 @@ start_osd $ZERO_OSD # Wait until start wait_up 10 +# Wait until PG is back on the same primary +wait_condition 10 "$ETCDCTL"$' get --print-value-only /vitastor/config/pgs | jq -s -e \'.[0].items["1"]["1"].primary == "'$primary'"'"'" + # Trigger scrub $ETCDCTL put /vitastor/pg/history/1/1 `$ETCDCTL get --print-value-only /vitastor/pg/history/1/1 | jq -s -c '(.[0] // {}) + {"next_scrub":1}'` -- 2.30.2 From d2b43cb118440841b72e94a5c4bfda262a33eebc Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Mon, 29 Jan 2024 23:45:07 +0300 Subject: [PATCH 13/33] Change default etcd_mon_ttl --- docs/config/src/monitor.yml | 4 ++-- mon/mon.js | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/config/src/monitor.yml b/docs/config/src/monitor.yml index 06f6a649..caf335e3 100644 
--- a/docs/config/src/monitor.yml +++ b/docs/config/src/monitor.yml @@ -1,7 +1,7 @@ - name: etcd_mon_ttl type: sec - min: 10 - default: 30 + min: 1 + default: 5 info: Monitor etcd lease refresh interval in seconds info_ru: Интервал обновления etcd резервации (lease) монитором - name: etcd_mon_timeout diff --git a/mon/mon.js b/mon/mon.js index f0d52261..ab7221bb 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -55,7 +55,7 @@ const etcd_tree = { // etcd connection - configurable online etcd_address: "10.0.115.10:2379/v3", // mon - etcd_mon_ttl: 30, // min: 10 + etcd_mon_ttl: 5, // min: 1 etcd_mon_timeout: 1000, // ms. min: 0 etcd_mon_retries: 5, // min: 0 mon_change_timeout: 1000, // ms. min: 100 @@ -480,10 +480,10 @@ class Mon check_config() { - this.config.etcd_mon_ttl = Number(this.config.etcd_mon_ttl) || 30; - if (this.config.etcd_mon_ttl < 10) + this.config.etcd_mon_ttl = Number(this.config.etcd_mon_ttl) || 5; + if (this.config.etcd_mon_ttl < 1) { - this.config.etcd_mon_ttl = 10; + this.config.etcd_mon_ttl = 1; } this.config.etcd_mon_timeout = Number(this.config.etcd_mon_timeout) || 0; if (this.config.etcd_mon_timeout <= 0) @@ -794,7 +794,7 @@ class Mon { this.failconnect('Lease expired'); } - }, this.config.etcd_mon_timeout); + }, this.config.etcd_mon_ttl*1000); if (!this.signals_set) { process.on('SIGINT', this.on_stop_cb); -- 2.30.2 From a86a380d200603f9c752b89baab7034f9ac77bf2 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 3 Feb 2024 20:25:22 +0300 Subject: [PATCH 14/33] Fix invalid parsing of autosync_writes in blockstore leading to autosyncs after every operation with disabled immediate_commit :D --- src/blockstore_open.cpp | 2 +- src/osd.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/blockstore_open.cpp b/src/blockstore_open.cpp index 7c57dbde..1bfbd064 100644 --- a/src/blockstore_open.cpp +++ b/src/blockstore_open.cpp @@ -19,7 +19,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init)
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10); throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10); throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10); - if (config.find("autosync_writes") != config.end()) + if (config["autosync_writes"] != "") { autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10); } diff --git a/src/osd.cpp b/src/osd.cpp index b85e7210..23a006e5 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -22,7 +22,7 @@ static blockstore_config_t json_to_bs(const json11::Json::object & config) { if (kv.second.is_string()) bs[kv.first] = kv.second.string_value(); - else + else if (!kv.second.is_null()) bs[kv.first] = kv.second.dump(); } return bs; @@ -194,7 +194,8 @@ void osd_t::parse_config(bool init) if (autosync_interval > MAX_AUTOSYNC_INTERVAL) autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; } - if (!config["autosync_writes"].is_null()) + if (config["autosync_writes"].is_number() || + config["autosync_writes"].string_value() != "") { // Allow to set it to 0 autosync_writes = config["autosync_writes"].uint64_value(); -- 2.30.2 From cb9c30bc31ec89ee7918c09a794ecb27d381285d Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 3 Feb 2024 20:26:04 +0300 Subject: [PATCH 15/33] Sync after sending all deletes to each PG in cli rm-data --- src/cli_rm_data.cpp | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/src/cli_rm_data.cpp b/src/cli_rm_data.cpp index dcf726dc..37ea8c91 100644 --- a/src/cli_rm_data.cpp +++ b/src/cli_rm_data.cpp @@ -17,6 +17,7 @@ struct rm_pg_t uint64_t obj_count = 0, obj_done = 0; int state = 0; int in_flight = 0; + bool synced = false; }; struct rm_inode_t @@ -48,6 +49,7 @@ struct rm_inode_t .objects = objects, .obj_count = objects.size(), .obj_done = 0, + .synced = parent->cli->get_immediate_commit(inode), }); if (min_offset == 0) { @@ -151,6 +153,37 @@ 
struct rm_inode_t } cur_list->obj_pos++; } + if (cur_list->in_flight == 0 && cur_list->obj_pos == cur_list->objects.end() && + !cur_list->synced) + { + osd_op_t *op = new osd_op_t(); + op->op_type = OSD_OP_OUT; + op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num); + op->req = (osd_any_op_t){ + .sync = { + .header = { + .magic = SECONDARY_OSD_OP_MAGIC, + .id = parent->cli->next_op_id(), + .opcode = OSD_OP_SYNC, + }, + }, + }; + op->callback = [this, cur_list](osd_op_t *op) + { + cur_list->in_flight--; + cur_list->synced = true; + if (op->reply.hdr.retval < 0) + { + fprintf(stderr, "Failed to sync OSD %lu (retval=%ld)\n", + cur_list->rm_osd_num, op->reply.hdr.retval); + error_count++; + } + delete op; + continue_delete(); + }; + cur_list->in_flight++; + parent->cli->msgr.outbox_push(op); + } } void continue_delete() @@ -161,7 +194,8 @@ struct rm_inode_t } for (int i = 0; i < lists.size(); i++) { - if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end()) + if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end() && + lists[i]->synced) { delete lists[i]; lists.erase(lists.begin()+i, lists.begin()+i+1); @@ -187,7 +221,7 @@ struct rm_inode_t { fprintf(stderr, "\n"); } - if (parent->progress && (total_done < total_count || inactive_osds.size() > 0)) + if (parent->progress && (total_done < total_count || inactive_osds.size() > 0 || error_count > 0)) { fprintf( stderr, "Warning: Pool:%u,ID:%lu inode data may not have been fully removed.\n" -- 2.30.2 From f03a9db4d94d018c3492f86e4d4f51c49e8bbcee Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 3 Feb 2024 20:26:34 +0300 Subject: [PATCH 16/33] Fix OSD space reporting sometimes adding garbage zeros for deleted inodes (causing extra pool/stats etcd keys for deleted pools) --- src/osd_cluster.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/osd_cluster.cpp b/src/osd_cluster.cpp index 2ecee460..180bd880 100644 --- a/src/osd_cluster.cpp +++ 
b/src/osd_cluster.cpp @@ -262,7 +262,8 @@ void osd_t::report_statistics() for (auto st_it = inode_stats.begin(); st_it != inode_stats.end(); ) { auto & kv = *st_it; - if (!bs_inode_space[kv.first]) + auto spc_it = bs_inode_space.find(kv.first); + if (spc_it == bs_inode_space.end() || !spc_it->second) // prevent autovivification { // Is it an empty inode? if (!tv_now.tv_sec) -- 2.30.2 From 581d02e58107f6492c8a2a3dbb3f2beed09beb61 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 3 Feb 2024 20:30:42 +0300 Subject: [PATCH 17/33] Mark secondary OSDs with deletions as dirty to not forget to sync & autosync them --- src/osd_primary.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp index e7a76b0f..750d0ff9 100644 --- a/src/osd_primary.cpp +++ b/src/osd_primary.cpp @@ -706,6 +706,26 @@ resume_5: remove_object_from_state(op_data->oid, &op_data->object_state, pg); deref_object_state(pg, &op_data->object_state, true); } + // Mark PG and OSDs as dirty + for (auto & chunk: (op_data->object_state ? 
op_data->object_state->osd_set : pg.cur_loc_set)) + { + this->dirty_osds.insert(chunk.osd_num); + } + for (auto cl_it = msgr.clients.find(cur_op->peer_fd); cl_it != msgr.clients.end(); ) + { + cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); + break; + } + dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); + if (immediate_commit == IMMEDIATE_NONE) + { + unstable_write_count++; + if (unstable_write_count >= autosync_writes) + { + unstable_write_count = 0; + autosync(); + } + } pg.total_count--; cur_op->reply.hdr.retval = 0; continue_others: -- 2.30.2 From 77c10fd1f8506f466f87ea8da227c469227bf5ab Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 3 Feb 2024 20:37:36 +0300 Subject: [PATCH 18/33] In fact, do not autosync blockstore when autosync_writes=0 --- src/blockstore_write.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blockstore_write.cpp b/src/blockstore_write.cpp index 355e4f38..8f314ccf 100644 --- a/src/blockstore_write.cpp +++ b/src/blockstore_write.cpp @@ -129,7 +129,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) } bool imm = (op->len < dsk.data_block_size ? 
(immediate_commit != IMMEDIATE_NONE) : (immediate_commit == IMMEDIATE_ALL)); if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size && !imm || - !imm && unsynced_queued_ops >= autosync_writes) + !imm && autosync_writes && unsynced_queued_ops >= autosync_writes) { // Issue an additional sync so that the previous big write can reach the journal blockstore_op_t *sync_op = new blockstore_op_t; -- 2.30.2 From e026de95d5cdb222c56dfc461fc7c1be78f0b417 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 4 Feb 2024 01:17:43 +0300 Subject: [PATCH 19/33] Log to systemd by default --- mon/vitastor-osd@.service | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mon/vitastor-osd@.service b/mon/vitastor-osd@.service index ac2857d2..1930752e 100644 --- a/mon/vitastor-osd@.service +++ b/mon/vitastor-osd@.service @@ -8,7 +8,9 @@ PartOf=vitastor.target LimitNOFILE=1048576 LimitNPROC=1048576 LimitMEMLOCK=infinity -ExecStart=bash -c 'exec vitastor-disk exec-osd /dev/vitastor/osd%i-data >>/var/log/vitastor/osd%i.log 2>&1' +# Use the following for direct logs to files +#ExecStart=bash -c 'exec vitastor-disk exec-osd /dev/vitastor/osd%i-data >>/var/log/vitastor/osd%i.log 2>&1' +ExecStart=vitastor-disk exec-osd /dev/vitastor/osd%i-data ExecStartPre=+vitastor-disk pre-exec /dev/vitastor/osd%i-data WorkingDirectory=/ User=vitastor -- 2.30.2 From 016115c0d4febe65a63d03fe3d1d6afdfdf988ee Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 4 Feb 2024 02:23:49 +0300 Subject: [PATCH 20/33] Release 1.4.2 - Log to systemd by default - Fix excessive autosyncs after every operation with disabled immediate_commit (introduced in 1.1.0) - Fix a possible write stall with EC due to the lack of OSD wakeup after stabilizing previous writes - Change sync operation semantics as a final fix to possible write stalls with EC and disabled immediate_commit - Sync after deleting data in CLI rm / rm-data if immediate_commit is disabled - Fix OSDs ignoring syncs & 
autosyncs for delete operations - Fix OSD space reporting sometimes adding garbage zeros for deleted inodes (causing extra pool/stats etcd keys for deleted pools) - Speed up monitor failover - change default etcd_mon_ttl from 30 to 5 seconds - Speed up operation retries - change default up_wait_retry_interval to 50 ms - Add patch for libvirt 9.10 --- CMakeLists.txt | 2 +- csi/Makefile | 2 +- csi/deploy/004-csi-nodeplugin.yaml | 2 +- csi/deploy/007-csi-provisioner.yaml | 2 +- csi/src/config.go | 2 +- debian/changelog | 2 +- debian/vitastor.Dockerfile | 8 ++++---- mon/package.json | 2 +- patches/cinder-vitastor.py | 2 +- rpm/build-tarball.sh | 2 +- rpm/vitastor-el7.Dockerfile | 2 +- rpm/vitastor-el7.spec | 4 ++-- rpm/vitastor-el8.Dockerfile | 2 +- rpm/vitastor-el8.spec | 4 ++-- rpm/vitastor-el9.Dockerfile | 2 +- rpm/vitastor-el9.spec | 4 ++-- src/CMakeLists.txt | 2 +- src/vitastor.pc.in | 2 +- 18 files changed, 24 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 71ab5180..a9586978 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12) project(vitastor) -set(VERSION "1.4.1") +set(VERSION "1.4.2") add_subdirectory(src) diff --git a/csi/Makefile b/csi/Makefile index 82210c11..8dba7f80 100644 --- a/csi/Makefile +++ b/csi/Makefile @@ -1,4 +1,4 @@ -VERSION ?= v1.4.1 +VERSION ?= v1.4.2 all: build push diff --git a/csi/deploy/004-csi-nodeplugin.yaml b/csi/deploy/004-csi-nodeplugin.yaml index 1addcd4c..0fb34181 100644 --- a/csi/deploy/004-csi-nodeplugin.yaml +++ b/csi/deploy/004-csi-nodeplugin.yaml @@ -49,7 +49,7 @@ spec: capabilities: add: ["SYS_ADMIN"] allowPrivilegeEscalation: true - image: vitalif/vitastor-csi:v1.4.1 + image: vitalif/vitastor-csi:v1.4.2 args: - "--node=$(NODE_ID)" - "--endpoint=$(CSI_ENDPOINT)" diff --git a/csi/deploy/007-csi-provisioner.yaml b/csi/deploy/007-csi-provisioner.yaml index e341722c..83601cc8 100644 --- a/csi/deploy/007-csi-provisioner.yaml +++ 
b/csi/deploy/007-csi-provisioner.yaml @@ -121,7 +121,7 @@ spec: privileged: true capabilities: add: ["SYS_ADMIN"] - image: vitalif/vitastor-csi:v1.4.1 + image: vitalif/vitastor-csi:v1.4.2 args: - "--node=$(NODE_ID)" - "--endpoint=$(CSI_ENDPOINT)" diff --git a/csi/src/config.go b/csi/src/config.go index c1a29711..209cdd74 100644 --- a/csi/src/config.go +++ b/csi/src/config.go @@ -5,7 +5,7 @@ package vitastor const ( vitastorCSIDriverName = "csi.vitastor.io" - vitastorCSIDriverVersion = "1.4.1" + vitastorCSIDriverVersion = "1.4.2" ) // Config struct fills the parameters of request or user input diff --git a/debian/changelog b/debian/changelog index b7216bb9..6bcc6b93 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -vitastor (1.4.1-1) unstable; urgency=medium +vitastor (1.4.2-1) unstable; urgency=medium * Bugfixes diff --git a/debian/vitastor.Dockerfile b/debian/vitastor.Dockerfile index b1035604..2ebbc5d3 100644 --- a/debian/vitastor.Dockerfile +++ b/debian/vitastor.Dockerfile @@ -35,8 +35,8 @@ RUN set -e -x; \ mkdir -p /root/packages/vitastor-$REL; \ rm -rf /root/packages/vitastor-$REL/*; \ cd /root/packages/vitastor-$REL; \ - cp -r /root/vitastor vitastor-1.4.1; \ - cd vitastor-1.4.1; \ + cp -r /root/vitastor vitastor-1.4.2; \ + cd vitastor-1.4.2; \ ln -s /root/fio-build/fio-*/ ./fio; \ FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \ @@ -49,8 +49,8 @@ RUN set -e -x; \ rm -rf a b; \ echo "dep:fio=$FIO" > debian/fio_version; \ cd /root/packages/vitastor-$REL; \ - tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.1.orig.tar.xz vitastor-1.4.1; \ - cd vitastor-1.4.1; \ + tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.2.orig.tar.xz vitastor-1.4.2; \ + cd vitastor-1.4.2; \ V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ DEBFULLNAME="Vitaliy 
Filippov " dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \ DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \ diff --git a/mon/package.json b/mon/package.json index 73f44427..ed521bbf 100644 --- a/mon/package.json +++ b/mon/package.json @@ -1,6 +1,6 @@ { "name": "vitastor-mon", - "version": "1.4.1", + "version": "1.4.2", "description": "Vitastor SDS monitor service", "main": "mon-main.js", "scripts": { diff --git a/patches/cinder-vitastor.py b/patches/cinder-vitastor.py index 30525d59..e2d04ed4 100644 --- a/patches/cinder-vitastor.py +++ b/patches/cinder-vitastor.py @@ -50,7 +50,7 @@ from cinder.volume import configuration from cinder.volume import driver from cinder.volume import volume_utils -VERSION = '1.4.1' +VERSION = '1.4.2' LOG = logging.getLogger(__name__) diff --git a/rpm/build-tarball.sh b/rpm/build-tarball.sh index 3e97e2f1..16816088 100755 --- a/rpm/build-tarball.sh +++ b/rpm/build-tarball.sh @@ -24,4 +24,4 @@ rm fio mv fio-copy fio FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'` perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec -tar --transform 's#^#vitastor-1.4.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.1$(rpm --eval '%dist').tar.gz * +tar --transform 's#^#vitastor-1.4.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.2$(rpm --eval '%dist').tar.gz * diff --git a/rpm/vitastor-el7.Dockerfile b/rpm/vitastor-el7.Dockerfile index 081eebea..3216f725 100644 --- a/rpm/vitastor-el7.Dockerfile +++ b/rpm/vitastor-el7.Dockerfile @@ -36,7 +36,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.1.el7.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.2.el7.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el7.spec b/rpm/vitastor-el7.spec index abf1240b..f59dac26 100644 --- a/rpm/vitastor-el7.spec +++ b/rpm/vitastor-el7.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.1 +Version: 1.4.2 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.1.el7.tar.gz +Source0: vitastor-1.4.2.el7.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/rpm/vitastor-el8.Dockerfile b/rpm/vitastor-el8.Dockerfile index a03be469..ffd0d6b4 100644 --- a/rpm/vitastor-el8.Dockerfile +++ b/rpm/vitastor-el8.Dockerfile @@ -35,7 +35,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.1.el8.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.2.el8.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el8.spec b/rpm/vitastor-el8.spec index c11b841b..3a2257e3 100644 --- a/rpm/vitastor-el8.spec +++ b/rpm/vitastor-el8.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.1 +Version: 1.4.2 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.1.el8.tar.gz +Source0: vitastor-1.4.2.el8.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/rpm/vitastor-el9.Dockerfile b/rpm/vitastor-el9.Dockerfile index d02ae827..6d8fdc3c 100644 --- a/rpm/vitastor-el9.Dockerfile +++ b/rpm/vitastor-el9.Dockerfile @@ -18,7 +18,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.1.el9.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.2.el9.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el9.spec b/rpm/vitastor-el9.spec index 96751661..bdfae3d1 100644 --- a/rpm/vitastor-el9.spec +++ b/rpm/vitastor-el9.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.1 +Version: 1.4.2 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.1.el9.tar.gz +Source0: vitastor-1.4.2.el9.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4e1d7f52..3fccbc5f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$") set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") endif() -add_definitions(-DVERSION="1.4.1") +add_definitions(-DVERSION="1.4.2") add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src) add_link_options(-fno-omit-frame-pointer) if (${WITH_ASAN}) diff --git a/src/vitastor.pc.in b/src/vitastor.pc.in index e8ff5e3b..0cf0aa80 100644 --- a/src/vitastor.pc.in +++ b/src/vitastor.pc.in @@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ Name: Vitastor Description: Vitastor client library -Version: 1.4.1 +Version: 1.4.2 Libs: -L${libdir} -lvitastor_client Cflags: -I${includedir} -- 2.30.2 From 5d3317e4f2726958d313bf32dcd95f04ed5fd2f2 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 8 Feb 2024 19:34:29 +0300 Subject: [PATCH 21/33] Followup to 1.4.2 write stall fix - sadly, the previous version was not working 
correctly :) --- src/blockstore_impl.cpp | 9 ++++++--- src/blockstore_impl.h | 3 +-- src/blockstore_rollback.cpp | 1 - src/blockstore_stable.cpp | 1 - 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/blockstore_impl.cpp b/src/blockstore_impl.cpp index 55d00960..399e0c78 100644 --- a/src/blockstore_impl.cpp +++ b/src/blockstore_impl.cpp @@ -195,6 +195,10 @@ void blockstore_impl_t::loop() // ring is full, stop submission break; } + else if (PRIV(op)->wait_for == WAIT_JOURNAL) + { + PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced); + } } } if (op_idx != new_idx) @@ -273,7 +277,8 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op) } else if (PRIV(op)->wait_for == WAIT_JOURNAL) { - if (journal.used_start == PRIV(op)->wait_detail && !unstable_count_changed) + if (journal.used_start == PRIV(op)->wait_detail && + (unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2) { // do not submit #ifdef BLOCKSTORE_DEBUG @@ -281,7 +286,6 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op) #endif return; } - unstable_count_changed = false; flusher->release_trim(); PRIV(op)->wait_for = 0; } @@ -353,7 +357,6 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op) }; } unstable_writes.clear(); - unstable_count_changed = true; op->callback = [old_callback](blockstore_op_t *op) { obj_ver_id *vers = (obj_ver_id*)op->buf; diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h index 99f01cef..c1b88638 100644 --- a/src/blockstore_impl.h +++ b/src/blockstore_impl.h @@ -202,7 +202,7 @@ struct blockstore_op_private_t { // Wait status int wait_for; - uint64_t wait_detail; + uint64_t wait_detail, wait_detail2; int pending_ops; int op_state; @@ -276,7 +276,6 @@ class blockstore_impl_t std::vector submit_queue; std::vector unsynced_big_writes, unsynced_small_writes; int unsynced_big_write_count = 0, unstable_unsynced = 0; - bool unstable_count_changed = false; int unsynced_queued_ops = 0; allocator *data_alloc = NULL; uint64_t 
used_blocks = 0; diff --git a/src/blockstore_rollback.cpp b/src/blockstore_rollback.cpp index cc686112..50b6eb88 100644 --- a/src/blockstore_rollback.cpp +++ b/src/blockstore_rollback.cpp @@ -162,7 +162,6 @@ void blockstore_impl_t::mark_rolled_back(const obj_ver_id & ov) unstable_writes.erase(unstab_it); else unstab_it->second = max_unstable; - unstable_count_changed = true; } } } diff --git a/src/blockstore_stable.cpp b/src/blockstore_stable.cpp index f3d4dc27..2dba0555 100644 --- a/src/blockstore_stable.cpp +++ b/src/blockstore_stable.cpp @@ -537,6 +537,5 @@ void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty) unstab_it->second <= v.version) { unstable_writes.erase(unstab_it); - unstable_count_changed = true; } } -- 2.30.2 From 8e25a28a08e7265c9d30e3dec7077bc51bfc3de2 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 8 Feb 2024 21:28:03 +0300 Subject: [PATCH 22/33] Ignore down OSDs in monitor statistics aggregation --- mon/mon.js | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/mon/mon.js b/mon/mon.js index ab7221bb..6e8864c9 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -675,7 +675,12 @@ class Mon { this.parse_kv(e.kv); const key = e.kv.key.substr(this.etcd_prefix.length); - if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/') + if (key.substr(0, 11) == '/osd/state/') + { + stats_changed = true; + changed = true; + } + else if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/') { stats_changed = true; } @@ -1635,9 +1640,13 @@ class Mon } const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} }; // Sum derived values instead of deriving summed - for (const osd in this.state.osd.stats) + for (const osd in this.state.osd.state) { const derived = this.prev_stats.osd_diff[osd]; + if (!this.state.osd.state[osd] || !derived) + { + continue; + } for (const 
type in sum_diff) { for (const op in derived[type]||{}) @@ -1738,9 +1747,13 @@ class Mon const used = this.state.pool.stats[pool_id].used_raw_tb; this.state.pool.stats[pool_id].used_raw_tb = Number(used)/1024/1024/1024/1024; } - for (const osd_num in this.state.osd.inodestats) + for (const osd_num in this.state.osd.state) { const ist = this.state.osd.inodestats[osd_num]; + if (!ist || !this.state.osd.state[osd_num]) + { + continue; + } for (const pool_id in ist) { inode_stats[pool_id] = inode_stats[pool_id] || {}; @@ -1756,9 +1769,14 @@ class Mon } } } - for (const osd in this.prev_stats.osd_diff) + for (const osd in this.state.osd.state) { - for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats) + const osd_diff = this.prev_stats.osd_diff[osd]; + if (!osd_diff || !this.state.osd.state[osd]) + { + continue; + } + for (const pool_id in osd_diff.inode_stats) { for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id]) { -- 2.30.2 From 27e9f244ec457739d9ce20ef429c39c0b009fc13 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Fri, 9 Feb 2024 00:29:31 +0300 Subject: [PATCH 23/33] Release 1.4.3 Hotfix for hotfix O:-) - "Write stall fix" was incomplete and EC write stalls could continue even on 1.4.2. Now they're finally fixed O:-) - Make monitor ignore statistics of stopped OSDs. 
Previously if you stopped all OSDs the last total I/O numbers would remain the same indefinitely --- CMakeLists.txt | 2 +- csi/Makefile | 2 +- csi/deploy/004-csi-nodeplugin.yaml | 2 +- csi/deploy/007-csi-provisioner.yaml | 2 +- csi/src/config.go | 2 +- debian/changelog | 2 +- debian/vitastor.Dockerfile | 8 ++++---- mon/package.json | 2 +- patches/cinder-vitastor.py | 2 +- rpm/build-tarball.sh | 2 +- rpm/vitastor-el7.Dockerfile | 2 +- rpm/vitastor-el7.spec | 4 ++-- rpm/vitastor-el8.Dockerfile | 2 +- rpm/vitastor-el8.spec | 4 ++-- rpm/vitastor-el9.Dockerfile | 2 +- rpm/vitastor-el9.spec | 4 ++-- src/CMakeLists.txt | 2 +- src/vitastor.pc.in | 2 +- 18 files changed, 24 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a9586978..faa40448 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12) project(vitastor) -set(VERSION "1.4.2") +set(VERSION "1.4.3") add_subdirectory(src) diff --git a/csi/Makefile b/csi/Makefile index 8dba7f80..56cb9b37 100644 --- a/csi/Makefile +++ b/csi/Makefile @@ -1,4 +1,4 @@ -VERSION ?= v1.4.2 +VERSION ?= v1.4.3 all: build push diff --git a/csi/deploy/004-csi-nodeplugin.yaml b/csi/deploy/004-csi-nodeplugin.yaml index 0fb34181..d5badfac 100644 --- a/csi/deploy/004-csi-nodeplugin.yaml +++ b/csi/deploy/004-csi-nodeplugin.yaml @@ -49,7 +49,7 @@ spec: capabilities: add: ["SYS_ADMIN"] allowPrivilegeEscalation: true - image: vitalif/vitastor-csi:v1.4.2 + image: vitalif/vitastor-csi:v1.4.3 args: - "--node=$(NODE_ID)" - "--endpoint=$(CSI_ENDPOINT)" diff --git a/csi/deploy/007-csi-provisioner.yaml b/csi/deploy/007-csi-provisioner.yaml index 83601cc8..73b0ea7c 100644 --- a/csi/deploy/007-csi-provisioner.yaml +++ b/csi/deploy/007-csi-provisioner.yaml @@ -121,7 +121,7 @@ spec: privileged: true capabilities: add: ["SYS_ADMIN"] - image: vitalif/vitastor-csi:v1.4.2 + image: vitalif/vitastor-csi:v1.4.3 args: - "--node=$(NODE_ID)" - "--endpoint=$(CSI_ENDPOINT)" diff --git 
a/csi/src/config.go b/csi/src/config.go index 209cdd74..21091917 100644 --- a/csi/src/config.go +++ b/csi/src/config.go @@ -5,7 +5,7 @@ package vitastor const ( vitastorCSIDriverName = "csi.vitastor.io" - vitastorCSIDriverVersion = "1.4.2" + vitastorCSIDriverVersion = "1.4.3" ) // Config struct fills the parameters of request or user input diff --git a/debian/changelog b/debian/changelog index 6bcc6b93..18924223 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -vitastor (1.4.2-1) unstable; urgency=medium +vitastor (1.4.3-1) unstable; urgency=medium * Bugfixes diff --git a/debian/vitastor.Dockerfile b/debian/vitastor.Dockerfile index 2ebbc5d3..d002c826 100644 --- a/debian/vitastor.Dockerfile +++ b/debian/vitastor.Dockerfile @@ -35,8 +35,8 @@ RUN set -e -x; \ mkdir -p /root/packages/vitastor-$REL; \ rm -rf /root/packages/vitastor-$REL/*; \ cd /root/packages/vitastor-$REL; \ - cp -r /root/vitastor vitastor-1.4.2; \ - cd vitastor-1.4.2; \ + cp -r /root/vitastor vitastor-1.4.3; \ + cd vitastor-1.4.3; \ ln -s /root/fio-build/fio-*/ ./fio; \ FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \ @@ -49,8 +49,8 @@ RUN set -e -x; \ rm -rf a b; \ echo "dep:fio=$FIO" > debian/fio_version; \ cd /root/packages/vitastor-$REL; \ - tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.2.orig.tar.xz vitastor-1.4.2; \ - cd vitastor-1.4.2; \ + tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.3.orig.tar.xz vitastor-1.4.3; \ + cd vitastor-1.4.3; \ V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ DEBFULLNAME="Vitaliy Filippov " dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \ DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \ diff --git a/mon/package.json b/mon/package.json index ed521bbf..769fef1c 100644 --- a/mon/package.json +++ b/mon/package.json @@ -1,6 
+1,6 @@ { "name": "vitastor-mon", - "version": "1.4.2", + "version": "1.4.3", "description": "Vitastor SDS monitor service", "main": "mon-main.js", "scripts": { diff --git a/patches/cinder-vitastor.py b/patches/cinder-vitastor.py index e2d04ed4..fb410764 100644 --- a/patches/cinder-vitastor.py +++ b/patches/cinder-vitastor.py @@ -50,7 +50,7 @@ from cinder.volume import configuration from cinder.volume import driver from cinder.volume import volume_utils -VERSION = '1.4.2' +VERSION = '1.4.3' LOG = logging.getLogger(__name__) diff --git a/rpm/build-tarball.sh b/rpm/build-tarball.sh index 16816088..74437785 100755 --- a/rpm/build-tarball.sh +++ b/rpm/build-tarball.sh @@ -24,4 +24,4 @@ rm fio mv fio-copy fio FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'` perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec -tar --transform 's#^#vitastor-1.4.2/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.2$(rpm --eval '%dist').tar.gz * +tar --transform 's#^#vitastor-1.4.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.3$(rpm --eval '%dist').tar.gz * diff --git a/rpm/vitastor-el7.Dockerfile b/rpm/vitastor-el7.Dockerfile index 3216f725..74654034 100644 --- a/rpm/vitastor-el7.Dockerfile +++ b/rpm/vitastor-el7.Dockerfile @@ -36,7 +36,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.2.el7.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.3.el7.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el7.spec b/rpm/vitastor-el7.spec index f59dac26..f850272c 100644 --- a/rpm/vitastor-el7.spec +++ b/rpm/vitastor-el7.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.2 +Version: 1.4.3 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.2.el7.tar.gz +Source0: vitastor-1.4.3.el7.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/rpm/vitastor-el8.Dockerfile b/rpm/vitastor-el8.Dockerfile index ffd0d6b4..3a039103 100644 --- a/rpm/vitastor-el8.Dockerfile +++ b/rpm/vitastor-el8.Dockerfile @@ -35,7 +35,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.2.el8.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.3.el8.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el8.spec b/rpm/vitastor-el8.spec index 3a2257e3..f3da7ea3 100644 --- a/rpm/vitastor-el8.spec +++ b/rpm/vitastor-el8.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.2 +Version: 1.4.3 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.2.el8.tar.gz +Source0: vitastor-1.4.3.el8.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/rpm/vitastor-el9.Dockerfile b/rpm/vitastor-el9.Dockerfile index 6d8fdc3c..198952fd 100644 --- a/rpm/vitastor-el9.Dockerfile +++ b/rpm/vitastor-el9.Dockerfile @@ -18,7 +18,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.2.el9.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.3.el9.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el9.spec b/rpm/vitastor-el9.spec index bdfae3d1..7bcfb1ad 100644 --- a/rpm/vitastor-el9.spec +++ b/rpm/vitastor-el9.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.2 +Version: 1.4.3 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.2.el9.tar.gz +Source0: vitastor-1.4.3.el9.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3fccbc5f..d0cae0ae 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$") set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") endif() -add_definitions(-DVERSION="1.4.2") +add_definitions(-DVERSION="1.4.3") add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src) add_link_options(-fno-omit-frame-pointer) if (${WITH_ASAN}) diff --git a/src/vitastor.pc.in b/src/vitastor.pc.in index 0cf0aa80..d9b3d8a6 100644 --- a/src/vitastor.pc.in +++ b/src/vitastor.pc.in @@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ Name: Vitastor Description: Vitastor client library -Version: 1.4.2 +Version: 1.4.3 Libs: -L${libdir} -lvitastor_client Cflags: -I${includedir} -- 2.30.2 From c53357ac4547df17cbb23a7264c2961c495bd4a3 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 10 Feb 2024 12:06:27 +0300 Subject: [PATCH 24/33] Add a test for EC segfault with partial overwrite in 1234 -> 5030 rebalance scenario 
--- src/osd_rmw_test.cpp | 99 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/src/osd_rmw_test.cpp b/src/osd_rmw_test.cpp index 2e1542d7..a71aab3d 100644 --- a/src/osd_rmw_test.cpp +++ b/src/osd_rmw_test.cpp @@ -30,6 +30,7 @@ void test16(); void test_recover_22_d2(); void test_ec43_error_bruteforce(); void test_recover_53_d5(); +void test_recover_22(); int main(int narg, char *args[]) { @@ -70,6 +71,8 @@ int main(int narg, char *args[]) test_ec43_error_bruteforce(); // Test 19 test_recover_53_d5(); + // Test 20 + test_recover_22(); // End printf("all ok\n"); return 0; @@ -1244,3 +1247,99 @@ void test_recover_53_d5() // Done use_ec(8, 5, false); } + +void test_recover_22() +{ + const int bmp = 128*1024 / 4096 / 8; + use_ec(4, 2, true); + osd_num_t osd_set[4] = { 1, 2, 3, 4 }; + osd_num_t write_osd_set[4] = { 5, 0, 3, 0 }; + osd_rmw_stripe_t stripes[4] = {}; + unsigned bitmaps[4] = { 0 }; + // split + void *write_buf = (uint8_t*)malloc_or_die(4096); + set_pattern(write_buf, 4096, PATTERN0); + split_stripes(2, 128*1024, 120*1024, 4096, stripes); + assert(stripes[0].req_start == 120*1024 && stripes[0].req_end == 124*1024); + assert(stripes[1].req_start == 0 && stripes[1].req_end == 0); + assert(stripes[2].req_start == 0 && stripes[2].req_end == 0); + assert(stripes[3].req_start == 0 && stripes[3].req_end == 0); + // calc_rmw + void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 2, write_osd_set, 128*1024, bmp); + for (int i = 0; i < 4; i++) + stripes[i].bmp_buf = bitmaps+i; + assert(rmw_buf); + assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024); + assert(stripes[1].read_start == 120*1024 && stripes[1].read_end == 124*1024); + assert(stripes[2].read_start == 0 && stripes[2].read_end == 0); + assert(stripes[3].read_start == 0 && stripes[3].read_end == 0); + assert(stripes[0].write_start == 120*1024 && stripes[0].write_end == 124*1024); + assert(stripes[1].write_start == 0 && stripes[1].write_end == 0); 
+ assert(stripes[2].write_start == 120*1024 && stripes[2].write_end == 124*1024); + assert(stripes[3].write_start == 0 && stripes[3].write_end == 0); + assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024); + assert(stripes[1].read_buf == (uint8_t*)rmw_buf+132*1024); + assert(stripes[2].read_buf == NULL); + assert(stripes[3].read_buf == NULL); + assert(stripes[0].write_buf == write_buf); + assert(stripes[1].write_buf == NULL); + assert(stripes[2].write_buf == (uint8_t*)rmw_buf); + assert(stripes[3].write_buf == NULL); + // encode + set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); + set_pattern(stripes[1].read_buf, 4*1024, PATTERN2); + memset(stripes[0].bmp_buf, 0xff, bmp); + memset(stripes[1].bmp_buf, 0xff, bmp); + calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 128*1024, bmp); + assert(*(uint32_t*)stripes[2].bmp_buf == 0); + assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024); + assert(stripes[1].write_start == 0 && stripes[1].write_end == 0); + assert(stripes[2].write_start == 120*1024 && stripes[2].write_end == 124*1024); + assert(stripes[3].write_start == 0 && stripes[3].write_end == 0); + assert(stripes[0].write_buf == stripes[0].read_buf); + assert(stripes[1].write_buf == NULL); + assert(stripes[2].write_buf == (uint8_t*)rmw_buf); + assert(stripes[3].write_buf == NULL); + check_pattern(stripes[2].write_buf, 4*1024, PATTERN0^PATTERN2); + // decode and verify + memset(stripes, 0, sizeof(stripes)); + split_stripes(2, 128*1024, 0, 256*1024, stripes); + assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024); + assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024); + assert(stripes[2].req_start == 0 && stripes[2].req_end == 0); + assert(stripes[3].req_start == 0 && stripes[3].req_end == 0); + for (int role = 0; role < 4; role++) + { + stripes[role].read_start = stripes[role].req_start; + stripes[role].read_end = stripes[role].req_end; + } + assert(extend_missing_stripes(stripes, write_osd_set, 2, 4) == 
0); + assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024); + assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024); + assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024); + assert(stripes[3].read_start == 0 && stripes[3].read_end == 0); + void *read_buf = alloc_read_buffer(stripes, 4, 0); + for (int i = 0; i < 4; i++) + stripes[i].bmp_buf = bitmaps+i; + assert(read_buf); + assert(stripes[0].read_buf == read_buf); + assert(stripes[1].read_buf == (uint8_t*)read_buf+128*1024); + assert(stripes[2].read_buf == (uint8_t*)read_buf+2*128*1024); + set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); + set_pattern(stripes[0].read_buf+120*1024, 4*1024, PATTERN0); + set_pattern(stripes[2].read_buf, 128*1024, PATTERN1^PATTERN2); + set_pattern(stripes[2].read_buf+120*1024, 4*1024, PATTERN0^PATTERN2); + memset(stripes[0].bmp_buf, 0xff, bmp); + memset(stripes[2].bmp_buf, 0, bmp); + bitmaps[1] = 0; + bitmaps[3] = 0; + reconstruct_stripes_ec(stripes, 4, 2, bmp); + assert(bitmaps[0] == 0xFFFFFFFF); + assert(*(uint32_t*)stripes[1].bmp_buf == 0xFFFFFFFF); + check_pattern(stripes[1].read_buf, 128*1024, PATTERN2); + free(read_buf); + // Done + free(rmw_buf); + free(write_buf); + use_ec(4, 2, false); +} -- 2.30.2 From e7ac855b0714ee3a77346310f07158cf25494c2e Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 10 Feb 2024 12:06:39 +0300 Subject: [PATCH 25/33] Fix that EC segfault (1234 -> 5030 partial overwrite) --- src/osd_rmw.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/osd_rmw.cpp b/src/osd_rmw.cpp index 7788cf8b..eebcc265 100644 --- a/src/osd_rmw.cpp +++ b/src/osd_rmw.cpp @@ -861,15 +861,15 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t start, uint32_t end) { - if (write_osd_set != 
read_osd_set) + if (write_osd_set != read_osd_set && end != 0) { for (int role = pg_minsize; role < pg_size; role++) { - if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size)) + if (write_osd_set[role] != read_osd_set[role] && write_osd_set[role] != 0 && (start != 0 || end != chunk_size)) { // Copy new parity into the read buffer to write it back memcpy( - (uint8_t*)stripes[role].read_buf + start, + (uint8_t*)stripes[role].read_buf + start - stripes[role].read_start, stripes[role].write_buf, end - start ); -- 2.30.2 From 1e3c4edea0a8b5d2360f6e582b3fe87dd6cc9d04 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 11 Feb 2024 02:26:37 +0300 Subject: [PATCH 26/33] Print etcd dbSize instead of dbSizeInUse in status --- src/cli_status.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cli_status.cpp b/src/cli_status.cpp index f9fcb16c..85606cb2 100644 --- a/src/cli_status.cpp +++ b/src/cli_status.cpp @@ -106,7 +106,7 @@ resume_2: if (etcd_states[i]["error"].is_null()) { etcd_alive++; - etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value(); + etcd_db_size = etcd_states[i]["dbSize"].uint64_value(); } } int mon_count = 0; -- 2.30.2 From 38ba76e89395e9e1d58891fa40ec819debd58733 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 11 Feb 2024 02:28:38 +0300 Subject: [PATCH 27/33] Fix flusher sometimes being unable to trim journal when the flush queue is empty --- src/blockstore_flush.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/blockstore_flush.cpp b/src/blockstore_flush.cpp index 66eb8ea6..290a4dbf 100644 --- a/src/blockstore_flush.cpp +++ b/src/blockstore_flush.cpp @@ -184,8 +184,7 @@ void journal_flusher_t::mark_trim_possible() if (trim_wanted > 0) { dequeuing = true; - if (!journal_trim_counter) - journal_trim_counter = journal_trim_interval; + journal_trim_counter = 0; bs->ringloop->wakeup(); } } @@ -366,7 +365,7 @@ resume_0: !flusher->flush_queue.size() || 
!flusher->dequeuing) { stop_flusher: - if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0) + if (flusher->trim_wanted > 0 && !flusher->journal_trim_counter) { // Attempt forced trim flusher->active_flushers++; @@ -1346,7 +1345,6 @@ bool journal_flusher_co::trim_journal(int wait_base) else if (wait_state == wait_base+2) goto resume_2; else if (wait_state == wait_base+3) goto resume_3; else if (wait_state == wait_base+4) goto resume_4; - flusher->journal_trim_counter = 0; new_trim_pos = bs->journal.get_trim_pos(); if (new_trim_pos != bs->journal.used_start) { @@ -1419,6 +1417,7 @@ bool journal_flusher_co::trim_journal(int wait_base) exit(0); } } + flusher->journal_trim_counter = 0; flusher->trimming = false; } return true; -- 2.30.2 From ca34a6047a56528d061aed65e74dcb4bdc06951f Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 11 Feb 2024 02:29:41 +0300 Subject: [PATCH 28/33] Fix dynamic journal space reservation: include the new write itself, too --- src/blockstore_sync.cpp | 6 +++--- src/blockstore_write.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/blockstore_sync.cpp b/src/blockstore_sync.cpp index 50891a13..d29035b5 100644 --- a/src/blockstore_sync.cpp +++ b/src/blockstore_sync.cpp @@ -76,6 +76,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op) // 2nd step: Data device is synced, prepare & write journal entries // Check space in the journal and journal memory buffers blockstore_journal_check_t space_check(this); + auto reservation = (unstable_writes.size()+unstable_unsynced+PRIV(op)->sync_big_writes.size())*journal.block_size; if (dsk.csum_block_size) { // More complex check because all journal entries have different lengths @@ -85,15 +86,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op) left--; auto & dirty_entry = dirty_db.at(sbw); uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len); - if (!space_check.check_available(op, 1, 
sizeof(journal_entry_big_write) + dyn_size, 0)) + if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, left ? 0 : reservation)) { return 0; } } } else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(), - sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, - (unstable_writes.size()+unstable_unsynced)*journal.block_size)) + sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, reservation)) { return 0; } diff --git a/src/blockstore_write.cpp b/src/blockstore_write.cpp index 8f314ccf..fe768f88 100644 --- a/src/blockstore_write.cpp +++ b/src/blockstore_write.cpp @@ -320,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) blockstore_journal_check_t space_check(this); if (!space_check.check_available(op, unsynced_big_write_count + 1, sizeof(journal_entry_big_write) + dsk.clean_dyn_size, - (unstable_writes.size()+unstable_unsynced)*journal.block_size)) + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size)) { return 0; } @@ -412,7 +412,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0) || !space_check.check_available(op, 1, sizeof(journal_entry_small_write) + dyn_size, - op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size)) + op->len + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size)) { return 0; } @@ -549,7 +549,7 @@ resume_2: uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len); blockstore_journal_check_t space_check(this); if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, - (unstable_writes.size()+unstable_unsynced)*journal.block_size)) + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 
0 : 1))*journal.block_size)) { return 0; } -- 2.30.2 From b127da40f77c8e0338bc812a96707c6569f16a51 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 11 Feb 2024 02:33:30 +0300 Subject: [PATCH 29/33] Add a FIXME about incomplete PGs --- src/osd_peering.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/osd_peering.cpp b/src/osd_peering.cpp index 40657369..d8aff7dc 100644 --- a/src/osd_peering.cpp +++ b/src/osd_peering.cpp @@ -222,6 +222,9 @@ void osd_t::start_pg_peering(pg_t & pg) } if (pg.pg_cursize < pg.pg_minsize) { + // FIXME: Incomplete EC PGs may currently easily lead to write hangs ("slow ops" in OSD logs) + // because such PGs don't flush unstable entries on secondary OSDs so they can't remove these + // entries from their journals... pg.state = PG_INCOMPLETE; report_pg_state(pg); return; -- 2.30.2 From bb2f395f1e0a4c448c5253af0feb9fa22187fca6 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 11 Feb 2024 12:16:20 +0300 Subject: [PATCH 30/33] Add cutoff threshold for recovery auto-tuning --- docs/config/monitor.en.md | 4 ++-- docs/config/monitor.ru.md | 4 ++-- docs/config/network.en.md | 4 ++-- docs/config/network.ru.md | 4 ++-- docs/config/osd.en.md | 14 ++++++++++++-- docs/config/osd.ru.md | 13 ++++++++++++- docs/config/src/osd.yml | 17 ++++++++++++++--- src/osd.cpp | 2 ++ src/osd.h | 1 + src/osd_flush.cpp | 4 ++++ 10 files changed, 53 insertions(+), 14 deletions(-) diff --git a/docs/config/monitor.en.md b/docs/config/monitor.en.md index 9bf3f7cc..a2583ca7 100644 --- a/docs/config/monitor.en.md +++ b/docs/config/monitor.en.md @@ -19,8 +19,8 @@ These parameters only apply to Monitors. 
## etcd_mon_ttl - Type: seconds -- Default: 30 -- Minimum: 10 +- Default: 1 +- Minimum: 5 Monitor etcd lease refresh interval in seconds diff --git a/docs/config/monitor.ru.md b/docs/config/monitor.ru.md index 12cca10a..9adab5da 100644 --- a/docs/config/monitor.ru.md +++ b/docs/config/monitor.ru.md @@ -19,8 +19,8 @@ ## etcd_mon_ttl - Тип: секунды -- Значение по умолчанию: 30 -- Минимальное значение: 10 +- Значение по умолчанию: 1 +- Минимальное значение: 5 Интервал обновления etcd резервации (lease) монитором diff --git a/docs/config/network.en.md b/docs/config/network.en.md index 547b675d..a28464ee 100644 --- a/docs/config/network.en.md +++ b/docs/config/network.en.md @@ -215,8 +215,8 @@ is scheduled. ## up_wait_retry_interval - Type: milliseconds -- Default: 500 -- Minimum: 50 +- Default: 50 +- Minimum: 10 - Can be changed online: yes OSDs respond to clients with a special error code when they receive I/O diff --git a/docs/config/network.ru.md b/docs/config/network.ru.md index de1a65fb..1d3ceaa0 100644 --- a/docs/config/network.ru.md +++ b/docs/config/network.ru.md @@ -224,8 +224,8 @@ OSD в любом случае согласовывают реальное зн ## up_wait_retry_interval - Тип: миллисекунды -- Значение по умолчанию: 500 -- Минимальное значение: 50 +- Значение по умолчанию: 50 +- Минимальное значение: 10 - Можно менять на лету: да Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не diff --git a/docs/config/osd.en.md b/docs/config/osd.en.md index 27945633..165a79df 100644 --- a/docs/config/osd.en.md +++ b/docs/config/osd.en.md @@ -59,6 +59,7 @@ them, even without restarting by updating configuration in etcd. - [recovery_tune_client_util_high](#recovery_tune_client_util_high) - [recovery_tune_agg_interval](#recovery_tune_agg_interval) - [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us) +- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us) ## etcd_report_interval @@ -604,5 +605,14 @@ is usually fine. 
- Default: 10 - Can be changed online: yes -Minimum possible value for auto-tuned recovery_sleep_us. Values lower -than this value are changed to 0. +Minimum possible value for auto-tuned recovery_sleep_us. Lower values +are changed to 0. + +## recovery_tune_sleep_cutoff_us + +- Type: microseconds +- Default: 10000000 +- Can be changed online: yes + +Maximum possible value for auto-tuned recovery_sleep_us. Higher values +are treated as outliers and ignored in aggregation. diff --git a/docs/config/osd.ru.md b/docs/config/osd.ru.md index b337f8f3..66456088 100644 --- a/docs/config/osd.ru.md +++ b/docs/config/osd.ru.md @@ -60,6 +60,7 @@ - [recovery_tune_client_util_high](#recovery_tune_client_util_high) - [recovery_tune_agg_interval](#recovery_tune_agg_interval) - [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us) +- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us) ## etcd_report_interval @@ -634,4 +635,14 @@ EC (кодов коррекции ошибок) с более, чем 1 диск - Можно менять на лету: да Минимальное возможное значение авто-подстроенного recovery_sleep_us. -Значения ниже данного заменяются на 0. +Меньшие значения заменяются на 0. + +## recovery_tune_sleep_cutoff_us + +- Тип: микросекунды +- Значение по умолчанию: 10000000 +- Можно менять на лету: да + +Максимальное возможное значение авто-подстроенного recovery_sleep_us. +Большие значения считаются случайными выбросами и игнорируются в +усреднении. diff --git a/docs/config/src/osd.yml b/docs/config/src/osd.yml index c541c34e..474ed8bf 100644 --- a/docs/config/src/osd.yml +++ b/docs/config/src/osd.yml @@ -731,8 +731,19 @@ default: 10 online: true info: | - Minimum possible value for auto-tuned recovery_sleep_us. Values lower - than this value are changed to 0. + Minimum possible value for auto-tuned recovery_sleep_us. Lower values + are changed to 0. info_ru: | Минимальное возможное значение авто-подстроенного recovery_sleep_us. - Значения ниже данного заменяются на 0. 
+ Меньшие значения заменяются на 0. +- name: recovery_tune_sleep_cutoff_us + type: us + default: 10000000 + online: true + info: | + Maximum possible value for auto-tuned recovery_sleep_us. Higher values + are treated as outliers and ignored in aggregation. + info_ru: | + Максимальное возможное значение авто-подстроенного recovery_sleep_us. + Большие значения считаются случайными выбросами и игнорируются в + усреднении. diff --git a/src/osd.cpp b/src/osd.cpp index 23a006e5..134b31d6 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -233,6 +233,8 @@ void osd_t::parse_config(bool init) ? 10 : config["recovery_tune_agg_interval"].uint64_value(); recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null() ? 10 : config["recovery_tune_sleep_min_us"].uint64_value(); + recovery_tune_sleep_cutoff_us = config["recovery_tune_sleep_cutoff_us"].is_null() + ? 10000000 : config["recovery_tune_sleep_cutoff_us"].uint64_value(); recovery_pg_switch = config["recovery_pg_switch"].uint64_value(); if (recovery_pg_switch < 1) recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH; diff --git a/src/osd.h b/src/osd.h index 49f27c15..c5079435 100644 --- a/src/osd.h +++ b/src/osd.h @@ -125,6 +125,7 @@ class osd_t int recovery_tune_interval = 1; int recovery_tune_agg_interval = 10; int recovery_tune_sleep_min_us = 10; + int recovery_tune_sleep_cutoff_us = 10000000; int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH; int recovery_sync_batch = DEFAULT_RECOVERY_BATCH; int inode_vanish_time = 60; diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index 2401948d..c8a9f5ad 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -422,6 +422,10 @@ void osd_t::tune_recovery() rtune_avg_lat = total_recovery_usec/recovery_count; uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util; auto sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? 
target_lat-rtune_avg_lat : 0; + if (sleep_us > recovery_tune_sleep_cutoff_us) + { + return; + } if (recovery_target_sleep_items.size() != recovery_tune_agg_interval) { recovery_target_sleep_items.resize(recovery_tune_agg_interval); -- 2.30.2 From 978bdc128a6a9429f54a7abe93068522234547e9 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 11 Feb 2024 12:19:08 +0300 Subject: [PATCH 31/33] Apply recovery pause before writes, after commits, and do not apply it to syncs to not block EC pools from functioning --- src/osd.h | 1 + src/osd_flush.cpp | 2 +- src/osd_secondary.cpp | 24 ++++++++++++++++++++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/osd.h b/src/osd.h index c5079435..1718a8dd 100644 --- a/src/osd.h +++ b/src/osd.h @@ -283,6 +283,7 @@ class osd_t void exec_sync_stab_all(osd_op_t *cur_op); void exec_show_config(osd_op_t *cur_op); void exec_secondary(osd_op_t *cur_op); + void exec_secondary_real(osd_op_t *cur_op); void secondary_op_callback(osd_op_t *cur_op); // primary ops diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index c8a9f5ad..5f81240a 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -442,7 +442,7 @@ void osd_t::tune_recovery() if (recovery_target_sleep_count < recovery_tune_agg_interval) recovery_target_sleep_count++; recovery_target_sleep_us = recovery_target_sleep_total / recovery_target_sleep_count; - if (log_level > 4) + if (log_level > 1) { printf( "[OSD %lu] auto-tune: client util: %.2f, recovery util: %.2f, lat: %lu us -> target util %.2f, delay %lu us\n", diff --git a/src/osd_secondary.cpp b/src/osd_secondary.cpp index d8fe2627..61c26d87 100644 --- a/src/osd_secondary.cpp +++ b/src/osd_secondary.cpp @@ -42,8 +42,10 @@ void osd_t::secondary_op_callback(osd_op_t *op) int retval = op->bs_op->retval; delete op->bs_op; op->bs_op = NULL; - if (op->is_recovery_related() && recovery_target_sleep_us) + if (op->is_recovery_related() && recovery_target_sleep_us && + op->req.hdr.opcode == OSD_OP_SEC_STABILIZE) { 
+ // Apply pause AFTER commit. Do not apply pause to SYNC at all if (!op->tv_end.tv_sec) { clock_gettime(CLOCK_REALTIME, &op->tv_end); @@ -59,7 +61,25 @@ void osd_t::secondary_op_callback(osd_op_t *op) } } -void osd_t::exec_secondary(osd_op_t *cur_op) +void osd_t::exec_secondary(osd_op_t *op) +{ + if (op->is_recovery_related() && recovery_target_sleep_us && + op->req.hdr.opcode != OSD_OP_SEC_STABILIZE && op->req.hdr.opcode != OSD_OP_SEC_SYNC) + { + // Apply pause BEFORE write/delete + tfd->set_timer_us(recovery_target_sleep_us, false, [this, op](int timer_id) + { + clock_gettime(CLOCK_REALTIME, &op->tv_begin); + exec_secondary_real(op); + }); + } + else + { + exec_secondary_real(op); + } +} + +void osd_t::exec_secondary_real(osd_op_t *cur_op) { if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP) { -- 2.30.2 From 2947ea93e851c0af74592c4ce36f7b9444f1746d Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 11 Feb 2024 16:09:53 +0300 Subject: [PATCH 32/33] Raise test_snapshot_chain_ec timeout to 6 minutes --- .gitea/workflows/test.yml | 2 +- .gitea/workflows/tests-to-yaml.pl | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitea/workflows/test.yml b/.gitea/workflows/test.yml index 9bb0816a..683c1d58 100644 --- a/.gitea/workflows/test.yml +++ b/.gitea/workflows/test.yml @@ -395,7 +395,7 @@ jobs: steps: - name: Run test id: test - timeout-minutes: 3 + timeout-minutes: 6 run: SCHEME=ec /root/vitastor/tests/test_snapshot_chain.sh - name: Print logs if: always() && steps.test.outcome == 'failure' diff --git a/.gitea/workflows/tests-to-yaml.pl b/.gitea/workflows/tests-to-yaml.pl index ee497795..4a820331 100755 --- a/.gitea/workflows/tests-to-yaml.pl +++ b/.gitea/workflows/tests-to-yaml.pl @@ -39,6 +39,10 @@ for my $line (<>) $test_name .= '_'.lc($1).'_'.$2; } } + if ($test_name eq 'test_snapshot_chain_ec') + { + $timeout = 6; + } $line =~ s!\./test_!/root/vitastor/tests/test_!; # Gitea CI doesn't support artifacts yet, lol #- name: Upload results 
-- 2.30.2 From c777a0041aa55af4a0bf07477e99ca206b0b4b10 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 11 Feb 2024 16:22:58 +0300 Subject: [PATCH 33/33] Release 1.4.4 A couple of fixes for EC pools - Fix a segfault possible on partial EC overwrite in 1234 -> 5030 rebalance scenario - Fix two problems leading to EC pools stalling on rebalance & parallel sudden stops of OSDs, for example during a sudden poweroff of a host: - Recovery auto-tuning (1.4.0 feature) could apply too large delays and stall the EC journal - fixed by limiting delays with a new recovery_tune_sleep_cutoff_us parameter (10 seconds by default) and applying recovery pauses before write operations, not after them, to not occupy space in the journal for long time - Dynamic journal space reservation (1.3.0 feature) wasn't accounting new writes when checking the limit so OSDs could still fill the journal fully and stall - fixed by including new writes into the limit - Print etcd dbSize instead of dbSizeInUse in status --- CMakeLists.txt | 2 +- csi/Makefile | 2 +- csi/deploy/004-csi-nodeplugin.yaml | 2 +- csi/deploy/007-csi-provisioner.yaml | 2 +- csi/src/config.go | 2 +- debian/changelog | 2 +- debian/vitastor.Dockerfile | 8 ++++---- mon/package.json | 2 +- patches/cinder-vitastor.py | 2 +- rpm/build-tarball.sh | 2 +- rpm/vitastor-el7.Dockerfile | 2 +- rpm/vitastor-el7.spec | 4 ++-- rpm/vitastor-el8.Dockerfile | 2 +- rpm/vitastor-el8.spec | 4 ++-- rpm/vitastor-el9.Dockerfile | 2 +- rpm/vitastor-el9.spec | 4 ++-- src/CMakeLists.txt | 2 +- src/vitastor.pc.in | 2 +- 18 files changed, 24 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index faa40448..5340df9d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12) project(vitastor) -set(VERSION "1.4.3") +set(VERSION "1.4.4") add_subdirectory(src) diff --git a/csi/Makefile b/csi/Makefile index 56cb9b37..962f1d6d 100644 --- a/csi/Makefile +++ b/csi/Makefile @@ -1,4 +1,4 
@@ -VERSION ?= v1.4.3 +VERSION ?= v1.4.4 all: build push diff --git a/csi/deploy/004-csi-nodeplugin.yaml b/csi/deploy/004-csi-nodeplugin.yaml index d5badfac..f00b9b11 100644 --- a/csi/deploy/004-csi-nodeplugin.yaml +++ b/csi/deploy/004-csi-nodeplugin.yaml @@ -49,7 +49,7 @@ spec: capabilities: add: ["SYS_ADMIN"] allowPrivilegeEscalation: true - image: vitalif/vitastor-csi:v1.4.3 + image: vitalif/vitastor-csi:v1.4.4 args: - "--node=$(NODE_ID)" - "--endpoint=$(CSI_ENDPOINT)" diff --git a/csi/deploy/007-csi-provisioner.yaml b/csi/deploy/007-csi-provisioner.yaml index 73b0ea7c..b449bccf 100644 --- a/csi/deploy/007-csi-provisioner.yaml +++ b/csi/deploy/007-csi-provisioner.yaml @@ -121,7 +121,7 @@ spec: privileged: true capabilities: add: ["SYS_ADMIN"] - image: vitalif/vitastor-csi:v1.4.3 + image: vitalif/vitastor-csi:v1.4.4 args: - "--node=$(NODE_ID)" - "--endpoint=$(CSI_ENDPOINT)" diff --git a/csi/src/config.go b/csi/src/config.go index 21091917..3d4c243b 100644 --- a/csi/src/config.go +++ b/csi/src/config.go @@ -5,7 +5,7 @@ package vitastor const ( vitastorCSIDriverName = "csi.vitastor.io" - vitastorCSIDriverVersion = "1.4.3" + vitastorCSIDriverVersion = "1.4.4" ) // Config struct fills the parameters of request or user input diff --git a/debian/changelog b/debian/changelog index 18924223..de4be154 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -vitastor (1.4.3-1) unstable; urgency=medium +vitastor (1.4.4-1) unstable; urgency=medium * Bugfixes diff --git a/debian/vitastor.Dockerfile b/debian/vitastor.Dockerfile index d002c826..af4259e0 100644 --- a/debian/vitastor.Dockerfile +++ b/debian/vitastor.Dockerfile @@ -35,8 +35,8 @@ RUN set -e -x; \ mkdir -p /root/packages/vitastor-$REL; \ rm -rf /root/packages/vitastor-$REL/*; \ cd /root/packages/vitastor-$REL; \ - cp -r /root/vitastor vitastor-1.4.3; \ - cd vitastor-1.4.3; \ + cp -r /root/vitastor vitastor-1.4.4; \ + cd vitastor-1.4.4; \ ln -s /root/fio-build/fio-*/ ./fio; \ FIO=$(head -n1 
fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \ @@ -49,8 +49,8 @@ RUN set -e -x; \ rm -rf a b; \ echo "dep:fio=$FIO" > debian/fio_version; \ cd /root/packages/vitastor-$REL; \ - tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.3.orig.tar.xz vitastor-1.4.3; \ - cd vitastor-1.4.3; \ + tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.4.orig.tar.xz vitastor-1.4.4; \ + cd vitastor-1.4.4; \ V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ DEBFULLNAME="Vitaliy Filippov " dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \ DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \ diff --git a/mon/package.json b/mon/package.json index 769fef1c..3bea81ac 100644 --- a/mon/package.json +++ b/mon/package.json @@ -1,6 +1,6 @@ { "name": "vitastor-mon", - "version": "1.4.3", + "version": "1.4.4", "description": "Vitastor SDS monitor service", "main": "mon-main.js", "scripts": { diff --git a/patches/cinder-vitastor.py b/patches/cinder-vitastor.py index fb410764..4c766beb 100644 --- a/patches/cinder-vitastor.py +++ b/patches/cinder-vitastor.py @@ -50,7 +50,7 @@ from cinder.volume import configuration from cinder.volume import driver from cinder.volume import volume_utils -VERSION = '1.4.3' +VERSION = '1.4.4' LOG = logging.getLogger(__name__) diff --git a/rpm/build-tarball.sh b/rpm/build-tarball.sh index 74437785..5b700149 100755 --- a/rpm/build-tarball.sh +++ b/rpm/build-tarball.sh @@ -24,4 +24,4 @@ rm fio mv fio-copy fio FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'` perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec -tar --transform 's#^#vitastor-1.4.3/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.3$(rpm --eval '%dist').tar.gz * +tar 
--transform 's#^#vitastor-1.4.4/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.4$(rpm --eval '%dist').tar.gz * diff --git a/rpm/vitastor-el7.Dockerfile b/rpm/vitastor-el7.Dockerfile index 74654034..21424a09 100644 --- a/rpm/vitastor-el7.Dockerfile +++ b/rpm/vitastor-el7.Dockerfile @@ -36,7 +36,7 @@ ADD . /root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.3.el7.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.4.el7.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el7.spec b/rpm/vitastor-el7.spec index f850272c..f8c13724 100644 --- a/rpm/vitastor-el7.spec +++ b/rpm/vitastor-el7.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.3 +Version: 1.4.4 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.3.el7.tar.gz +Source0: vitastor-1.4.4.el7.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/rpm/vitastor-el8.Dockerfile b/rpm/vitastor-el8.Dockerfile index 3a039103..72276be3 100644 --- a/rpm/vitastor-el8.Dockerfile +++ b/rpm/vitastor-el8.Dockerfile @@ -35,7 +35,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.3.el8.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.4.el8.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el8.spec b/rpm/vitastor-el8.spec index f3da7ea3..41cae854 100644 --- a/rpm/vitastor-el8.spec +++ b/rpm/vitastor-el8.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.3 +Version: 1.4.4 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.3.el8.tar.gz +Source0: vitastor-1.4.4.el8.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/rpm/vitastor-el9.Dockerfile b/rpm/vitastor-el9.Dockerfile index 198952fd..1f1939db 100644 --- a/rpm/vitastor-el9.Dockerfile +++ b/rpm/vitastor-el9.Dockerfile @@ -18,7 +18,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.3.el9.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.4.el9.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el9.spec b/rpm/vitastor-el9.spec index 7bcfb1ad..4c3e9557 100644 --- a/rpm/vitastor-el9.spec +++ b/rpm/vitastor-el9.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.3 +Version: 1.4.4 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.3.el9.tar.gz +Source0: vitastor-1.4.4.el9.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d0cae0ae..751c62b0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$") set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") endif() -add_definitions(-DVERSION="1.4.3") +add_definitions(-DVERSION="1.4.4") add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src) add_link_options(-fno-omit-frame-pointer) if (${WITH_ASAN}) diff --git a/src/vitastor.pc.in b/src/vitastor.pc.in index d9b3d8a6..71932bb8 100644 --- a/src/vitastor.pc.in +++ b/src/vitastor.pc.in @@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ Name: Vitastor Description: Vitastor client library -Version: 1.4.3 +Version: 1.4.4 Libs: -L${libdir} -lvitastor_client Cflags: -I${includedir} -- 2.30.2