diff --git a/.gitea/workflows/test.yml b/.gitea/workflows/test.yml index 421089ed..683c1d58 100644 --- a/.gitea/workflows/test.yml +++ b/.gitea/workflows/test.yml @@ -395,7 +395,7 @@ jobs: steps: - name: Run test id: test - timeout-minutes: 3 + timeout-minutes: 6 run: SCHEME=ec /root/vitastor/tests/test_snapshot_chain.sh - name: Print logs if: always() && steps.test.outcome == 'failure' @@ -532,6 +532,24 @@ jobs: echo "" done + test_switch_primary: + runs-on: ubuntu-latest + needs: build + container: ${{env.TEST_IMAGE}}:${{github.sha}} + steps: + - name: Run test + id: test + timeout-minutes: 3 + run: /root/vitastor/tests/test_switch_primary.sh + - name: Print logs + if: always() && steps.test.outcome == 'failure' + run: | + for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do + echo "-------- $i --------" + cat $i + echo "" + done + test_write: runs-on: ubuntu-latest needs: build diff --git a/.gitea/workflows/tests-to-yaml.pl b/.gitea/workflows/tests-to-yaml.pl index ee497795..4a820331 100755 --- a/.gitea/workflows/tests-to-yaml.pl +++ b/.gitea/workflows/tests-to-yaml.pl @@ -39,6 +39,10 @@ for my $line (<>) $test_name .= '_'.lc($1).'_'.$2; } } + if ($test_name eq 'test_snapshot_chain_ec') + { + $timeout = 6; + } $line =~ s!\./test_!/root/vitastor/tests/test_!; # Gitea CI doesn't support artifacts yet, lol #- name: Upload results diff --git a/CLA-en.md b/CLA-en.md index 0407b56e..b912f06f 100644 --- a/CLA-en.md +++ b/CLA-en.md @@ -38,7 +38,7 @@ in the offer. on behalf of third parties, including on behalf of his employer. 2. Subject of the Agreement. \ - 2.1. Subject of the Agreement shall be the Contributions sent to the Author by Contributors. + 2.1. Subject of the Agreement shall be the Contributions sent to the Author by Contributors. \ 2.2. 
The Contributor grants to the Author the right to use Contributions at his own discretion and without any necessity to get a prior approval from Contributor or any other third party in any way, under a simple (non-exclusive), royalty-free, @@ -86,7 +86,7 @@ in the offer. of their provision to the Author. \ 5.2. The Contributor represents and warrants that he legally owns exclusive intellectual property rights to the Contributions. \ - 5.3. The Contributor represents and warrants that any further use of \ + 5.3. The Contributor represents and warrants that any further use of Contributions by the Author as provided by Contributor under the terms of the Agreement does not infringe on intellectual and other rights and legitimate interests of third parties. \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fa27825..5340df9d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12) project(vitastor) -set(VERSION "1.4.0") +set(VERSION "1.4.4") add_subdirectory(src) diff --git a/cpp-btree b/cpp-btree index 45e6d1f1..8de8b467 160000 --- a/cpp-btree +++ b/cpp-btree @@ -1 +1 @@ -Subproject commit 45e6d1f13196a0824e2089a586c53b9de0283f17 +Subproject commit 8de8b467acbca50cfd8835c20e0e379110f3b32b diff --git a/csi/Makefile b/csi/Makefile index 86b563e4..962f1d6d 100644 --- a/csi/Makefile +++ b/csi/Makefile @@ -1,4 +1,4 @@ -VERSION ?= v1.4.0 +VERSION ?= v1.4.4 all: build push diff --git a/csi/deploy/004-csi-nodeplugin.yaml b/csi/deploy/004-csi-nodeplugin.yaml index b71c18c0..f00b9b11 100644 --- a/csi/deploy/004-csi-nodeplugin.yaml +++ b/csi/deploy/004-csi-nodeplugin.yaml @@ -49,7 +49,7 @@ spec: capabilities: add: ["SYS_ADMIN"] allowPrivilegeEscalation: true - image: vitalif/vitastor-csi:v1.4.0 + image: vitalif/vitastor-csi:v1.4.4 args: - "--node=$(NODE_ID)" - "--endpoint=$(CSI_ENDPOINT)" diff --git a/csi/deploy/007-csi-provisioner.yaml b/csi/deploy/007-csi-provisioner.yaml index c62791c3..b449bccf 100644 --- 
a/csi/deploy/007-csi-provisioner.yaml +++ b/csi/deploy/007-csi-provisioner.yaml @@ -121,7 +121,7 @@ spec: privileged: true capabilities: add: ["SYS_ADMIN"] - image: vitalif/vitastor-csi:v1.4.0 + image: vitalif/vitastor-csi:v1.4.4 args: - "--node=$(NODE_ID)" - "--endpoint=$(CSI_ENDPOINT)" diff --git a/csi/src/config.go b/csi/src/config.go index a3ceeb20..3d4c243b 100644 --- a/csi/src/config.go +++ b/csi/src/config.go @@ -5,7 +5,7 @@ package vitastor const ( vitastorCSIDriverName = "csi.vitastor.io" - vitastorCSIDriverVersion = "1.4.0" + vitastorCSIDriverVersion = "1.4.4" ) // Config struct fills the parameters of request or user input diff --git a/debian/changelog b/debian/changelog index 80884c55..de4be154 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -vitastor (1.4.0-1) unstable; urgency=medium +vitastor (1.4.4-1) unstable; urgency=medium * Bugfixes diff --git a/debian/vitastor.Dockerfile b/debian/vitastor.Dockerfile index 1d64949b..af4259e0 100644 --- a/debian/vitastor.Dockerfile +++ b/debian/vitastor.Dockerfile @@ -35,8 +35,8 @@ RUN set -e -x; \ mkdir -p /root/packages/vitastor-$REL; \ rm -rf /root/packages/vitastor-$REL/*; \ cd /root/packages/vitastor-$REL; \ - cp -r /root/vitastor vitastor-1.4.0; \ - cd vitastor-1.4.0; \ + cp -r /root/vitastor vitastor-1.4.4; \ + cd vitastor-1.4.4; \ ln -s /root/fio-build/fio-*/ ./fio; \ FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \ ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \ @@ -49,8 +49,8 @@ RUN set -e -x; \ rm -rf a b; \ echo "dep:fio=$FIO" > debian/fio_version; \ cd /root/packages/vitastor-$REL; \ - tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.0.orig.tar.xz vitastor-1.4.0; \ - cd vitastor-1.4.0; \ + tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.4.orig.tar.xz vitastor-1.4.4; \ + cd vitastor-1.4.4; \ V=$(head -n1 debian/changelog | perl -pe 
's/^.*\((.*?)\).*$/$1/'); \ DEBFULLNAME="Vitaliy Filippov " dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \ DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \ diff --git a/docs/config/monitor.en.md b/docs/config/monitor.en.md index 9bf3f7cc..a2583ca7 100644 --- a/docs/config/monitor.en.md +++ b/docs/config/monitor.en.md @@ -19,8 +19,8 @@ These parameters only apply to Monitors. ## etcd_mon_ttl - Type: seconds -- Default: 30 -- Minimum: 10 +- Default: 5 +- Minimum: 1 Monitor etcd lease refresh interval in seconds diff --git a/docs/config/monitor.ru.md b/docs/config/monitor.ru.md index 12cca10a..9adab5da 100644 --- a/docs/config/monitor.ru.md +++ b/docs/config/monitor.ru.md @@ -19,8 +19,8 @@ ## etcd_mon_ttl - Тип: секунды -- Значение по умолчанию: 30 -- Минимальное значение: 10 +- Значение по умолчанию: 5 +- Минимальное значение: 1 Интервал обновления etcd резервации (lease) монитором diff --git a/docs/config/network.en.md b/docs/config/network.en.md index 547b675d..a28464ee 100644 --- a/docs/config/network.en.md +++ b/docs/config/network.en.md @@ -215,8 +215,8 @@ is scheduled. 
## up_wait_retry_interval - Type: milliseconds -- Default: 500 -- Minimum: 50 +- Default: 50 +- Minimum: 10 - Can be changed online: yes OSDs respond to clients with a special error code when they receive I/O diff --git a/docs/config/network.ru.md b/docs/config/network.ru.md index de1a65fb..1d3ceaa0 100644 --- a/docs/config/network.ru.md +++ b/docs/config/network.ru.md @@ -224,8 +224,8 @@ OSD в любом случае согласовывают реальное зн ## up_wait_retry_interval - Тип: миллисекунды -- Значение по умолчанию: 500 -- Минимальное значение: 50 +- Значение по умолчанию: 50 +- Минимальное значение: 10 - Можно менять на лету: да Когда OSD получают от клиентов запросы ввода-вывода, относящиеся к не diff --git a/docs/config/osd.en.md b/docs/config/osd.en.md index 27945633..165a79df 100644 --- a/docs/config/osd.en.md +++ b/docs/config/osd.en.md @@ -59,6 +59,7 @@ them, even without restarting by updating configuration in etcd. - [recovery_tune_client_util_high](#recovery_tune_client_util_high) - [recovery_tune_agg_interval](#recovery_tune_agg_interval) - [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us) +- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us) ## etcd_report_interval @@ -604,5 +605,14 @@ is usually fine. - Default: 10 - Can be changed online: yes -Minimum possible value for auto-tuned recovery_sleep_us. Values lower -than this value are changed to 0. +Minimum possible value for auto-tuned recovery_sleep_us. Lower values +are changed to 0. + +## recovery_tune_sleep_cutoff_us + +- Type: microseconds +- Default: 10000000 +- Can be changed online: yes + +Maximum possible value for auto-tuned recovery_sleep_us. Higher values +are treated as outliers and ignored in aggregation. 
diff --git a/docs/config/osd.ru.md b/docs/config/osd.ru.md index b337f8f3..66456088 100644 --- a/docs/config/osd.ru.md +++ b/docs/config/osd.ru.md @@ -60,6 +60,7 @@ - [recovery_tune_client_util_high](#recovery_tune_client_util_high) - [recovery_tune_agg_interval](#recovery_tune_agg_interval) - [recovery_tune_sleep_min_us](#recovery_tune_sleep_min_us) +- [recovery_tune_sleep_cutoff_us](#recovery_tune_sleep_cutoff_us) ## etcd_report_interval @@ -634,4 +635,14 @@ EC (кодов коррекции ошибок) с более, чем 1 диск - Можно менять на лету: да Минимальное возможное значение авто-подстроенного recovery_sleep_us. -Значения ниже данного заменяются на 0. +Меньшие значения заменяются на 0. + +## recovery_tune_sleep_cutoff_us + +- Тип: микросекунды +- Значение по умолчанию: 10000000 +- Можно менять на лету: да + +Максимальное возможное значение авто-подстроенного recovery_sleep_us. +Большие значения считаются случайными выбросами и игнорируются в +усреднении. diff --git a/docs/config/src/monitor.yml b/docs/config/src/monitor.yml index 06f6a649..caf335e3 100644 --- a/docs/config/src/monitor.yml +++ b/docs/config/src/monitor.yml @@ -1,7 +1,7 @@ - name: etcd_mon_ttl type: sec - min: 10 - default: 30 + min: 1 + default: 5 info: Monitor etcd lease refresh interval in seconds info_ru: Интервал обновления etcd резервации (lease) монитором - name: etcd_mon_timeout diff --git a/docs/config/src/network.yml b/docs/config/src/network.yml index b95f48e2..5bd2c808 100644 --- a/docs/config/src/network.yml +++ b/docs/config/src/network.yml @@ -245,8 +245,8 @@ повторная попытка соединения. 
- name: up_wait_retry_interval type: ms - min: 50 - default: 500 + min: 10 + default: 50 online: true info: | OSDs respond to clients with a special error code when they receive I/O diff --git a/docs/config/src/osd.yml b/docs/config/src/osd.yml index c541c34e..474ed8bf 100644 --- a/docs/config/src/osd.yml +++ b/docs/config/src/osd.yml @@ -731,8 +731,19 @@ default: 10 online: true info: | - Minimum possible value for auto-tuned recovery_sleep_us. Values lower - than this value are changed to 0. + Minimum possible value for auto-tuned recovery_sleep_us. Lower values + are changed to 0. info_ru: | Минимальное возможное значение авто-подстроенного recovery_sleep_us. - Значения ниже данного заменяются на 0. + Меньшие значения заменяются на 0. +- name: recovery_tune_sleep_cutoff_us + type: us + default: 10000000 + online: true + info: | + Maximum possible value for auto-tuned recovery_sleep_us. Higher values + are treated as outliers and ignored in aggregation. + info_ru: | + Максимальное возможное значение авто-подстроенного recovery_sleep_us. + Большие значения считаются случайными выбросами и игнорируются в + усреднении. diff --git a/docs/performance/theoretical.en.md b/docs/performance/theoretical.en.md index 043bd7ea..b85d0a43 100644 --- a/docs/performance/theoretical.en.md +++ b/docs/performance/theoretical.en.md @@ -11,19 +11,26 @@ Replicated setups: - Single-threaded write+fsync latency: - With immediate commit: 2 network roundtrips + 1 disk write. - With lazy commit: 4 network roundtrips + 1 disk write + 1 disk flush. -- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)). -- Saturated parallel write iops: min(network bandwidth, sum(disk write iops / number of replicas / write amplification)). +- Linear read: `min(total network bandwidth, sum(disk read MB/s))`. +- Linear write: `min(total network bandwidth, sum(disk write MB/s / number of replicas))`. +- Saturated parallel read iops: `min(total network bandwidth, sum(disk read iops))`. 
+- Saturated parallel write iops: `min(total network bandwidth / number of replicas, sum(disk write iops / number of replicas / (write amplification = 4)))`. -EC/XOR setups: +EC/XOR setups (EC N+K): - Single-threaded (T1Q1) read latency: 1.5 network roundtrips + 1 disk read. - Single-threaded write+fsync latency: - With immediate commit: 3.5 network roundtrips + 1 disk read + 2 disk writes. - With lazy commit: 5.5 network roundtrips + 1 disk read + 2 disk writes + 2 disk fsyncs. - - 0.5 in actually (k-1)/k which means that an additional roundtrip doesn't happen when + - 0.5 is actually `(N-1)/N` which means that an additional roundtrip doesn't happen when the read sub-operation can be served locally. -- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)). -- Saturated parallel write iops: min(network bandwidth, sum(disk write iops * number of data drives / (number of data + parity drives) / write amplification)). - In fact, you should put disk write iops under the condition of ~10% reads / ~90% writes in this formula. +- Linear read: `min(total network bandwidth, sum(disk read MB/s))`. +- Linear write: `min(total network bandwidth, sum(disk write MB/s * N/(N+K)))`. +- Saturated parallel read iops: `min(total network bandwidth, sum(disk read iops))`. +- Saturated parallel write iops: roughly `total iops / (N+K) / WA`. More exactly, + `min(total network bandwidth * N/(N+K), sum(disk randrw iops / (N*4 + K*5 + 1)))` with + random read/write mix corresponding to `(N-1)/(N*4 + K*5 + 1)*100 % reads`. + - For example, with EC 2+1 it is: `(7% randrw iops) / 14`. + - With EC 6+3 it is: `(12.5% randrw iops) / 40`. Write amplification for 4 KB blocks is usually 3-5 in Vitastor: 1. 
Journal block write diff --git a/docs/performance/theoretical.ru.md b/docs/performance/theoretical.ru.md index e32c27d3..d258545a 100644 --- a/docs/performance/theoretical.ru.md +++ b/docs/performance/theoretical.ru.md @@ -11,20 +11,27 @@ - Запись+fsync в 1 поток: - С мгновенным сбросом: 2 RTT + 1 запись. - С отложенным ("ленивым") сбросом: 4 RTT + 1 запись + 1 fsync. -- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше. -- Параллельная запись: сумма IOPS всех дисков / число реплик / WA либо производительность сети, если в сеть упрётся раньше. +- Линейное чтение: сумма МБ/с чтения всех дисков, либо общая производительность сети (сумма пропускной способности сети всех нод), если в сеть упрётся раньше. +- Линейная запись: сумма МБ/с записи всех дисков / число реплик, либо производительность сети / число реплик, если в сеть упрётся раньше. +- Параллельное случайное мелкое чтение: сумма IOPS чтения всех дисков, либо производительность сети, если в сеть упрётся раньше. +- Параллельная случайная мелкая запись: сумма IOPS записи всех дисков / число реплик / WA, либо производительность сети / число реплик, если в сеть упрётся раньше. -При использовании кодов коррекции ошибок (EC): +При использовании кодов коррекции ошибок (EC N+K): - Задержка чтения в 1 поток (T1Q1): 1.5 RTT + 1 чтение. - Запись+fsync в 1 поток: - С мгновенным сбросом: 3.5 RTT + 1 чтение + 2 записи. - С отложенным ("ленивым") сбросом: 5.5 RTT + 1 чтение + 2 записи + 2 fsync. -- Под 0.5 на самом деле подразумевается (k-1)/k, где k - число дисков данных, +- Под 0.5 на самом деле подразумевается (N-1)/N, где N - число дисков данных, что означает, что дополнительное обращение по сети не нужно, когда операция чтения обслуживается локально. -- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше. 
-- Параллельная запись: сумма IOPS всех дисков / общее число дисков данных и чётности / WA либо производительность сети, если в сеть упрётся раньше. - Примечание: IOPS дисков в данном случае надо брать в смешанном режиме чтения/записи в пропорции, аналогичной формулам выше. +- Линейное чтение: сумма МБ/с чтения всех дисков, либо общая производительность сети, если в сеть упрётся раньше. +- Линейная запись: сумма МБ/с записи всех дисков * N/(N+K), либо производительность сети * N / (N+K), если в сеть упрётся раньше. +- Параллельное случайное мелкое чтение: сумма IOPS чтения всех дисков либо производительность сети, если в сеть упрётся раньше. +- Параллельная случайная мелкая запись: грубо `(сумма IOPS / (N+K) / WA)`. Если точнее, то: + сумма смешанного IOPS всех дисков при `(N-1)/(N*4 + K*5 + 1)*100 %` чтения, делённая на `(N*4 + K*5 + 1)`. + Либо, производительность сети * N/(N+K), если в сеть упрётся раньше. + - Например, при EC 2+1 это: `(сумма IOPS при 7% чтения) / 14`. + - При EC 6+3 это: `(сумма IOPS при 12.5% чтения) / 40`. WA (мультипликатор записи) для 4 КБ блоков в Vitastor обычно составляет 3-5: 1. Запись метаданных в журнал diff --git a/mon/mon.js b/mon/mon.js index 56e6a573..6e8864c9 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -55,7 +55,7 @@ const etcd_tree = { // etcd connection - configurable online etcd_address: "10.0.115.10:2379/v3", // mon - etcd_mon_ttl: 30, // min: 10 + etcd_mon_ttl: 5, // min: 1 etcd_mon_timeout: 1000, // ms. min: 0 etcd_mon_retries: 5, // min: 0 mon_change_timeout: 1000, // ms. min: 100 @@ -92,7 +92,7 @@ const etcd_tree = { peer_connect_timeout: 5, // seconds. min: 1 osd_idle_timeout: 5, // seconds. min: 1 osd_ping_timeout: 5, // seconds. min: 1 - up_wait_retry_interval: 500, // ms. min: 50 + up_wait_retry_interval: 50, // ms. 
min: 10 max_etcd_attempts: 5, etcd_quick_timeout: 1000, // ms etcd_slow_timeout: 5000, // ms @@ -390,7 +390,8 @@ class Mon { constructor(config) { - this.die = (e) => this._die(e); + this.failconnect = (e) => this._die(e, 2); + this.die = (e) => this._die(e, 1); if (fs.existsSync(config.config_path||'/etc/vitastor/vitastor.conf')) { config = { @@ -479,10 +480,10 @@ class Mon check_config() { - this.config.etcd_mon_ttl = Number(this.config.etcd_mon_ttl) || 30; - if (this.config.etcd_mon_ttl < 10) + this.config.etcd_mon_ttl = Number(this.config.etcd_mon_ttl) || 5; + if (this.config.etcd_mon_ttl < 1) { - this.config.etcd_mon_ttl = 10; + this.config.etcd_mon_ttl = 1; } this.config.etcd_mon_timeout = Number(this.config.etcd_mon_timeout) || 0; if (this.config.etcd_mon_timeout <= 0) @@ -604,7 +605,7 @@ class Mon } if (!this.ws) { - this.die('Failed to open etcd watch websocket'); + this.failconnect('Failed to open etcd watch websocket'); } const cur_addr = this.selected_etcd_url; this.ws_alive = true; @@ -674,7 +675,12 @@ class Mon { this.parse_kv(e.kv); const key = e.kv.key.substr(this.etcd_prefix.length); - if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/') + if (key.substr(0, 11) == '/osd/state/') + { + stats_changed = true; + changed = true; + } + else if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/') { stats_changed = true; } @@ -791,9 +797,9 @@ class Mon const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries); if (!res.result.TTL) { - this.die('Lease expired'); + this.failconnect('Lease expired'); } - }, this.config.etcd_mon_timeout); + }, this.config.etcd_mon_ttl*1000); if (!this.signals_set) { process.on('SIGINT', this.on_stop_cb); @@ -1414,7 +1420,14 @@ class Mon } if (changed) { - await this.save_pg_config(new_config_pgs); + const ok = await 
this.save_pg_config(new_config_pgs); + if (ok) + console.log('PG configuration successfully changed'); + else + { + console.log('Someone changed PG configuration while we also tried to change it. Retrying in '+this.config.mon_change_timeout+' ms'); + this.schedule_recheck(); + } } } this.recheck_pgs_active = false; @@ -1495,6 +1508,11 @@ class Mon this.save_new_pgs_txn(new_config_pgs, etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, pool_res.pgs, pg_history); } new_config_pgs.hash = tree_hash; + return await this.save_pg_config(new_config_pgs, etcd_request); + } + + async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] }) + { etcd_request.compare.push( { key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id }, { key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' }, @@ -1622,9 +1640,13 @@ class Mon } const sum_diff = { op_stats: {}, subop_stats: {}, recovery_stats: {} }; // Sum derived values instead of deriving summed - for (const osd in this.state.osd.stats) + for (const osd in this.state.osd.state) { const derived = this.prev_stats.osd_diff[osd]; + if (!this.state.osd.state[osd] || !derived) + { + continue; + } for (const type in sum_diff) { for (const op in derived[type]||{}) @@ -1725,9 +1747,13 @@ class Mon const used = this.state.pool.stats[pool_id].used_raw_tb; this.state.pool.stats[pool_id].used_raw_tb = Number(used)/1024/1024/1024/1024; } - for (const osd_num in this.state.osd.inodestats) + for (const osd_num in this.state.osd.state) { const ist = this.state.osd.inodestats[osd_num]; + if (!ist || !this.state.osd.state[osd_num]) + { + continue; + } for (const pool_id in ist) { inode_stats[pool_id] = inode_stats[pool_id] || {}; @@ -1743,9 +1769,14 @@ class Mon } } } - for (const osd in this.prev_stats.osd_diff) + for (const osd in this.state.osd.state) { - for (const pool_id in this.prev_stats.osd_diff[osd].inode_stats) + const 
osd_diff = this.prev_stats.osd_diff[osd]; + if (!osd_diff || !this.state.osd.state[osd]) + { + continue; + } + for (const pool_id in osd_diff.inode_stats) { for (const inode_num in this.prev_stats.osd_diff[osd].inode_stats[pool_id]) { @@ -1985,14 +2016,14 @@ class Mon return res.json; } } - this.die(); + this.failconnect(); } - _die(err) + _die(err, code) { // In fact we can just try to rejoin console.error(new Error(err || 'Cluster connection failed')); - process.exit(1); + process.exit(code || 2); } local_ips(all) diff --git a/mon/package.json b/mon/package.json index eeeb36bc..3bea81ac 100644 --- a/mon/package.json +++ b/mon/package.json @@ -1,6 +1,6 @@ { "name": "vitastor-mon", - "version": "1.4.0", + "version": "1.4.4", "description": "Vitastor SDS monitor service", "main": "mon-main.js", "scripts": { diff --git a/mon/vitastor-osd@.service b/mon/vitastor-osd@.service index ac2857d2..1930752e 100644 --- a/mon/vitastor-osd@.service +++ b/mon/vitastor-osd@.service @@ -8,7 +8,9 @@ PartOf=vitastor.target LimitNOFILE=1048576 LimitNPROC=1048576 LimitMEMLOCK=infinity -ExecStart=bash -c 'exec vitastor-disk exec-osd /dev/vitastor/osd%i-data >>/var/log/vitastor/osd%i.log 2>&1' +# Use the following for direct logs to files +#ExecStart=bash -c 'exec vitastor-disk exec-osd /dev/vitastor/osd%i-data >>/var/log/vitastor/osd%i.log 2>&1' +ExecStart=vitastor-disk exec-osd /dev/vitastor/osd%i-data ExecStartPre=+vitastor-disk pre-exec /dev/vitastor/osd%i-data WorkingDirectory=/ User=vitastor diff --git a/patches/cinder-vitastor.py b/patches/cinder-vitastor.py index f69d0f85..4c766beb 100644 --- a/patches/cinder-vitastor.py +++ b/patches/cinder-vitastor.py @@ -50,7 +50,7 @@ from cinder.volume import configuration from cinder.volume import driver from cinder.volume import volume_utils -VERSION = '1.4.0' +VERSION = '1.4.4' LOG = logging.getLogger(__name__) diff --git a/patches/libvirt-9.10-vitastor.diff b/patches/libvirt-9.10-vitastor.diff new file mode 100644 index 00000000..e4fbcdc0 
--- /dev/null +++ b/patches/libvirt-9.10-vitastor.diff @@ -0,0 +1,643 @@ +commit c1cd026e211e94b120028e7c98a6e4ce5afe9846 +Author: Vitaliy Filippov +Date: Wed Jan 24 22:04:50 2024 +0300 + + Add Vitastor support + +diff --git a/include/libvirt/libvirt-storage.h b/include/libvirt/libvirt-storage.h +index aaad4a3da1..5f5daa8341 100644 +--- a/include/libvirt/libvirt-storage.h ++++ b/include/libvirt/libvirt-storage.h +@@ -326,6 +326,7 @@ typedef enum { + VIR_CONNECT_LIST_STORAGE_POOLS_ZFS = 1 << 17, /* (Since: 1.2.8) */ + VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE = 1 << 18, /* (Since: 3.1.0) */ + VIR_CONNECT_LIST_STORAGE_POOLS_ISCSI_DIRECT = 1 << 19, /* (Since: 5.6.0) */ ++ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR = 1 << 20, /* (Since: 5.0.0) */ + } virConnectListAllStoragePoolsFlags; + + int virConnectListAllStoragePools(virConnectPtr conn, +diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c +index 22ad43e1d7..56c81d6852 100644 +--- a/src/conf/domain_conf.c ++++ b/src/conf/domain_conf.c +@@ -7185,7 +7185,8 @@ virDomainDiskSourceNetworkParse(xmlNodePtr node, + src->configFile = virXPathString("string(./config/@file)", ctxt); + + if (src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTP || +- src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS) ++ src->protocol == VIR_STORAGE_NET_PROTOCOL_HTTPS || ++ src->protocol == VIR_STORAGE_NET_PROTOCOL_VITASTOR) + src->query = virXMLPropString(node, "query"); + + if (virDomainStorageNetworkParseHosts(node, ctxt, &src->hosts, &src->nhosts) < 0) +@@ -30618,6 +30619,7 @@ virDomainStorageSourceTranslateSourcePool(virStorageSource *src, + + case VIR_STORAGE_POOL_MPATH: + case VIR_STORAGE_POOL_RBD: ++ case VIR_STORAGE_POOL_VITASTOR: + case VIR_STORAGE_POOL_SHEEPDOG: + case VIR_STORAGE_POOL_GLUSTER: + case VIR_STORAGE_POOL_LAST: +diff --git a/src/conf/domain_validate.c b/src/conf/domain_validate.c +index c72108886e..c739ed6c43 100644 +--- a/src/conf/domain_validate.c ++++ b/src/conf/domain_validate.c +@@ -495,6 +495,7 @@ 
virDomainDiskDefValidateSourceChainOne(const virStorageSource *src) + case VIR_STORAGE_NET_PROTOCOL_RBD: + break; + ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_NBD: + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_GLUSTER: +@@ -541,7 +542,7 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src) + } + } + +- /* internal snapshots and config files are currently supported only with rbd: */ ++ /* internal snapshots are currently supported only with rbd: */ + if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK && + src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD) { + if (src->snapshot) { +@@ -549,10 +550,15 @@ virDomainDiskDefValidateSourceChainOne(const virStorageSource *src) + _(" element is currently supported only with 'rbd' disks")); + return -1; + } ++ } + ++ /* config files are currently supported only with rbd and vitastor: */ ++ if (virStorageSourceGetActualType(src) != VIR_STORAGE_TYPE_NETWORK && ++ src->protocol != VIR_STORAGE_NET_PROTOCOL_RBD && ++ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR) { + if (src->configFile) { + virReportError(VIR_ERR_XML_ERROR, "%s", +- _(" element is currently supported only with 'rbd' disks")); ++ _(" element is currently supported only with 'rbd' and 'vitastor' disks")); + return -1; + } + } +diff --git a/src/conf/schemas/domaincommon.rng b/src/conf/schemas/domaincommon.rng +index b98a2ae602..7d7a872e01 100644 +--- a/src/conf/schemas/domaincommon.rng ++++ b/src/conf/schemas/domaincommon.rng +@@ -1997,6 +1997,35 @@ + + + ++ ++ ++ ++ ++ vitastor ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ + + + +@@ -2347,6 +2376,7 @@ + + + ++ + + + +diff --git a/src/conf/storage_conf.c b/src/conf/storage_conf.c +index 68842004b7..1d69a788b6 100644 +--- a/src/conf/storage_conf.c ++++ b/src/conf/storage_conf.c +@@ -56,7 +56,7 @@ VIR_ENUM_IMPL(virStoragePool, + "logical", "disk", "iscsi", + "iscsi-direct", "scsi", "mpath", + "rbd", 
"sheepdog", "gluster", +- "zfs", "vstorage", ++ "zfs", "vstorage", "vitastor", + ); + + VIR_ENUM_IMPL(virStoragePoolFormatFileSystem, +@@ -242,6 +242,18 @@ static virStoragePoolTypeInfo poolTypeInfo[] = { + .formatToString = virStorageFileFormatTypeToString, + } + }, ++ {.poolType = VIR_STORAGE_POOL_VITASTOR, ++ .poolOptions = { ++ .flags = (VIR_STORAGE_POOL_SOURCE_HOST | ++ VIR_STORAGE_POOL_SOURCE_NETWORK | ++ VIR_STORAGE_POOL_SOURCE_NAME), ++ }, ++ .volOptions = { ++ .defaultFormat = VIR_STORAGE_FILE_RAW, ++ .formatFromString = virStorageVolumeFormatFromString, ++ .formatToString = virStorageFileFormatTypeToString, ++ } ++ }, + {.poolType = VIR_STORAGE_POOL_SHEEPDOG, + .poolOptions = { + .flags = (VIR_STORAGE_POOL_SOURCE_HOST | +@@ -538,6 +550,11 @@ virStoragePoolDefParseSource(xmlXPathContextPtr ctxt, + _("element 'name' is mandatory for RBD pool")); + return -1; + } ++ if (pool_type == VIR_STORAGE_POOL_VITASTOR && source->name == NULL) { ++ virReportError(VIR_ERR_XML_ERROR, "%s", ++ _("element 'name' is mandatory for Vitastor pool")); ++ return -1; ++ } + + if (options->formatFromString) { + g_autofree char *format = NULL; +@@ -1127,6 +1144,7 @@ virStoragePoolDefFormatBuf(virBuffer *buf, + /* RBD, Sheepdog, Gluster and Iscsi-direct devices are not local block devs nor + * files, so they don't have a target */ + if (def->type != VIR_STORAGE_POOL_RBD && ++ def->type != VIR_STORAGE_POOL_VITASTOR && + def->type != VIR_STORAGE_POOL_SHEEPDOG && + def->type != VIR_STORAGE_POOL_GLUSTER && + def->type != VIR_STORAGE_POOL_ISCSI_DIRECT) { +diff --git a/src/conf/storage_conf.h b/src/conf/storage_conf.h +index fc67957cfe..720c07ef74 100644 +--- a/src/conf/storage_conf.h ++++ b/src/conf/storage_conf.h +@@ -103,6 +103,7 @@ typedef enum { + VIR_STORAGE_POOL_GLUSTER, /* Gluster device */ + VIR_STORAGE_POOL_ZFS, /* ZFS */ + VIR_STORAGE_POOL_VSTORAGE, /* Virtuozzo Storage */ ++ VIR_STORAGE_POOL_VITASTOR, /* Vitastor */ + + VIR_STORAGE_POOL_LAST, + } virStoragePoolType; +@@ -454,6 
+455,7 @@ VIR_ENUM_DECL(virStoragePartedFs); + VIR_CONNECT_LIST_STORAGE_POOLS_SCSI | \ + VIR_CONNECT_LIST_STORAGE_POOLS_MPATH | \ + VIR_CONNECT_LIST_STORAGE_POOLS_RBD | \ ++ VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR | \ + VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG | \ + VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER | \ + VIR_CONNECT_LIST_STORAGE_POOLS_ZFS | \ +diff --git a/src/conf/storage_source_conf.c b/src/conf/storage_source_conf.c +index f974a521b1..cd394d0a9f 100644 +--- a/src/conf/storage_source_conf.c ++++ b/src/conf/storage_source_conf.c +@@ -88,6 +88,7 @@ VIR_ENUM_IMPL(virStorageNetProtocol, + "ssh", + "vxhs", + "nfs", ++ "vitastor", + ); + + +@@ -1301,6 +1302,7 @@ virStorageSourceNetworkDefaultPort(virStorageNetProtocol protocol) + case VIR_STORAGE_NET_PROTOCOL_GLUSTER: + return 24007; + ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_RBD: + /* we don't provide a default for RBD */ + return 0; +diff --git a/src/conf/storage_source_conf.h b/src/conf/storage_source_conf.h +index 5e7d127453..283709eeb3 100644 +--- a/src/conf/storage_source_conf.h ++++ b/src/conf/storage_source_conf.h +@@ -129,6 +129,7 @@ typedef enum { + VIR_STORAGE_NET_PROTOCOL_SSH, + VIR_STORAGE_NET_PROTOCOL_VXHS, + VIR_STORAGE_NET_PROTOCOL_NFS, ++ VIR_STORAGE_NET_PROTOCOL_VITASTOR, + + VIR_STORAGE_NET_PROTOCOL_LAST + } virStorageNetProtocol; +diff --git a/src/conf/virstorageobj.c b/src/conf/virstorageobj.c +index 59fa5da372..4739167f5f 100644 +--- a/src/conf/virstorageobj.c ++++ b/src/conf/virstorageobj.c +@@ -1438,6 +1438,7 @@ virStoragePoolObjSourceFindDuplicateCb(const void *payload, + return 1; + break; + ++ case VIR_STORAGE_POOL_VITASTOR: + case VIR_STORAGE_POOL_ISCSI_DIRECT: + case VIR_STORAGE_POOL_RBD: + case VIR_STORAGE_POOL_LAST: +@@ -1921,6 +1922,8 @@ virStoragePoolObjMatch(virStoragePoolObj *obj, + (obj->def->type == VIR_STORAGE_POOL_MPATH)) || + (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_RBD) && + (obj->def->type == VIR_STORAGE_POOL_RBD)) || ++ 
(MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR) && ++ (obj->def->type == VIR_STORAGE_POOL_VITASTOR)) || + (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG) && + (obj->def->type == VIR_STORAGE_POOL_SHEEPDOG)) || + (MATCH(VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER) && +diff --git a/src/libvirt-storage.c b/src/libvirt-storage.c +index db7660aac4..561df34709 100644 +--- a/src/libvirt-storage.c ++++ b/src/libvirt-storage.c +@@ -94,6 +94,7 @@ virStoragePoolGetConnect(virStoragePoolPtr pool) + * VIR_CONNECT_LIST_STORAGE_POOLS_SCSI + * VIR_CONNECT_LIST_STORAGE_POOLS_MPATH + * VIR_CONNECT_LIST_STORAGE_POOLS_RBD ++ * VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR + * VIR_CONNECT_LIST_STORAGE_POOLS_SHEEPDOG + * VIR_CONNECT_LIST_STORAGE_POOLS_GLUSTER + * VIR_CONNECT_LIST_STORAGE_POOLS_ZFS +diff --git a/src/libxl/libxl_conf.c b/src/libxl/libxl_conf.c +index 62e1be6672..71a1d42896 100644 +--- a/src/libxl/libxl_conf.c ++++ b/src/libxl/libxl_conf.c +@@ -979,6 +979,7 @@ libxlMakeNetworkDiskSrcStr(virStorageSource *src, + case VIR_STORAGE_NET_PROTOCOL_SSH: + case VIR_STORAGE_NET_PROTOCOL_VXHS: + case VIR_STORAGE_NET_PROTOCOL_NFS: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_LAST: + case VIR_STORAGE_NET_PROTOCOL_NONE: + virReportError(VIR_ERR_NO_SUPPORT, +diff --git a/src/libxl/xen_xl.c b/src/libxl/xen_xl.c +index f175359307..8efcf4c329 100644 +--- a/src/libxl/xen_xl.c ++++ b/src/libxl/xen_xl.c +@@ -1456,6 +1456,7 @@ xenFormatXLDiskSrcNet(virStorageSource *src) + case VIR_STORAGE_NET_PROTOCOL_SSH: + case VIR_STORAGE_NET_PROTOCOL_VXHS: + case VIR_STORAGE_NET_PROTOCOL_NFS: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_LAST: + case VIR_STORAGE_NET_PROTOCOL_NONE: + virReportError(VIR_ERR_NO_SUPPORT, +diff --git a/src/qemu/qemu_block.c b/src/qemu/qemu_block.c +index 7e9daf0bdc..825b4a3006 100644 +--- a/src/qemu/qemu_block.c ++++ b/src/qemu/qemu_block.c +@@ -758,6 +758,38 @@ qemuBlockStorageSourceGetRBDProps(virStorageSource *src, + } + + 
++static virJSONValue * ++qemuBlockStorageSourceGetVitastorProps(virStorageSource *src) ++{ ++ virJSONValue *ret = NULL; ++ virStorageNetHostDef *host; ++ size_t i; ++ g_auto(virBuffer) buf = VIR_BUFFER_INITIALIZER; ++ g_autofree char *etcd = NULL; ++ ++ for (i = 0; i < src->nhosts; i++) { ++ host = src->hosts + i; ++ if ((virStorageNetHostTransport)host->transport != VIR_STORAGE_NET_HOST_TRANS_TCP) { ++ return NULL; ++ } ++ virBufferAsprintf(&buf, i > 0 ? ",%s:%u" : "%s:%u", host->name, host->port); ++ } ++ if (src->nhosts > 0) { ++ etcd = virBufferContentAndReset(&buf); ++ } ++ ++ if (virJSONValueObjectAdd(&ret, ++ "S:etcd-host", etcd, ++ "S:etcd-prefix", src->query, ++ "S:config-path", src->configFile, ++ "s:image", src->path, ++ NULL) < 0) ++ return NULL; ++ ++ return ret; ++} ++ ++ + static virJSONValue * + qemuBlockStorageSourceGetSheepdogProps(virStorageSource *src) + { +@@ -1140,6 +1172,12 @@ qemuBlockStorageSourceGetBackendProps(virStorageSource *src, + return NULL; + break; + ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: ++ driver = "vitastor"; ++ if (!(fileprops = qemuBlockStorageSourceGetVitastorProps(src))) ++ return NULL; ++ break; ++ + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + driver = "sheepdog"; + if (!(fileprops = qemuBlockStorageSourceGetSheepdogProps(src))) +@@ -2032,6 +2070,7 @@ qemuBlockGetBackingStoreString(virStorageSource *src, + + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_RBD: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_VXHS: + case VIR_STORAGE_NET_PROTOCOL_NFS: + case VIR_STORAGE_NET_PROTOCOL_SSH: +@@ -2415,6 +2454,12 @@ qemuBlockStorageSourceCreateGetStorageProps(virStorageSource *src, + return -1; + break; + ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: ++ driver = "vitastor"; ++ if (!(location = qemuBlockStorageSourceGetVitastorProps(src))) ++ return -1; ++ break; ++ + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + driver = "sheepdog"; + if (!(location = 
qemuBlockStorageSourceGetSheepdogProps(src))) +diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c +index 953808fcfe..62860283d8 100644 +--- a/src/qemu/qemu_domain.c ++++ b/src/qemu/qemu_domain.c +@@ -5215,7 +5215,8 @@ qemuDomainValidateStorageSource(virStorageSource *src, + if (src->query && + (actualType != VIR_STORAGE_TYPE_NETWORK || + (src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTPS && +- src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP))) { ++ src->protocol != VIR_STORAGE_NET_PROTOCOL_HTTP && ++ src->protocol != VIR_STORAGE_NET_PROTOCOL_VITASTOR))) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("query is supported only with HTTP(S) protocols")); + return -1; +@@ -10340,6 +10341,7 @@ qemuDomainPrepareStorageSourceTLS(virStorageSource *src, + break; + + case VIR_STORAGE_NET_PROTOCOL_RBD: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_GLUSTER: + case VIR_STORAGE_NET_PROTOCOL_ISCSI: +diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c +index 73ff533827..e9c799ca8f 100644 +--- a/src/qemu/qemu_snapshot.c ++++ b/src/qemu/qemu_snapshot.c +@@ -423,6 +423,7 @@ qemuSnapshotPrepareDiskExternalInactive(virDomainSnapshotDiskDef *snapdisk, + case VIR_STORAGE_NET_PROTOCOL_NONE: + case VIR_STORAGE_NET_PROTOCOL_NBD: + case VIR_STORAGE_NET_PROTOCOL_RBD: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_GLUSTER: + case VIR_STORAGE_NET_PROTOCOL_ISCSI: +@@ -648,6 +649,7 @@ qemuSnapshotPrepareDiskInternal(virDomainDiskDef *disk, + case VIR_STORAGE_NET_PROTOCOL_NONE: + case VIR_STORAGE_NET_PROTOCOL_NBD: + case VIR_STORAGE_NET_PROTOCOL_RBD: ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_GLUSTER: + case VIR_STORAGE_NET_PROTOCOL_ISCSI: +diff --git a/src/storage/storage_driver.c b/src/storage/storage_driver.c +index 314fe930e0..fb615a8b4e 100644 +--- 
a/src/storage/storage_driver.c ++++ b/src/storage/storage_driver.c +@@ -1626,6 +1626,7 @@ storageVolLookupByPathCallback(virStoragePoolObj *obj, + + case VIR_STORAGE_POOL_GLUSTER: + case VIR_STORAGE_POOL_RBD: ++ case VIR_STORAGE_POOL_VITASTOR: + case VIR_STORAGE_POOL_SHEEPDOG: + case VIR_STORAGE_POOL_ZFS: + case VIR_STORAGE_POOL_LAST: +diff --git a/src/storage_file/storage_source_backingstore.c b/src/storage_file/storage_source_backingstore.c +index 80681924ea..8a3ade9ec0 100644 +--- a/src/storage_file/storage_source_backingstore.c ++++ b/src/storage_file/storage_source_backingstore.c +@@ -287,6 +287,75 @@ virStorageSourceParseRBDColonString(const char *rbdstr, + } + + ++static int ++virStorageSourceParseVitastorColonString(const char *colonstr, ++ virStorageSource *src) ++{ ++ char *p, *e, *next; ++ g_autofree char *options = NULL; ++ ++ /* optionally skip the "vitastor:" prefix if provided */ ++ if (STRPREFIX(colonstr, "vitastor:")) ++ colonstr += strlen("vitastor:"); ++ ++ options = g_strdup(colonstr); ++ ++ p = options; ++ while (*p) { ++ /* find : delimiter or end of string */ ++ for (e = p; *e && *e != ':'; ++e) { ++ if (*e == '\\') { ++ e++; ++ if (*e == '\0') ++ break; ++ } ++ } ++ if (*e == '\0') { ++ next = e; /* last kv pair */ ++ } else { ++ next = e + 1; ++ *e = '\0'; ++ } ++ ++ if (STRPREFIX(p, "image=")) { ++ src->path = g_strdup(p + strlen("image=")); ++ } else if (STRPREFIX(p, "etcd-prefix=")) { ++ src->query = g_strdup(p + strlen("etcd-prefix=")); ++ } else if (STRPREFIX(p, "config-path=")) { ++ src->configFile = g_strdup(p + strlen("config-path=")); ++ } else if (STRPREFIX(p, "etcd-host=")) { ++ char *h, *sep; ++ ++ h = p + strlen("etcd-host="); ++ while (h < e) { ++ for (sep = h; sep < e; ++sep) { ++ if (*sep == '\\' && (sep[1] == ',' || ++ sep[1] == ';' || ++ sep[1] == ' ')) { ++ *sep = '\0'; ++ sep += 2; ++ break; ++ } ++ } ++ ++ if (virStorageSourceRBDAddHost(src, h) < 0) ++ return -1; ++ ++ h = sep; ++ } ++ } ++ ++ p = next; ++ } ++ ++ if 
(!src->path) { ++ return -1; ++ } ++ ++ return 0; ++} ++ ++ + static int + virStorageSourceParseNBDColonString(const char *nbdstr, + virStorageSource *src) +@@ -399,6 +468,11 @@ virStorageSourceParseBackingColon(virStorageSource *src, + return -1; + break; + ++ case VIR_STORAGE_NET_PROTOCOL_VITASTOR: ++ if (virStorageSourceParseVitastorColonString(path, src) < 0) ++ return -1; ++ break; ++ + case VIR_STORAGE_NET_PROTOCOL_SHEEPDOG: + case VIR_STORAGE_NET_PROTOCOL_LAST: + case VIR_STORAGE_NET_PROTOCOL_NONE: +@@ -975,6 +1049,54 @@ virStorageSourceParseBackingJSONRBD(virStorageSource *src, + return 0; + } + ++static int ++virStorageSourceParseBackingJSONVitastor(virStorageSource *src, ++ virJSONValue *json, ++ const char *jsonstr G_GNUC_UNUSED, ++ int opaque G_GNUC_UNUSED) ++{ ++ const char *filename; ++ const char *image = virJSONValueObjectGetString(json, "image"); ++ const char *conf = virJSONValueObjectGetString(json, "config-path"); ++ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix"); ++ virJSONValue *servers = virJSONValueObjectGetArray(json, "server"); ++ size_t nservers; ++ size_t i; ++ ++ src->type = VIR_STORAGE_TYPE_NETWORK; ++ src->protocol = VIR_STORAGE_NET_PROTOCOL_VITASTOR; ++ ++ /* legacy syntax passed via 'filename' option */ ++ if ((filename = virJSONValueObjectGetString(json, "filename"))) ++ return virStorageSourceParseVitastorColonString(filename, src); ++ ++ if (!image) { ++ virReportError(VIR_ERR_INVALID_ARG, "%s", ++ _("missing image name in Vitastor backing volume " ++ "JSON specification")); ++ return -1; ++ } ++ ++ src->path = g_strdup(image); ++ src->configFile = g_strdup(conf); ++ src->query = g_strdup(etcd_prefix); ++ ++ if (servers) { ++ nservers = virJSONValueArraySize(servers); ++ ++ src->hosts = g_new0(virStorageNetHostDef, nservers); ++ src->nhosts = nservers; ++ ++ for (i = 0; i < nservers; i++) { ++ if (virStorageSourceParseBackingJSONInetSocketAddress(src->hosts + i, ++ virJSONValueArrayGet(servers, i)) < 
0) ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ + static int + virStorageSourceParseBackingJSONRaw(virStorageSource *src, + virJSONValue *json, +@@ -1152,6 +1274,7 @@ static const struct virStorageSourceJSONDriverParser jsonParsers[] = { + {"sheepdog", false, virStorageSourceParseBackingJSONSheepdog, 0}, + {"ssh", false, virStorageSourceParseBackingJSONSSH, 0}, + {"rbd", false, virStorageSourceParseBackingJSONRBD, 0}, ++ {"vitastor", false, virStorageSourceParseBackingJSONVitastor, 0}, + {"raw", true, virStorageSourceParseBackingJSONRaw, 0}, + {"nfs", false, virStorageSourceParseBackingJSONNFS, 0}, + {"vxhs", false, virStorageSourceParseBackingJSONVxHS, 0}, +diff --git a/src/test/test_driver.c b/src/test/test_driver.c +index e87d7cfd44..ccc05d7aae 100644 +--- a/src/test/test_driver.c ++++ b/src/test/test_driver.c +@@ -7335,6 +7335,7 @@ testStorageVolumeTypeForPool(int pooltype) + case VIR_STORAGE_POOL_ISCSI_DIRECT: + case VIR_STORAGE_POOL_GLUSTER: + case VIR_STORAGE_POOL_RBD: ++ case VIR_STORAGE_POOL_VITASTOR: + return VIR_STORAGE_VOL_NETWORK; + case VIR_STORAGE_POOL_LOGICAL: + case VIR_STORAGE_POOL_DISK: +diff --git a/tests/storagepoolcapsschemadata/poolcaps-fs.xml b/tests/storagepoolcapsschemadata/poolcaps-fs.xml +index eee75af746..8bd0a57bdd 100644 +--- a/tests/storagepoolcapsschemadata/poolcaps-fs.xml ++++ b/tests/storagepoolcapsschemadata/poolcaps-fs.xml +@@ -204,4 +204,11 @@ + + + ++ ++ ++ ++ ++ ++ ++ + +diff --git a/tests/storagepoolcapsschemadata/poolcaps-full.xml b/tests/storagepoolcapsschemadata/poolcaps-full.xml +index 805950a937..852df0de16 100644 +--- a/tests/storagepoolcapsschemadata/poolcaps-full.xml ++++ b/tests/storagepoolcapsschemadata/poolcaps-full.xml +@@ -204,4 +204,11 @@ + + + ++ ++ ++ ++ ++ ++ ++ + +diff --git a/tests/storagepoolxml2argvtest.c b/tests/storagepoolxml2argvtest.c +index e8e40d695e..db55fe5f3a 100644 +--- a/tests/storagepoolxml2argvtest.c ++++ b/tests/storagepoolxml2argvtest.c +@@ -65,6 +65,7 @@ testCompareXMLToArgvFiles(bool 
shouldFail, + case VIR_STORAGE_POOL_GLUSTER: + case VIR_STORAGE_POOL_ZFS: + case VIR_STORAGE_POOL_VSTORAGE: ++ case VIR_STORAGE_POOL_VITASTOR: + case VIR_STORAGE_POOL_LAST: + default: + VIR_TEST_DEBUG("pool type '%s' has no xml2argv test", defTypeStr); +diff --git a/tools/virsh-pool.c b/tools/virsh-pool.c +index 36f00cf643..5f5bd3464e 100644 +--- a/tools/virsh-pool.c ++++ b/tools/virsh-pool.c +@@ -1223,6 +1223,9 @@ cmdPoolList(vshControl *ctl, const vshCmd *cmd G_GNUC_UNUSED) + case VIR_STORAGE_POOL_VSTORAGE: + flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VSTORAGE; + break; ++ case VIR_STORAGE_POOL_VITASTOR: ++ flags |= VIR_CONNECT_LIST_STORAGE_POOLS_VITASTOR; ++ break; + case VIR_STORAGE_POOL_LAST: + break; + } diff --git a/pull_request_template.yml b/pull_request_template.yml new file mode 100644 index 00000000..bc25a0fe --- /dev/null +++ b/pull_request_template.yml @@ -0,0 +1,28 @@ +name: Pull Request +about: Submit a pull request +body: + - type: textarea + id: description + attributes: + label: Description + description: Describe your pull request + placeholder: "" + value: "" + validations: + required: true + - type: input + id: author + attributes: + label: Contributor Name + description: Contributor Name or Company Details if the Contributor is a company + placeholder: "" + validations: + required: false + - type: checkboxes + id: terms + attributes: + label: CLA + description: By submitting this pull request, I accept [Vitastor CLA](https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md) + options: + - label: "I accept Vitastor CLA agreement: https://git.yourcmc.ru/vitalif/vitastor/src/branch/master/CLA-en.md" + required: true diff --git a/rpm/build-tarball.sh b/rpm/build-tarball.sh index 2068d9ca..5b700149 100755 --- a/rpm/build-tarball.sh +++ b/rpm/build-tarball.sh @@ -24,4 +24,4 @@ rm fio mv fio-copy fio FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print 
"-$1"; }'` perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec -tar --transform 's#^#vitastor-1.4.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.0$(rpm --eval '%dist').tar.gz * +tar --transform 's#^#vitastor-1.4.4/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.4$(rpm --eval '%dist').tar.gz * diff --git a/rpm/vitastor-el7.Dockerfile b/rpm/vitastor-el7.Dockerfile index 957282aa..21424a09 100644 --- a/rpm/vitastor-el7.Dockerfile +++ b/rpm/vitastor-el7.Dockerfile @@ -36,7 +36,7 @@ ADD . /root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.0.el7.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.4.el7.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el7.spec b/rpm/vitastor-el7.spec index 594e5573..f8c13724 100644 --- a/rpm/vitastor-el7.spec +++ b/rpm/vitastor-el7.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.0 +Version: 1.4.4 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.0.el7.tar.gz +Source0: vitastor-1.4.4.el7.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/rpm/vitastor-el8.Dockerfile b/rpm/vitastor-el8.Dockerfile index c2db92dd..72276be3 100644 --- a/rpm/vitastor-el8.Dockerfile +++ b/rpm/vitastor-el8.Dockerfile @@ -35,7 +35,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.0.el8.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.4.el8.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el8.spec b/rpm/vitastor-el8.spec index d9ea9155..41cae854 100644 --- a/rpm/vitastor-el8.spec +++ b/rpm/vitastor-el8.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.0 +Version: 1.4.4 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.0.el8.tar.gz +Source0: vitastor-1.4.4.el8.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/rpm/vitastor-el9.Dockerfile b/rpm/vitastor-el9.Dockerfile index f18620b9..1f1939db 100644 --- a/rpm/vitastor-el9.Dockerfile +++ b/rpm/vitastor-el9.Dockerfile @@ -18,7 +18,7 @@ ADD . 
/root/vitastor RUN set -e; \ cd /root/vitastor/rpm; \ sh build-tarball.sh; \ - cp /root/vitastor-1.4.0.el9.tar.gz ~/rpmbuild/SOURCES; \ + cp /root/vitastor-1.4.4.el9.tar.gz ~/rpmbuild/SOURCES; \ cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \ cd ~/rpmbuild/SPECS/; \ rpmbuild -ba vitastor.spec; \ diff --git a/rpm/vitastor-el9.spec b/rpm/vitastor-el9.spec index f06301bb..4c3e9557 100644 --- a/rpm/vitastor-el9.spec +++ b/rpm/vitastor-el9.spec @@ -1,11 +1,11 @@ Name: vitastor -Version: 1.4.0 +Version: 1.4.4 Release: 1%{?dist} Summary: Vitastor, a fast software-defined clustered block storage License: Vitastor Network Public License 1.1 URL: https://vitastor.io/ -Source0: vitastor-1.4.0.el9.tar.gz +Source0: vitastor-1.4.4.el9.tar.gz BuildRequires: liburing-devel >= 0.6 BuildRequires: gperftools-devel diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 69e286e4..751c62b0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$") set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") endif() -add_definitions(-DVERSION="1.4.0") +add_definitions(-DVERSION="1.4.4") add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src) add_link_options(-fno-omit-frame-pointer) if (${WITH_ASAN}) diff --git a/src/blockstore_flush.cpp b/src/blockstore_flush.cpp index 66eb8ea6..290a4dbf 100644 --- a/src/blockstore_flush.cpp +++ b/src/blockstore_flush.cpp @@ -184,8 +184,7 @@ void journal_flusher_t::mark_trim_possible() if (trim_wanted > 0) { dequeuing = true; - if (!journal_trim_counter) - journal_trim_counter = journal_trim_interval; + journal_trim_counter = 0; bs->ringloop->wakeup(); } } @@ -366,7 +365,7 @@ resume_0: !flusher->flush_queue.size() || !flusher->dequeuing) { stop_flusher: - if (flusher->trim_wanted > 0 && flusher->journal_trim_counter > 0) + if (flusher->trim_wanted > 
0 && !flusher->journal_trim_counter) { // Attempt forced trim flusher->active_flushers++; @@ -1346,7 +1345,6 @@ bool journal_flusher_co::trim_journal(int wait_base) else if (wait_state == wait_base+2) goto resume_2; else if (wait_state == wait_base+3) goto resume_3; else if (wait_state == wait_base+4) goto resume_4; - flusher->journal_trim_counter = 0; new_trim_pos = bs->journal.get_trim_pos(); if (new_trim_pos != bs->journal.used_start) { @@ -1419,6 +1417,7 @@ bool journal_flusher_co::trim_journal(int wait_base) exit(0); } } + flusher->journal_trim_counter = 0; flusher->trimming = false; } return true; diff --git a/src/blockstore_impl.cpp b/src/blockstore_impl.cpp index d8ccff9c..399e0c78 100644 --- a/src/blockstore_impl.cpp +++ b/src/blockstore_impl.cpp @@ -163,20 +163,10 @@ void blockstore_impl_t::loop() } else if (op->opcode == BS_OP_SYNC) { - // wait for all small writes to be submitted - // wait for all big writes to complete, submit data device fsync + // sync only completed writes? // wait for the data device fsync to complete, then submit journal writes for big writes // then submit an fsync operation - if (has_writes) - { - // Can't submit SYNC before previous writes - continue; - } wr_st = continue_sync(op); - if (wr_st != 2) - { - has_writes = wr_st > 0 ? 
1 : 2; - } } else if (op->opcode == BS_OP_STABLE) { @@ -205,6 +195,10 @@ void blockstore_impl_t::loop() // ring is full, stop submission break; } + else if (PRIV(op)->wait_for == WAIT_JOURNAL) + { + PRIV(op)->wait_detail2 = (unstable_writes.size()+unstable_unsynced); + } } } if (op_idx != new_idx) @@ -283,7 +277,8 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op) } else if (PRIV(op)->wait_for == WAIT_JOURNAL) { - if (journal.used_start == PRIV(op)->wait_detail) + if (journal.used_start == PRIV(op)->wait_detail && + (unstable_writes.size()+unstable_unsynced) == PRIV(op)->wait_detail2) { // do not submit #ifdef BLOCKSTORE_DEBUG @@ -558,13 +553,14 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) if (stable_count >= stable_alloc) { stable_alloc *= 2; - stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); - if (!stable) + obj_ver_id* nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); + if (!nst) { op->retval = -ENOMEM; FINISH_OP(op); return; } + stable = nst; } stable[stable_count++] = { .oid = clean_it->first, @@ -642,8 +638,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) if (stable_count >= stable_alloc) { stable_alloc += 32768; - stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); - if (!stable) + obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); + if (!nst) { if (unstable) free(unstable); @@ -651,6 +647,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) FINISH_OP(op); return; } + stable = nst; } stable[stable_count++] = dirty_it->first; } @@ -666,8 +663,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) if (unstable_count >= unstable_alloc) { unstable_alloc += 32768; - unstable = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc); - if (!unstable) + obj_ver_id *nst = (obj_ver_id*)realloc(unstable, sizeof(obj_ver_id) * unstable_alloc); + if (!nst) { if (stable) free(stable); @@ -675,6 +672,7 @@ 
void blockstore_impl_t::process_list(blockstore_op_t *op) FINISH_OP(op); return; } + unstable = nst; } unstable[unstable_count++] = dirty_it->first; } @@ -694,8 +692,8 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) if (stable_count+unstable_count > stable_alloc) { stable_alloc = stable_count+unstable_count; - stable = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); - if (!stable) + obj_ver_id *nst = (obj_ver_id*)realloc(stable, sizeof(obj_ver_id) * stable_alloc); + if (!nst) { if (unstable) free(unstable); @@ -703,6 +701,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op) FINISH_OP(op); return; } + stable = nst; } // Copy unstable entries for (int i = 0; i < unstable_count; i++) diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h index e20e956c..c1b88638 100644 --- a/src/blockstore_impl.h +++ b/src/blockstore_impl.h @@ -55,6 +55,7 @@ #define IS_JOURNAL(st) (((st) & 0x0F) == BS_ST_SMALL_WRITE) #define IS_BIG_WRITE(st) (((st) & 0x0F) == BS_ST_BIG_WRITE) #define IS_DELETE(st) (((st) & 0x0F) == BS_ST_DELETE) +#define IS_INSTANT(st) (((st) & BS_ST_TYPE_MASK) == BS_ST_DELETE || ((st) & BS_ST_INSTANT)) #define BS_SUBMIT_CHECK_SQES(n) \ if (ringloop->sqes_left() < (n))\ @@ -201,7 +202,7 @@ struct blockstore_op_private_t { // Wait status int wait_for; - uint64_t wait_detail; + uint64_t wait_detail, wait_detail2; int pending_ops; int op_state; @@ -377,7 +378,7 @@ class blockstore_impl_t // Stabilize int dequeue_stable(blockstore_op_t *op); int continue_stable(blockstore_op_t *op); - void mark_stable(const obj_ver_id & ov, bool forget_dirty = false); + void mark_stable(obj_ver_id ov, bool forget_dirty = false); void stabilize_object(object_id oid, uint64_t max_ver); blockstore_op_t* selective_sync(blockstore_op_t *op); int split_stab_op(blockstore_op_t *op, std::function decider); diff --git a/src/blockstore_open.cpp b/src/blockstore_open.cpp index 7c57dbde..1bfbd064 100644 --- a/src/blockstore_open.cpp +++ 
b/src/blockstore_open.cpp @@ -19,7 +19,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config, bool init) throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10); throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10); throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10); - if (config.find("autosync_writes") != config.end()) + if (config["autosync_writes"] != "") { autosync_writes = strtoull(config["autosync_writes"].c_str(), NULL, 10); } diff --git a/src/blockstore_stable.cpp b/src/blockstore_stable.cpp index 10648ad9..2dba0555 100644 --- a/src/blockstore_stable.cpp +++ b/src/blockstore_stable.cpp @@ -412,11 +412,40 @@ resume_4: return 2; } -void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty) +void blockstore_impl_t::mark_stable(obj_ver_id v, bool forget_dirty) { auto dirty_it = dirty_db.find(v); if (dirty_it != dirty_db.end()) { + if (IS_INSTANT(dirty_it->second.state)) + { + // 'Instant' (non-EC) operations may complete and try to become stable out of order. Prevent it. 
+ auto back_it = dirty_it; + while (back_it != dirty_db.begin()) + { + back_it--; + if (back_it->first.oid != v.oid) + { + break; + } + if (!IS_STABLE(back_it->second.state)) + { + // There are preceding unstable versions, can't flush + return; + } + } + while (true) + { + dirty_it++; + if (dirty_it == dirty_db.end() || dirty_it->first.oid != v.oid || + !IS_SYNCED(dirty_it->second.state)) + { + dirty_it--; + break; + } + v.version = dirty_it->first.version; + } + } while (1) { bool was_stable = IS_STABLE(dirty_it->second.state); diff --git a/src/blockstore_sync.cpp b/src/blockstore_sync.cpp index 1d64a4f6..d29035b5 100644 --- a/src/blockstore_sync.cpp +++ b/src/blockstore_sync.cpp @@ -76,6 +76,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op) // 2nd step: Data device is synced, prepare & write journal entries // Check space in the journal and journal memory buffers blockstore_journal_check_t space_check(this); + auto reservation = (unstable_writes.size()+unstable_unsynced+PRIV(op)->sync_big_writes.size())*journal.block_size; if (dsk.csum_block_size) { // More complex check because all journal entries have different lengths @@ -85,16 +86,14 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op) left--; auto & dirty_entry = dirty_db.at(sbw); uint64_t dyn_size = dsk.dirty_dyn_size(dirty_entry.offset, dirty_entry.len); - if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, - (unstable_writes.size()+unstable_unsynced)*journal.block_size)) + if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, left ? 
0 : reservation)) { return 0; } } } else if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(), - sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, - (unstable_writes.size()+unstable_unsynced)*journal.block_size)) + sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, reservation)) { return 0; } diff --git a/src/blockstore_write.cpp b/src/blockstore_write.cpp index 36828abc..fe768f88 100644 --- a/src/blockstore_write.cpp +++ b/src/blockstore_write.cpp @@ -129,7 +129,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) } bool imm = (op->len < dsk.data_block_size ? (immediate_commit != IMMEDIATE_NONE) : (immediate_commit == IMMEDIATE_ALL)); if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size && !imm || - !imm && unsynced_queued_ops >= autosync_writes) + !imm && autosync_writes && unsynced_queued_ops >= autosync_writes) { // Issue an additional sync so that the previous big write can reach the journal blockstore_op_t *sync_op = new blockstore_op_t; @@ -320,7 +320,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) blockstore_journal_check_t space_check(this); if (!space_check.check_available(op, unsynced_big_write_count + 1, sizeof(journal_entry_big_write) + dsk.clean_dyn_size, - (unstable_writes.size()+unstable_unsynced)*journal.block_size)) + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size)) { return 0; } @@ -412,7 +412,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) sizeof(journal_entry_big_write) + dsk.clean_dyn_size, 0) || !space_check.check_available(op, 1, sizeof(journal_entry_small_write) + dyn_size, - op->len + (unstable_writes.size()+unstable_unsynced)*journal.block_size)) + op->len + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 
0 : 1))*journal.block_size)) { return 0; } @@ -549,7 +549,7 @@ resume_2: uint64_t dyn_size = dsk.dirty_dyn_size(op->offset, op->len); blockstore_journal_check_t space_check(this); if (!space_check.check_available(op, 1, sizeof(journal_entry_big_write) + dyn_size, - (unstable_writes.size()+unstable_unsynced)*journal.block_size)) + (unstable_writes.size()+unstable_unsynced+((dirty_it->second.state & BS_ST_INSTANT) ? 0 : 1))*journal.block_size)) { return 0; } @@ -593,7 +593,7 @@ resume_4: #endif bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE; bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE); - bool is_instant = ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)); + bool is_instant = IS_INSTANT(dirty_it->second.state); if (imm) { auto & unstab = unstable_writes[op->oid]; diff --git a/src/cli_rm_data.cpp b/src/cli_rm_data.cpp index dcf726dc..37ea8c91 100644 --- a/src/cli_rm_data.cpp +++ b/src/cli_rm_data.cpp @@ -17,6 +17,7 @@ struct rm_pg_t uint64_t obj_count = 0, obj_done = 0; int state = 0; int in_flight = 0; + bool synced = false; }; struct rm_inode_t @@ -48,6 +49,7 @@ struct rm_inode_t .objects = objects, .obj_count = objects.size(), .obj_done = 0, + .synced = parent->cli->get_immediate_commit(inode), }); if (min_offset == 0) { @@ -151,6 +153,37 @@ struct rm_inode_t } cur_list->obj_pos++; } + if (cur_list->in_flight == 0 && cur_list->obj_pos == cur_list->objects.end() && + !cur_list->synced) + { + osd_op_t *op = new osd_op_t(); + op->op_type = OSD_OP_OUT; + op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num); + op->req = (osd_any_op_t){ + .sync = { + .header = { + .magic = SECONDARY_OSD_OP_MAGIC, + .id = parent->cli->next_op_id(), + .opcode = OSD_OP_SYNC, + }, + }, + }; + op->callback = [this, cur_list](osd_op_t *op) + { + cur_list->in_flight--; + cur_list->synced = true; + if (op->reply.hdr.retval < 0) + { + 
fprintf(stderr, "Failed to sync OSD %lu (retval=%ld)\n", + cur_list->rm_osd_num, op->reply.hdr.retval); + error_count++; + } + delete op; + continue_delete(); + }; + cur_list->in_flight++; + parent->cli->msgr.outbox_push(op); + } } void continue_delete() @@ -161,7 +194,8 @@ struct rm_inode_t } for (int i = 0; i < lists.size(); i++) { - if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end()) + if (!lists[i]->in_flight && lists[i]->obj_pos == lists[i]->objects.end() && + lists[i]->synced) { delete lists[i]; lists.erase(lists.begin()+i, lists.begin()+i+1); @@ -187,7 +221,7 @@ struct rm_inode_t { fprintf(stderr, "\n"); } - if (parent->progress && (total_done < total_count || inactive_osds.size() > 0)) + if (parent->progress && (total_done < total_count || inactive_osds.size() > 0 || error_count > 0)) { fprintf( stderr, "Warning: Pool:%u,ID:%lu inode data may not have been fully removed.\n" diff --git a/src/cli_status.cpp b/src/cli_status.cpp index f9fcb16c..85606cb2 100644 --- a/src/cli_status.cpp +++ b/src/cli_status.cpp @@ -106,7 +106,7 @@ resume_2: if (etcd_states[i]["error"].is_null()) { etcd_alive++; - etcd_db_size = etcd_states[i]["dbSizeInUse"].uint64_value(); + etcd_db_size = etcd_states[i]["dbSize"].uint64_value(); } } int mon_count = 0; diff --git a/src/cluster_client.cpp b/src/cluster_client.cpp index efed244a..1f270889 100644 --- a/src/cluster_client.cpp +++ b/src/cluster_client.cpp @@ -352,13 +352,13 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co // up_wait_retry_interval up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value(); if (!up_wait_retry_interval) - { - up_wait_retry_interval = 500; - } - else if (up_wait_retry_interval < 50) { up_wait_retry_interval = 50; } + else if (up_wait_retry_interval < 10) + { + up_wait_retry_interval = 10; + } // log_level log_level = config["log_level"].uint64_value(); msgr.parse_config(config); diff --git a/src/etcd_state_client.h 
b/src/etcd_state_client.h index bf7b18e5..b081d848 100644 --- a/src/etcd_state_client.h +++ b/src/etcd_state_client.h @@ -28,7 +28,7 @@ struct etcd_kv_t { std::string key; json11::Json value; - uint64_t mod_revision; + uint64_t mod_revision = 0; }; struct pg_config_t diff --git a/src/osd.cpp b/src/osd.cpp index b85e7210..134b31d6 100644 --- a/src/osd.cpp +++ b/src/osd.cpp @@ -22,7 +22,7 @@ static blockstore_config_t json_to_bs(const json11::Json::object & config) { if (kv.second.is_string()) bs[kv.first] = kv.second.string_value(); - else + else if (!kv.second.is_null()) bs[kv.first] = kv.second.dump(); } return bs; @@ -194,7 +194,8 @@ void osd_t::parse_config(bool init) if (autosync_interval > MAX_AUTOSYNC_INTERVAL) autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; } - if (!config["autosync_writes"].is_null()) + if (config["autosync_writes"].is_number() || + config["autosync_writes"].string_value() != "") { // Allow to set it to 0 autosync_writes = config["autosync_writes"].uint64_value(); @@ -232,6 +233,8 @@ void osd_t::parse_config(bool init) ? 10 : config["recovery_tune_agg_interval"].uint64_value(); recovery_tune_sleep_min_us = config["recovery_tune_sleep_min_us"].is_null() ? 10 : config["recovery_tune_sleep_min_us"].uint64_value(); + recovery_tune_sleep_cutoff_us = config["recovery_tune_sleep_cutoff_us"].is_null() + ? 
10000000 : config["recovery_tune_sleep_cutoff_us"].uint64_value(); recovery_pg_switch = config["recovery_pg_switch"].uint64_value(); if (recovery_pg_switch < 1) recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH; diff --git a/src/osd.h b/src/osd.h index 49f27c15..1718a8dd 100644 --- a/src/osd.h +++ b/src/osd.h @@ -125,6 +125,7 @@ class osd_t int recovery_tune_interval = 1; int recovery_tune_agg_interval = 10; int recovery_tune_sleep_min_us = 10; + int recovery_tune_sleep_cutoff_us = 10000000; int recovery_pg_switch = DEFAULT_RECOVERY_PG_SWITCH; int recovery_sync_batch = DEFAULT_RECOVERY_BATCH; int inode_vanish_time = 60; @@ -282,6 +283,7 @@ class osd_t void exec_sync_stab_all(osd_op_t *cur_op); void exec_show_config(osd_op_t *cur_op); void exec_secondary(osd_op_t *cur_op); + void exec_secondary_real(osd_op_t *cur_op); void secondary_op_callback(osd_op_t *cur_op); // primary ops diff --git a/src/osd_cluster.cpp b/src/osd_cluster.cpp index 2ecee460..180bd880 100644 --- a/src/osd_cluster.cpp +++ b/src/osd_cluster.cpp @@ -262,7 +262,8 @@ void osd_t::report_statistics() for (auto st_it = inode_stats.begin(); st_it != inode_stats.end(); ) { auto & kv = *st_it; - if (!bs_inode_space[kv.first]) + auto spc_it = bs_inode_space.find(kv.first); + if (spc_it == bs_inode_space.end() || !spc_it->second) // prevent autovivification { // Is it an empty inode? if (!tv_now.tv_sec) diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index 2401948d..5f81240a 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -422,6 +422,10 @@ void osd_t::tune_recovery() rtune_avg_lat = total_recovery_usec/recovery_count; uint64_t target_lat = rtune_avg_lat * rtune_avg_lat/1000000.0 * recovery_count/recovery_tune_interval / rtune_target_util; auto sleep_us = target_lat > rtune_avg_lat+recovery_tune_sleep_min_us ? 
target_lat-rtune_avg_lat : 0; + if (sleep_us > recovery_tune_sleep_cutoff_us) + { + return; + } if (recovery_target_sleep_items.size() != recovery_tune_agg_interval) { recovery_target_sleep_items.resize(recovery_tune_agg_interval); @@ -438,7 +442,7 @@ void osd_t::tune_recovery() if (recovery_target_sleep_count < recovery_tune_agg_interval) recovery_target_sleep_count++; recovery_target_sleep_us = recovery_target_sleep_total / recovery_target_sleep_count; - if (log_level > 4) + if (log_level > 1) { printf( "[OSD %lu] auto-tune: client util: %.2f, recovery util: %.2f, lat: %lu us -> target util %.2f, delay %lu us\n", diff --git a/src/osd_peering.cpp b/src/osd_peering.cpp index 40657369..d8aff7dc 100644 --- a/src/osd_peering.cpp +++ b/src/osd_peering.cpp @@ -222,6 +222,9 @@ void osd_t::start_pg_peering(pg_t & pg) } if (pg.pg_cursize < pg.pg_minsize) { + // FIXME: Incomplete EC PGs may currently easily lead to write hangs ("slow ops" in OSD logs) + // because such PGs don't flush unstable entries on secondary OSDs so they can't remove these + // entries from their journals... pg.state = PG_INCOMPLETE; report_pg_state(pg); return; diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp index e7a76b0f..750d0ff9 100644 --- a/src/osd_primary.cpp +++ b/src/osd_primary.cpp @@ -706,6 +706,26 @@ resume_5: remove_object_from_state(op_data->oid, &op_data->object_state, pg); deref_object_state(pg, &op_data->object_state, true); } + // Mark PG and OSDs as dirty + for (auto & chunk: (op_data->object_state ? 
op_data->object_state->osd_set : pg.cur_loc_set)) + { + this->dirty_osds.insert(chunk.osd_num); + } + for (auto cl_it = msgr.clients.find(cur_op->peer_fd); cl_it != msgr.clients.end(); ) + { + cl_it->second->dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); + break; + } + dirty_pgs.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); + if (immediate_commit == IMMEDIATE_NONE) + { + unstable_write_count++; + if (unstable_write_count >= autosync_writes) + { + unstable_write_count = 0; + autosync(); + } + } pg.total_count--; cur_op->reply.hdr.retval = 0; continue_others: diff --git a/src/osd_rmw.cpp b/src/osd_rmw.cpp index 7788cf8b..eebcc265 100644 --- a/src/osd_rmw.cpp +++ b/src/osd_rmw.cpp @@ -861,15 +861,15 @@ static void calc_rmw_parity_copy_mod(osd_rmw_stripe_t *stripes, int pg_size, int static void calc_rmw_parity_copy_parity(osd_rmw_stripe_t *stripes, int pg_size, int pg_minsize, uint64_t *read_osd_set, uint64_t *write_osd_set, uint32_t chunk_size, uint32_t start, uint32_t end) { - if (write_osd_set != read_osd_set) + if (write_osd_set != read_osd_set && end != 0) { for (int role = pg_minsize; role < pg_size; role++) { - if (write_osd_set[role] != read_osd_set[role] && (start != 0 || end != chunk_size)) + if (write_osd_set[role] != read_osd_set[role] && write_osd_set[role] != 0 && (start != 0 || end != chunk_size)) { // Copy new parity into the read buffer to write it back memcpy( - (uint8_t*)stripes[role].read_buf + start, + (uint8_t*)stripes[role].read_buf + start - stripes[role].read_start, stripes[role].write_buf, end - start ); diff --git a/src/osd_rmw_test.cpp b/src/osd_rmw_test.cpp index 2e1542d7..a71aab3d 100644 --- a/src/osd_rmw_test.cpp +++ b/src/osd_rmw_test.cpp @@ -30,6 +30,7 @@ void test16(); void test_recover_22_d2(); void test_ec43_error_bruteforce(); void test_recover_53_d5(); +void test_recover_22(); int main(int narg, char *args[]) { @@ -70,6 +71,8 @@ int main(int narg, char *args[]) test_ec43_error_bruteforce(); // Test 19 
test_recover_53_d5(); + // Test 20 + test_recover_22(); // End printf("all ok\n"); return 0; @@ -1244,3 +1247,99 @@ void test_recover_53_d5() // Done use_ec(8, 5, false); } + +void test_recover_22() +{ + const int bmp = 128*1024 / 4096 / 8; + use_ec(4, 2, true); + osd_num_t osd_set[4] = { 1, 2, 3, 4 }; + osd_num_t write_osd_set[4] = { 5, 0, 3, 0 }; + osd_rmw_stripe_t stripes[4] = {}; + unsigned bitmaps[4] = { 0 }; + // split + void *write_buf = (uint8_t*)malloc_or_die(4096); + set_pattern(write_buf, 4096, PATTERN0); + split_stripes(2, 128*1024, 120*1024, 4096, stripes); + assert(stripes[0].req_start == 120*1024 && stripes[0].req_end == 124*1024); + assert(stripes[1].req_start == 0 && stripes[1].req_end == 0); + assert(stripes[2].req_start == 0 && stripes[2].req_end == 0); + assert(stripes[3].req_start == 0 && stripes[3].req_end == 0); + // calc_rmw + void *rmw_buf = calc_rmw(write_buf, stripes, osd_set, 4, 2, 2, write_osd_set, 128*1024, bmp); + for (int i = 0; i < 4; i++) + stripes[i].bmp_buf = bitmaps+i; + assert(rmw_buf); + assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024); + assert(stripes[1].read_start == 120*1024 && stripes[1].read_end == 124*1024); + assert(stripes[2].read_start == 0 && stripes[2].read_end == 0); + assert(stripes[3].read_start == 0 && stripes[3].read_end == 0); + assert(stripes[0].write_start == 120*1024 && stripes[0].write_end == 124*1024); + assert(stripes[1].write_start == 0 && stripes[1].write_end == 0); + assert(stripes[2].write_start == 120*1024 && stripes[2].write_end == 124*1024); + assert(stripes[3].write_start == 0 && stripes[3].write_end == 0); + assert(stripes[0].read_buf == (uint8_t*)rmw_buf+4*1024); + assert(stripes[1].read_buf == (uint8_t*)rmw_buf+132*1024); + assert(stripes[2].read_buf == NULL); + assert(stripes[3].read_buf == NULL); + assert(stripes[0].write_buf == write_buf); + assert(stripes[1].write_buf == NULL); + assert(stripes[2].write_buf == (uint8_t*)rmw_buf); + assert(stripes[3].write_buf == 
NULL); + // encode + set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); + set_pattern(stripes[1].read_buf, 4*1024, PATTERN2); + memset(stripes[0].bmp_buf, 0xff, bmp); + memset(stripes[1].bmp_buf, 0xff, bmp); + calc_rmw_parity_ec(stripes, 4, 2, osd_set, write_osd_set, 128*1024, bmp); + assert(*(uint32_t*)stripes[2].bmp_buf == 0); + assert(stripes[0].write_start == 0 && stripes[0].write_end == 128*1024); + assert(stripes[1].write_start == 0 && stripes[1].write_end == 0); + assert(stripes[2].write_start == 120*1024 && stripes[2].write_end == 124*1024); + assert(stripes[3].write_start == 0 && stripes[3].write_end == 0); + assert(stripes[0].write_buf == stripes[0].read_buf); + assert(stripes[1].write_buf == NULL); + assert(stripes[2].write_buf == (uint8_t*)rmw_buf); + assert(stripes[3].write_buf == NULL); + check_pattern(stripes[2].write_buf, 4*1024, PATTERN0^PATTERN2); + // decode and verify + memset(stripes, 0, sizeof(stripes)); + split_stripes(2, 128*1024, 0, 256*1024, stripes); + assert(stripes[0].req_start == 0 && stripes[0].req_end == 128*1024); + assert(stripes[1].req_start == 0 && stripes[1].req_end == 128*1024); + assert(stripes[2].req_start == 0 && stripes[2].req_end == 0); + assert(stripes[3].req_start == 0 && stripes[3].req_end == 0); + for (int role = 0; role < 4; role++) + { + stripes[role].read_start = stripes[role].req_start; + stripes[role].read_end = stripes[role].req_end; + } + assert(extend_missing_stripes(stripes, write_osd_set, 2, 4) == 0); + assert(stripes[0].read_start == 0 && stripes[0].read_end == 128*1024); + assert(stripes[1].read_start == 0 && stripes[1].read_end == 128*1024); + assert(stripes[2].read_start == 0 && stripes[2].read_end == 128*1024); + assert(stripes[3].read_start == 0 && stripes[3].read_end == 0); + void *read_buf = alloc_read_buffer(stripes, 4, 0); + for (int i = 0; i < 4; i++) + stripes[i].bmp_buf = bitmaps+i; + assert(read_buf); + assert(stripes[0].read_buf == read_buf); + assert(stripes[1].read_buf == 
(uint8_t*)read_buf+128*1024); + assert(stripes[2].read_buf == (uint8_t*)read_buf+2*128*1024); + set_pattern(stripes[0].read_buf, 128*1024, PATTERN1); + set_pattern(stripes[0].read_buf+120*1024, 4*1024, PATTERN0); + set_pattern(stripes[2].read_buf, 128*1024, PATTERN1^PATTERN2); + set_pattern(stripes[2].read_buf+120*1024, 4*1024, PATTERN0^PATTERN2); + memset(stripes[0].bmp_buf, 0xff, bmp); + memset(stripes[2].bmp_buf, 0, bmp); + bitmaps[1] = 0; + bitmaps[3] = 0; + reconstruct_stripes_ec(stripes, 4, 2, bmp); + assert(bitmaps[0] == 0xFFFFFFFF); + assert(*(uint32_t*)stripes[1].bmp_buf == 0xFFFFFFFF); + check_pattern(stripes[1].read_buf, 128*1024, PATTERN2); + free(read_buf); + // Done + free(rmw_buf); + free(write_buf); + use_ec(4, 2, false); +} diff --git a/src/osd_secondary.cpp b/src/osd_secondary.cpp index d8fe2627..61c26d87 100644 --- a/src/osd_secondary.cpp +++ b/src/osd_secondary.cpp @@ -42,8 +42,10 @@ void osd_t::secondary_op_callback(osd_op_t *op) int retval = op->bs_op->retval; delete op->bs_op; op->bs_op = NULL; - if (op->is_recovery_related() && recovery_target_sleep_us) + if (op->is_recovery_related() && recovery_target_sleep_us && + op->req.hdr.opcode == OSD_OP_SEC_STABILIZE) { + // Apply pause AFTER commit. 
Do not apply pause to SYNC at all if (!op->tv_end.tv_sec) { clock_gettime(CLOCK_REALTIME, &op->tv_end); @@ -59,7 +61,25 @@ void osd_t::secondary_op_callback(osd_op_t *op) } } -void osd_t::exec_secondary(osd_op_t *cur_op) +void osd_t::exec_secondary(osd_op_t *op) +{ + if (op->is_recovery_related() && recovery_target_sleep_us && + op->req.hdr.opcode != OSD_OP_SEC_STABILIZE && op->req.hdr.opcode != OSD_OP_SEC_SYNC) + { + // Apply pause BEFORE write/delete + tfd->set_timer_us(recovery_target_sleep_us, false, [this, op](int timer_id) + { + clock_gettime(CLOCK_REALTIME, &op->tv_begin); + exec_secondary_real(op); + }); + } + else + { + exec_secondary_real(op); + } +} + +void osd_t::exec_secondary_real(osd_op_t *cur_op) { if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP) { diff --git a/src/vitastor.pc.in b/src/vitastor.pc.in index bc3d847d..71932bb8 100644 --- a/src/vitastor.pc.in +++ b/src/vitastor.pc.in @@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ Name: Vitastor Description: Vitastor client library -Version: 1.4.0 +Version: 1.4.4 Libs: -L${libdir} -lvitastor_client Cflags: -I${includedir} diff --git a/tests/run_3osds.sh b/tests/run_3osds.sh index 5d54676e..2f6073eb 100644 --- a/tests/run_3osds.sh +++ b/tests/run_3osds.sh @@ -10,6 +10,7 @@ SCHEME=${SCHEME:-replicated} # OFFSET_ARGS # PG_SIZE # PG_MINSIZE +# GLOBAL_CONFIG if [ "$SCHEME" = "ec" ]; then OSD_COUNT=${OSD_COUNT:-5} @@ -19,10 +20,10 @@ fi if [ "$IMMEDIATE_COMMIT" != "" ]; then NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10 --etcd_stats_interval 5" - $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"osd_out_time":1,"immediate_commit":"all","client_enable_writeback":true}' + $ETCDCTL put /vitastor/config/global 
'{"recovery_queue_depth":1,"recovery_tune_util_low":1,"immediate_commit":"all","client_enable_writeback":true,"client_max_writeback_iodepth":32'$GLOBAL_CONFIG'}' else NO_SAME="--journal_sector_buffer_count 1024 --log_level 10 --etcd_stats_interval 5" - $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"osd_out_time":1,"client_enable_writeback":true}' + $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"client_enable_writeback":true,"client_max_writeback_iodepth":32'$GLOBAL_CONFIG'}' fi start_osd_on() @@ -53,7 +54,7 @@ for i in $(seq 1 $OSD_COUNT); do start_osd $i done -(while true; do node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 || true; done) >>./testdata/mon.log 2>&1 & +(while true; do set +e; node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1; if [[ $? -ne 2 ]]; then break; fi; done) >>./testdata/mon.log 2>&1 & MON_PID=$! if [ "$SCHEME" = "ec" ]; then diff --git a/tests/run_tests.sh b/tests/run_tests.sh index c93058bb..fcf079ec 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -45,6 +45,8 @@ IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh SCHEME=ec ./test_rebalance_verify.sh SCHEME=ec IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh +./test_switch_primary.sh + ./test_write.sh SCHEME=xor ./test_write.sh diff --git a/tests/test_add_osd.sh b/tests/test_add_osd.sh index de75ed68..42fa996c 100755 --- a/tests/test_add_osd.sh +++ b/tests/test_add_osd.sh @@ -1,7 +1,7 @@ #!/bin/bash -ex PG_COUNT=2048 - +GLOBAL_CONFIG=',"osd_out_time":1' . `dirname $0`/run_3osds.sh LD_PRELOAD="build/src/libfio_vitastor.so" \ diff --git a/tests/test_heal.sh b/tests/test_heal.sh index cf4efd1b..3604b295 100755 --- a/tests/test_heal.sh +++ b/tests/test_heal.sh @@ -9,6 +9,7 @@ if [[ "$SCHEME" = "ec" ]]; then fi OSD_COUNT=${OSD_COUNT:-7} PG_COUNT=32 +GLOBAL_CONFIG=',"osd_out_time":1' . 
`dirname $0`/run_3osds.sh check_qemu diff --git a/tests/test_minsize_1.sh b/tests/test_minsize_1.sh index 2569bd64..51ef2ab9 100755 --- a/tests/test_minsize_1.sh +++ b/tests/test_minsize_1.sh @@ -2,6 +2,7 @@ PG_MINSIZE=1 SCHEME=replicated +GLOBAL_CONFIG=',"osd_out_time":1' . `dirname $0`/run_3osds.sh diff --git a/tests/test_scrub.sh b/tests/test_scrub.sh index 9da65a0c..8252e0bd 100755 --- a/tests/test_scrub.sh +++ b/tests/test_scrub.sh @@ -20,6 +20,9 @@ LD_PRELOAD="build/src/libfio_vitastor.so" \ fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \ -mirror_file=./testdata/mirror.bin -end_fsync=1 -rw=write -etcd=$ETCD_URL -image=testimg +# Save PG primary +primary=$($ETCDCTL get --print-value-only /vitastor/config/pgs | jq -r '.items["1"]["1"].primary') + # Intentionally corrupt OSD data and restart it zero_osd_pid=OSD${ZERO_OSD}_PID kill ${!zero_osd_pid} @@ -34,6 +37,9 @@ start_osd $ZERO_OSD # Wait until start wait_up 10 +# Wait until PG is back on the same primary +wait_condition 10 "$ETCDCTL"$' get --print-value-only /vitastor/config/pgs | jq -s -e \'.[0].items["1"]["1"].primary == "'$primary'"'"'" + # Trigger scrub $ETCDCTL put /vitastor/pg/history/1/1 `$ETCDCTL get --print-value-only /vitastor/pg/history/1/1 | jq -s -c '(.[0] // {}) + {"next_scrub":1}'` diff --git a/tests/test_splitbrain.sh b/tests/test_splitbrain.sh index 9c637d2b..19cf0f49 100755 --- a/tests/test_splitbrain.sh +++ b/tests/test_splitbrain.sh @@ -4,6 +4,7 @@ OSD_COUNT=2 PG_SIZE=2 PG_MINSIZE=1 SCHEME=replicated +GLOBAL_CONFIG=',"osd_out_time":1' . `dirname $0`/run_3osds.sh diff --git a/tests/test_switch_primary.sh b/tests/test_switch_primary.sh new file mode 100755 index 00000000..78dcde78 --- /dev/null +++ b/tests/test_switch_primary.sh @@ -0,0 +1,18 @@ +#!/bin/bash -ex + +. 
`dirname $0`/run_3osds.sh + +primary=$($ETCDCTL get --print-value-only /vitastor/config/pgs | jq -r '.items["1"]["1"].primary') +primary_pid=OSD${primary}_PID +kill -9 ${!primary_pid} + +sleep 15 +wait_condition 10 "$ETCDCTL get --print-value-only /vitastor/config/pgs | jq -s -e '.[0].items[\"1\"][\"1\"].primary != \"$primary\"'" + +newprim=$($ETCDCTL get --print-value-only /vitastor/config/pgs | jq -r '.items["1"]["1"].primary') + +if [ "$newprim" = "$primary" ]; then + format_error Primary not switched +fi + +format_green OK