Fix eviction when random_pos selects the end

Implement min/max list_count to make listings during performance test reasonable
Fix and improve parallel allocation
2024-01-20 14:07:58 +03:00 · 2024-01-20 14:07:58 +03:00 · 2024-01-20 14:07:58 +03:00 · 2024-01-20 14:07:58 +03:00 · 2024-01-20 14:07:58 +03:00 · 2024-01-20 14:07:58 +03:00
29 changed files with 118 additions and 49 deletions
--- a/.gitea/workflows/test.yml
+++ b/.gitea/workflows/test.yml
@ -532,6 +532,24 @@ jobs:
          echo ""
        done

+  test_switch_primary:
+    runs-on: ubuntu-latest
+    needs: build
+    container: ${{env.TEST_IMAGE}}:${{github.sha}}
+    steps:
+    - name: Run test
+      id: test
+      timeout-minutes: 3
+      run: /root/vitastor/tests/test_switch_primary.sh
+    - name: Print logs
+      if: always() && steps.test.outcome == 'failure'
+      run: |
+        for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
+          echo "-------- $i --------"
+          cat $i
+          echo ""
+        done
+
  test_write:
    runs-on: ubuntu-latest
    needs: build
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,6 +2,6 @@ cmake_minimum_required(VERSION 2.8.12)

 project(vitastor)

-set(VERSION "1.4.0")
+set(VERSION "1.4.1")

 add_subdirectory(src)
--- a/csi/Makefile
+++ b/csi/Makefile
@ -1,4 +1,4 @@
-VERSION ?= v1.4.0
+VERSION ?= v1.4.1

 all: build push

--- a/csi/deploy/004-csi-nodeplugin.yaml
+++ b/csi/deploy/004-csi-nodeplugin.yaml
@ -49,7 +49,7 @@ spec:
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
-          image: vitalif/vitastor-csi:v1.4.0
+          image: vitalif/vitastor-csi:v1.4.1
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/deploy/007-csi-provisioner.yaml
+++ b/csi/deploy/007-csi-provisioner.yaml
@ -121,7 +121,7 @@ spec:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
-          image: vitalif/vitastor-csi:v1.4.0
+          image: vitalif/vitastor-csi:v1.4.1
          args:
            - "--node=$(NODE_ID)"
            - "--endpoint=$(CSI_ENDPOINT)"
--- a/csi/src/config.go
+++ b/csi/src/config.go
@ -5,7 +5,7 @@ package vitastor

 const (
    vitastorCSIDriverName    = "csi.vitastor.io"
-    vitastorCSIDriverVersion = "1.4.0"
+    vitastorCSIDriverVersion = "1.4.1"
 )

 // Config struct fills the parameters of request or user input
--- a/debian/changelog
+++ b/debian/changelog
@ -1,4 +1,4 @@
-vitastor (1.4.0-1) unstable; urgency=medium
+vitastor (1.4.1-1) unstable; urgency=medium

  * Bugfixes

--- a/debian/vitastor.Dockerfile
+++ b/debian/vitastor.Dockerfile
@ -35,8 +35,8 @@ RUN set -e -x; \
    mkdir -p /root/packages/vitastor-$REL; \
    rm -rf /root/packages/vitastor-$REL/*; \
    cd /root/packages/vitastor-$REL; \
-    cp -r /root/vitastor vitastor-1.4.0; \
-    cd vitastor-1.4.0; \
+    cp -r /root/vitastor vitastor-1.4.1; \
+    cd vitastor-1.4.1; \
    ln -s /root/fio-build/fio-*/ ./fio; \
    FIO=$(head -n1 fio/debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    ls /usr/include/linux/raw.h || cp ./debian/raw.h /usr/include/linux/raw.h; \
@ -49,8 +49,8 @@ RUN set -e -x; \
    rm -rf a b; \
    echo "dep:fio=$FIO" > debian/fio_version; \
    cd /root/packages/vitastor-$REL; \
-    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.0.orig.tar.xz vitastor-1.4.0; \
-    cd vitastor-1.4.0; \
+    tar --sort=name --mtime='2020-01-01' --owner=0 --group=0 --exclude=debian -cJf vitastor_1.4.1.orig.tar.xz vitastor-1.4.1; \
+    cd vitastor-1.4.1; \
    V=$(head -n1 debian/changelog | perl -pe 's/^.*\((.*?)\).*$/$1/'); \
    DEBFULLNAME="Vitaliy Filippov <vitalif@yourcmc.ru>" dch -D $REL -v "$V""$REL" "Rebuild for $REL"; \
    DEB_BUILD_OPTIONS=nocheck dpkg-buildpackage --jobs=auto -sa; \
--- a/docs/performance/theoretical.en.md
+++ b/docs/performance/theoretical.en.md
@ -11,19 +11,26 @@ Replicated setups:
 - Single-threaded write+fsync latency:
  - With immediate commit: 2 network roundtrips + 1 disk write.
  - With lazy commit: 4 network roundtrips + 1 disk write + 1 disk flush.
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops / number of replicas / write amplification)).
+- Linear read: `min(total network bandwidth, sum(disk read MB/s))`.
+- Linear write: `min(total network bandwidth / number of replicas, sum(disk write MB/s / number of replicas))`.
+- Saturated parallel read iops: `min(total network bandwidth, sum(disk read iops))`.
+- Saturated parallel write iops: `min(total network bandwidth / number of replicas, sum(disk write iops / number of replicas / (write amplification = 4)))`.

-EC/XOR setups:
+EC/XOR setups (EC N+K):
 - Single-threaded (T1Q1) read latency: 1.5 network roundtrips + 1 disk read.
 - Single-threaded write+fsync latency:
  - With immediate commit: 3.5 network roundtrips + 1 disk read + 2 disk writes.
  - With lazy commit: 5.5 network roundtrips + 1 disk read + 2 disk writes + 2 disk fsyncs.
-  - 0.5 in actually (k-1)/k which means that an additional roundtrip doesn't happen when
+  - 0.5 is actually `(N-1)/N` which means that an additional roundtrip doesn't happen when
    the read sub-operation can be served locally.
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)).
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops * number of data drives / (number of data + parity drives) / write amplification)).
-  In fact, you should put disk write iops under the condition of ~10% reads / ~90% writes in this formula.
+- Linear read: `min(total network bandwidth, sum(disk read MB/s))`.
+- Linear write: `min(total network bandwidth * N/(N+K), sum(disk write MB/s * N/(N+K)))`.
+- Saturated parallel read iops: `min(total network bandwidth, sum(disk read iops))`.
+- Saturated parallel write iops: roughly `total iops / (N+K) / WA`. More exactly,
+  `min(total network bandwidth * N/(N+K), sum(disk randrw iops / (N*4 + K*5 + 1)))` with
+  random read/write mix corresponding to `(N-1)/(N*4 + K*5 + 1)*100 % reads`.
+  - For example, with EC 2+1 it is: `(7% randrw iops) / 14`.
+  - With EC 6+3 it is: `(12.5% randrw iops) / 40`.

 Write amplification for 4 KB blocks is usually 3-5 in Vitastor:
 1. Journal block write
--- a/docs/performance/theoretical.ru.md
+++ b/docs/performance/theoretical.ru.md
@ -11,20 +11,27 @@
 - Запись+fsync в 1 поток:
  - С мгновенным сбросом: 2 RTT + 1 запись.
  - С отложенным ("ленивым") сбросом: 4 RTT + 1 запись + 1 fsync.
- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше.
- Параллельная запись: сумма IOPS всех дисков / число реплик / WA либо производительность сети, если в сеть упрётся раньше.
+- Линейное чтение: сумма МБ/с чтения всех дисков, либо общая производительность сети (сумма пропускной способности сети всех нод), если в сеть упрётся раньше.
+- Линейная запись: сумма МБ/с записи всех дисков / число реплик, либо производительность сети / число реплик, если в сеть упрётся раньше.
+- Параллельное случайное мелкое чтение: сумма IOPS чтения всех дисков, либо производительность сети, если в сеть упрётся раньше.
+- Параллельная случайная мелкая запись: сумма IOPS записи всех дисков / число реплик / WA, либо производительность сети / число реплик, если в сеть упрётся раньше.

-При использовании кодов коррекции ошибок (EC):
+При использовании кодов коррекции ошибок (EC N+K):
 - Задержка чтения в 1 поток (T1Q1): 1.5 RTT + 1 чтение.
 - Запись+fsync в 1 поток:
  - С мгновенным сбросом: 3.5 RTT + 1 чтение + 2 записи.
  - С отложенным ("ленивым") сбросом: 5.5 RTT + 1 чтение + 2 записи + 2 fsync.
- Под 0.5 на самом деле подразумевается (k-1)/k, где k - число дисков данных,
+- Под 0.5 на самом деле подразумевается (N-1)/N, где N - число дисков данных,
  что означает, что дополнительное обращение по сети не нужно, когда операция
  чтения обслуживается локально.
- Параллельное чтение: сумма IOPS всех дисков либо производительность сети, если в сеть упрётся раньше.
- Параллельная запись: сумма IOPS всех дисков / общее число дисков данных и чётности / WA либо производительность сети, если в сеть упрётся раньше.
-  Примечание: IOPS дисков в данном случае надо брать в смешанном режиме чтения/записи в пропорции, аналогичной формулам выше.
+- Линейное чтение: сумма МБ/с чтения всех дисков, либо общая производительность сети, если в сеть упрётся раньше.
+- Линейная запись: сумма МБ/с записи всех дисков * N/(N+K), либо производительность сети * N / (N+K), если в сеть упрётся раньше.
+- Параллельное случайное мелкое чтение: сумма IOPS чтения всех дисков либо производительность сети, если в сеть упрётся раньше.
+- Параллельная случайная мелкая запись: грубо `(сумма IOPS / (N+K) / WA)`. Если точнее, то:
+  сумма смешанного IOPS всех дисков при `(N-1)/(N*4 + K*5 + 1)*100 %` чтения, делённая на `(N*4 + K*5 + 1)`.
+  Либо, производительность сети * N/(N+K), если в сеть упрётся раньше.
+  - Например, при EC 2+1 это: `(сумма IOPS при 7% чтения) / 14`.
+  - При EC 6+3 это: `(сумма IOPS при 12.5% чтения) / 40`.

 WA (мультипликатор записи) для 4 КБ блоков в Vitastor обычно составляет 3-5:
 1. Запись метаданных в журнал
--- a/mon/mon.js
+++ b/mon/mon.js
@ -390,7 +390,8 @@ class Mon
 {
    constructor(config)
    {
-        this.die = (e) => this._die(e);
+        this.failconnect = (e) => this._die(e, 2);
+        this.die = (e) => this._die(e, 1);
        if (fs.existsSync(config.config_path||'/etc/vitastor/vitastor.conf'))
        {
            config = {
@ -604,7 +605,7 @@ class Mon
        }
        if (!this.ws)
        {
-            this.die('Failed to open etcd watch websocket');
+            this.failconnect('Failed to open etcd watch websocket');
        }
        const cur_addr = this.selected_etcd_url;
        this.ws_alive = true;
@ -791,7 +792,7 @@ class Mon
            const res = await this.etcd_call('/lease/keepalive', { ID: this.etcd_lease_id }, this.config.etcd_mon_timeout, this.config.etcd_mon_retries);
            if (!res.result.TTL)
            {
-                this.die('Lease expired');
+                this.failconnect('Lease expired');
            }
        }, this.config.etcd_mon_timeout);
        if (!this.signals_set)
@ -1414,7 +1415,14 @@ class Mon
            }
            if (changed)
            {
-                await this.save_pg_config(new_config_pgs);
+                const ok = await this.save_pg_config(new_config_pgs);
+                if (ok)
+                    console.log('PG configuration successfully changed');
+                else
+                {
+                    console.log('Someone changed PG configuration while we also tried to change it. Retrying in '+this.config.mon_change_timeout+' ms');
+                    this.schedule_recheck();
+                }
            }
        }
        this.recheck_pgs_active = false;
@ -1495,6 +1503,11 @@ class Mon
            this.save_new_pgs_txn(new_config_pgs, etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, pool_res.pgs, pg_history);
        }
        new_config_pgs.hash = tree_hash;
+        return await this.save_pg_config(new_config_pgs, etcd_request);
+    }
+
+    async save_pg_config(new_config_pgs, etcd_request = { compare: [], success: [] })
+    {
        etcd_request.compare.push(
            { key: b64(this.etcd_prefix+'/mon/master'), target: 'LEASE', lease: ''+this.etcd_lease_id },
            { key: b64(this.etcd_prefix+'/config/pgs'), target: 'MOD', mod_revision: ''+this.etcd_watch_revision, result: 'LESS' },
@ -1985,14 +1998,14 @@ class Mon
                return res.json;
            }
        }
-        this.die();
+        this.failconnect();
    }

-    _die(err)
+    _die(err, code)
    {
        // In fact we can just try to rejoin
        console.error(new Error(err || 'Cluster connection failed'));
-        process.exit(1);
+        process.exit(code || 2);
    }

    local_ips(all)
--- a/mon/package.json
+++ b/mon/package.json
@ -1,6 +1,6 @@
 {
  "name": "vitastor-mon",
-  "version": "1.4.0",
+  "version": "1.4.1",
  "description": "Vitastor SDS monitor service",
  "main": "mon-main.js",
  "scripts": {
--- a/patches/cinder-vitastor.py
+++ b/patches/cinder-vitastor.py
@ -50,7 +50,7 @@ from cinder.volume import configuration
 from cinder.volume import driver
 from cinder.volume import volume_utils

-VERSION = '1.4.0'
+VERSION = '1.4.1'

 LOG = logging.getLogger(__name__)

--- a/rpm/build-tarball.sh
+++ b/rpm/build-tarball.sh
@ -24,4 +24,4 @@ rm fio
 mv fio-copy fio
 FIO=`rpm -qi fio | perl -e 'while(<>) { /^Epoch[\s:]+(\S+)/ && print "$1:"; /^Version[\s:]+(\S+)/ && print $1; /^Release[\s:]+(\S+)/ && print "-$1"; }'`
 perl -i -pe 's/(Requires:\s*fio)([^\n]+)?/$1 = '$FIO'/' $VITASTOR/rpm/vitastor-el$EL.spec
-tar --transform 's#^#vitastor-1.4.0/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.0$(rpm --eval '%dist').tar.gz *
+tar --transform 's#^#vitastor-1.4.1/#' --exclude 'rpm/*.rpm' -czf $VITASTOR/../vitastor-1.4.1$(rpm --eval '%dist').tar.gz *
--- a/rpm/vitastor-el7.Dockerfile
+++ b/rpm/vitastor-el7.Dockerfile
@ -36,7 +36,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.4.0.el7.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.4.1.el7.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el7.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el7.spec
+++ b/rpm/vitastor-el7.spec
@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.4.0
+Version:        1.4.1
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.4.0.el7.tar.gz
+Source0:        vitastor-1.4.1.el7.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el8.Dockerfile
+++ b/rpm/vitastor-el8.Dockerfile
@ -35,7 +35,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.4.0.el8.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.4.1.el8.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el8.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el8.spec
+++ b/rpm/vitastor-el8.spec
@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.4.0
+Version:        1.4.1
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.4.0.el8.tar.gz
+Source0:        vitastor-1.4.1.el8.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/rpm/vitastor-el9.Dockerfile
+++ b/rpm/vitastor-el9.Dockerfile
@ -18,7 +18,7 @@ ADD . /root/vitastor
 RUN set -e; \
    cd /root/vitastor/rpm; \
    sh build-tarball.sh; \
-    cp /root/vitastor-1.4.0.el9.tar.gz ~/rpmbuild/SOURCES; \
+    cp /root/vitastor-1.4.1.el9.tar.gz ~/rpmbuild/SOURCES; \
    cp vitastor-el9.spec ~/rpmbuild/SPECS/vitastor.spec; \
    cd ~/rpmbuild/SPECS/; \
    rpmbuild -ba vitastor.spec; \
--- a/rpm/vitastor-el9.spec
+++ b/rpm/vitastor-el9.spec
@ -1,11 +1,11 @@
 Name:           vitastor
-Version:        1.4.0
+Version:        1.4.1
 Release:        1%{?dist}
 Summary:        Vitastor, a fast software-defined clustered block storage

 License:        Vitastor Network Public License 1.1
 URL:            https://vitastor.io/
-Source0:        vitastor-1.4.0.el9.tar.gz
+Source0:        vitastor-1.4.1.el9.tar.gz

 BuildRequires:  liburing-devel >= 0.6
 BuildRequires:  gperftools-devel
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -16,7 +16,7 @@ if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/local/?$")
 	set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
 endif()

-add_definitions(-DVERSION="1.4.0")
+add_definitions(-DVERSION="1.4.1")
 add_definitions(-Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fdiagnostics-color=always -fno-omit-frame-pointer -I ${CMAKE_SOURCE_DIR}/src)
 add_link_options(-fno-omit-frame-pointer)
 if (${WITH_ASAN})
--- a/src/vitastor.pc.in
+++ b/src/vitastor.pc.in
@ -6,7 +6,7 @@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

 Name: Vitastor
 Description: Vitastor client library
-Version: 1.4.0
+Version: 1.4.1
 Libs: -L${libdir} -lvitastor_client
 Cflags: -I${includedir}

--- a/tests/run_3osds.sh
+++ b/tests/run_3osds.sh
@ -10,6 +10,7 @@ SCHEME=${SCHEME:-replicated}
 # OFFSET_ARGS
 # PG_SIZE
 # PG_MINSIZE
+# GLOBAL_CONFIG

 if [ "$SCHEME" = "ec" ]; then
    OSD_COUNT=${OSD_COUNT:-5}
@ -19,10 +20,10 @@ fi

 if [ "$IMMEDIATE_COMMIT" != "" ]; then
    NO_SAME="--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 --disable_data_fsync 1 --immediate_commit all --log_level 10 --etcd_stats_interval 5"
-    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"osd_out_time":1,"immediate_commit":"all","client_enable_writeback":true}'
+    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"immediate_commit":"all","client_enable_writeback":true,"client_max_writeback_iodepth":32'$GLOBAL_CONFIG'}'
 else
    NO_SAME="--journal_sector_buffer_count 1024 --log_level 10 --etcd_stats_interval 5"
-    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"osd_out_time":1,"client_enable_writeback":true}'
+    $ETCDCTL put /vitastor/config/global '{"recovery_queue_depth":1,"recovery_tune_util_low":1,"client_enable_writeback":true,"client_max_writeback_iodepth":32'$GLOBAL_CONFIG'}'
 fi

 start_osd_on()
@ -53,7 +54,7 @@ for i in $(seq 1 $OSD_COUNT); do
    start_osd $i
 done

-(while true; do node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1 || true; done) >>./testdata/mon.log 2>&1 &
+(while true; do set +e; node mon/mon-main.js --etcd_address $ETCD_URL --etcd_prefix "/vitastor" --verbose 1; if [[ $? -ne 2 ]]; then break; fi; done) >>./testdata/mon.log 2>&1 &
 MON_PID=$!

 if [ "$SCHEME" = "ec" ]; then
--- a/tests/run_tests.sh
+++ b/tests/run_tests.sh
@ -45,6 +45,8 @@ IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh
 SCHEME=ec ./test_rebalance_verify.sh
 SCHEME=ec IMMEDIATE_COMMIT=1 ./test_rebalance_verify.sh

+./test_switch_primary.sh
+
 ./test_write.sh
 SCHEME=xor ./test_write.sh

--- a/tests/test_add_osd.sh
+++ b/tests/test_add_osd.sh
@ -1,7 +1,7 @@
 #!/bin/bash -ex

 PG_COUNT=2048
-
+GLOBAL_CONFIG=',"osd_out_time":1'
 . `dirname $0`/run_3osds.sh

 LD_PRELOAD="build/src/libfio_vitastor.so" \
--- a/tests/test_heal.sh
+++ b/tests/test_heal.sh
@ -9,6 +9,7 @@ if [[ "$SCHEME" = "ec" ]]; then
 fi
 OSD_COUNT=${OSD_COUNT:-7}
 PG_COUNT=32
+GLOBAL_CONFIG=',"osd_out_time":1'
 . `dirname $0`/run_3osds.sh
 check_qemu

--- a/tests/test_minsize_1.sh
+++ b/tests/test_minsize_1.sh
@ -2,6 +2,7 @@

 PG_MINSIZE=1
 SCHEME=replicated
+GLOBAL_CONFIG=',"osd_out_time":1'

 . `dirname $0`/run_3osds.sh

--- a/tests/test_splitbrain.sh
+++ b/tests/test_splitbrain.sh
@ -4,6 +4,7 @@ OSD_COUNT=2
 PG_SIZE=2
 PG_MINSIZE=1
 SCHEME=replicated
+GLOBAL_CONFIG=',"osd_out_time":1'

 . `dirname $0`/run_3osds.sh

--- a/tests/test_switch_primary.sh
+++ b/tests/test_switch_primary.sh
@ -0,0 +1,18 @@
+#!/bin/bash -ex
+
+. `dirname $0`/run_3osds.sh
+
+primary=$($ETCDCTL get --print-value-only /vitastor/config/pgs | jq -r '.items["1"]["1"].primary')
+primary_pid=OSD${primary}_PID
+kill -9 ${!primary_pid}
+
+sleep 15
+wait_condition 10 "$ETCDCTL get --print-value-only /vitastor/config/pgs | jq -s -e '.[0].items[\"1\"][\"1\"].primary != \"$primary\"'"
+
+newprim=$($ETCDCTL get --print-value-only /vitastor/config/pgs | jq -r '.items["1"]["1"].primary')
+
+if [ "$newprim" = "$primary" ]; then
+    format_error Primary not switched
+fi
+
+format_green OK
Author	SHA1	Message	Date
Vitaliy Filippov	31c7a55751	Fix eviction when random_pos selects the end Test / test_move_reappear (push) Has been cancelled Details Test / test_rm (push) Has been cancelled Details Test / test_snapshot_chain (push) Has been cancelled Details Test / test_snapshot_chain_ec (push) Has been cancelled Details Test / test_snapshot_down (push) Has been cancelled Details Test / test_snapshot_down_ec (push) Has been cancelled Details Test / test_splitbrain (push) Has been cancelled Details Test / test_rebalance_verify (push) Has been cancelled Details Test / test_rebalance_verify_imm (push) Has been cancelled Details Test / test_rebalance_verify_ec (push) Has been cancelled Details Test / test_rebalance_verify_ec_imm (push) Has been cancelled Details Test / test_switch_primary (push) Has been cancelled Details Test / test_write (push) Has been cancelled Details Test / test_write_xor (push) Has been cancelled Details Test / test_write_no_same (push) Has been cancelled Details Test / test_heal_pg_size_2 (push) Has been cancelled Details Test / test_heal_ec (push) Has been cancelled Details Test / test_heal_csum_32k_dmj (push) Has been cancelled Details Test / test_heal_csum_32k_dj (push) Has been cancelled Details Test / test_heal_csum_32k (push) Has been cancelled Details Test / test_heal_csum_4k_dmj (push) Has been cancelled Details Test / test_heal_csum_4k_dj (push) Has been cancelled Details Test / test_heal_csum_4k (push) Has been cancelled Details Test / test_scrub (push) Has been cancelled Details Test / test_scrub_zero_osd_2 (push) Has been cancelled Details Test / test_scrub_xor (push) Has been cancelled Details Test / test_scrub_pg_size_3 (push) Has been cancelled Details Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled Details Test / test_scrub_ec (push) Has been cancelled Details Test / buildenv (push) Has been cancelled Details	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	1025d1c777	Implement min/max list_count to make listings during performance test reasonable	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	f4cd609bd7	Fix and improve parallel allocation - Do not try to allocate more DB blocks in an inode block until it's "confirmed" and "locked" by the first write - Do not recheck for new zero DB blocks on first write into an inode block - a CAS failure means someone else is already writing into it - Throw new allocation blocks away regardless of whether the known_version is 0 on a CAS failure	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	2ded2f08b4	Implement key_prefix for K/V stress test	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	8aff78dfa7	More fixes - do not overwrite a block with older version if known version is newer (read may start before update and end after update) - invalidated block versions can't be remembered and trusted - right boundary for split blocks is right_half when diving down, not key_lt - restart update also when block is "invalidated", not just on version mismatch - copy callback in listings to avoid closure destruction bugs too	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	7e868da353	Add logging and one more assert	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	32f9b29498	Make get_block() wait for updating when unrelated block is found along the path	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	961865a24e	Fix a race condition where changed blocks were parsed over existing cached blocks and getting a mix of data	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	9794d3fa73	Simplify code by removing an unneeded "optimisation"	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	aacd664208	Add kv_log_level, print warnings on level 1, trace ops on level 10	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	df01ae3de4	Fix duplicate keys in listings on parallel updates -- do not rewind key "iterator position"	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	52d0c2b7c8	Implement key suffix to avoid collisions of multiple test workers	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	e64b3f7806	Do not complain on empty first block	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	eab2b7c91b	Add JSON output for stress-tester	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	589863833a	Print total stats	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	95eb08be91	Do not send more than op_count operations (fix segfault on finish)	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	85377b5418	Add some more resiliency to serialize()	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	a2ae812ac9	Invalidate blocks being updated too	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	093d32b226	Change new block allocation method: make each writer choose multiple empty PG blocks and place blocks in them	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	37d0960906	Remove blocks from cache on unsuccessful updates	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	dac250295c	Allow to track multiple updates per block (it should never happen though)	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	5d6d352b61	Do not call stop_updating after failed write_new_block and after clear_block (both delete the item)	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	cf80f73abb	Track versions of parent blocks and recheck if changed during update	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	018c37dea5	Fix resume_split condition (key_lt can also be "")	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	2272573883	Experiment: transform offsets for better sharding	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	5d3515d66b	More post-stress-test fixes - Prevent _split types of new blocks - Stop updating new blocks only after the whole update, otherwise pointers may become invalid - Use recheck_none for updates initially - Use UINT64_MAX as initial block version when postponing ops, otherwise the check fails when the block is initially empty. This for example leads to writing both leaf items & block pointers (which is incorrect) into the root block when starting stress-test with --parallelism 32 - Fix -EINTR comparison	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	ea9f59b8fe	Print operation statistics	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	544af504f7	K/V fixes after stress-test :-) - track block versions correctly - per inode block (128kb) instead of tree block (4kb) - prevent multiple parallel CAS writes of the same inode block - add logging for EILSEQ which means invalid data in the tree - fix get_block updated flag which was true for blocks already in cache and was leading to infinite loops on "unrelated block" errors - apply changes to blocks in cache only after successful writes (using "virtual changes") - do not replace cached block with an older version from disk - recheck "unrelated blocks" (read/update collisions) until data stops changing - track tree path correctly - do not treat split block as parent of its right half - correctly move blocks when finding new empty place on disk - restart updates from the beginning when one of blocks is changed by a parallel update - fix delete using SET opcode and setting key to the empty value instead - prevent changing the same key more than 1 time in parallel - fix listing verification - resume continue_updates in update_find (required because it uses continue_update itself) - add allow_old_cached parameter to get()	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	f804851e22	Implement K/V DB stress tester	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	2f1483ce26	Evict blocks based on memory limit & block usage	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	516629ac26	Track blocks per level	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	29a198d7d6	Track block level	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	4f5bb7936d	Experimental B-Tree Vitastor embedded K/V database implementation!	2024-01-20 14:07:58 +03:00
Vitaliy Filippov	ba55f91409	Release 1.4.1 Test / test_move_reappear (push) Successful in 22s Details Test / test_snapshot_chain (push) Successful in 1m27s Details Test / test_interrupted_rebalance_ec (push) Successful in 4m41s Details Test / test_snapshot_down (push) Successful in 25s Details Test / test_snapshot_chain_ec (push) Successful in 2m0s Details Test / test_splitbrain (push) Successful in 18s Details Test / test_snapshot_down_ec (push) Successful in 25s Details Test / test_rebalance_verify_ec (push) Failing after 2m21s Details Test / test_rebalance_verify_imm (push) Successful in 2m30s Details Test / test_switch_primary (push) Successful in 39s Details Test / test_write (push) Successful in 35s Details Test / test_interrupted_rebalance (push) Failing after 10m8s Details Test / test_write_xor (push) Successful in 36s Details Test / test_write_no_same (push) Successful in 17s Details Test / test_rebalance_verify_ec_imm (push) Successful in 4m4s Details Test / test_heal_pg_size_2 (push) Successful in 3m55s Details Test / test_rebalance_verify (push) Successful in 8m31s Details Test / test_heal_ec (push) Successful in 5m9s Details Test / test_heal_csum_32k_dmj (push) Successful in 4m27s Details Test / test_heal_csum_32k (push) Successful in 5m42s Details Test / test_heal_csum_32k_dj (push) Successful in 6m1s Details Test / test_scrub (push) Successful in 59s Details Test / test_scrub_zero_osd_2 (push) Successful in 38s Details Test / test_heal_csum_4k_dmj (push) Successful in 7m5s Details Test / test_scrub_xor (push) Successful in 58s Details Test / test_heal_csum_4k_dj (push) Successful in 6m25s Details Test / test_scrub_ec (push) Failing after 42s Details Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m32s Details Test / test_scrub_pg_size_3 (push) Successful in 1m38s Details Test / test_heal_csum_4k (push) Successful in 5m38s Details - Fix a monitor crash on primary OSD switching introduced in 1.4.0 - Fix "partly outside 
array bounds" warnings for GCC 12 in cpp-btree - Fix a realloc memory leak in theory possible with too large listings (OSD_OP_LIST)	2024-01-18 02:31:42 +03:00
Vitaliy Filippov	80aac39513	Add detailed formula for theoretical EC N+K random write performance	2024-01-18 00:36:32 +03:00
Vitaliy Filippov	2aa5aa7ab6	Add a test for simple master switching without PG reconfiguration Test / test_move_reappear (push) Successful in 20s Details Test / test_snapshot_chain (push) Successful in 1m27s Details Test / test_snapshot_down (push) Successful in 23s Details Test / test_snapshot_chain_ec (push) Successful in 1m56s Details Test / test_snapshot_down_ec (push) Successful in 23s Details Test / test_splitbrain (push) Successful in 17s Details Test / test_interrupted_rebalance_ec (push) Successful in 6m40s Details Test / test_interrupted_rebalance (push) Successful in 8m12s Details Test / test_rebalance_verify_imm (push) Successful in 3m12s Details Test / test_switch_primary (push) Successful in 34s Details Test / test_write (push) Successful in 46s Details Test / test_rebalance_verify_ec (push) Successful in 3m18s Details Test / test_rebalance_verify_ec_imm (push) Successful in 2m42s Details Test / test_write_no_same (push) Successful in 15s Details Test / test_rebalance_verify (push) Successful in 6m36s Details Test / test_heal_ec (push) Successful in 5m2s Details Test / test_heal_csum_32k_dmj (push) Successful in 4m33s Details Test / test_heal_csum_32k_dj (push) Successful in 5m58s Details Test / test_heal_csum_32k (push) Successful in 6m6s Details Test / test_scrub (push) Successful in 47s Details Test / test_heal_csum_4k_dmj (push) Successful in 6m17s Details Test / test_scrub_zero_osd_2 (push) Successful in 43s Details Test / test_scrub_xor (push) Successful in 47s Details Test / test_heal_csum_4k_dj (push) Successful in 6m44s Details Test / test_scrub_ec (push) Successful in 41s Details Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Successful in 1m18s Details Test / test_scrub_pg_size_3 (push) Successful in 2m11s Details Test / test_heal_csum_4k (push) Successful in 6m12s Details Test / test_heal_pg_size_2 (push) Successful in 3m16s Details Test / test_write_xor (push) Successful in 34s Details Also use osd_out_time:1 only in select 
tests and restart mon in tests only on connection errors	2024-01-17 00:19:01 +03:00
Vitaliy Filippov	3ca3b8a8d8	Fix recheck_pgs bug introduced in 1.4.0 Test / test_rm (push) Successful in 14s Details Test / test_interrupted_rebalance_ec (push) Successful in 3m27s Details Test / test_snapshot_chain (push) Successful in 1m24s Details Test / test_snapshot_down (push) Successful in 25s Details Test / test_snapshot_chain_ec (push) Successful in 1m54s Details Test / test_snapshot_down_ec (push) Successful in 20s Details Test / test_splitbrain (push) Successful in 15s Details Test / test_rebalance_verify_imm (push) Successful in 2m42s Details Test / test_etcd_fail (push) Failing after 10m8s Details Test / test_interrupted_rebalance (push) Failing after 10m9s Details Test / test_write (push) Successful in 1m22s Details Test / test_rebalance_verify_ec (push) Failing after 1m51s Details Test / test_write_no_same (push) Successful in 16s Details Test / test_rebalance_verify_ec_imm (push) Successful in 3m27s Details Test / test_write_xor (push) Failing after 3m13s Details Test / test_heal_pg_size_2 (push) Successful in 3m22s Details Test / test_rebalance_verify (push) Failing after 10m9s Details Test / test_heal_ec (push) Successful in 4m41s Details Test / test_heal_csum_32k_dmj (push) Successful in 4m42s Details Test / test_heal_csum_32k_dj (push) Successful in 4m58s Details Test / test_heal_csum_32k (push) Successful in 6m34s Details Test / test_scrub (push) Successful in 54s Details Test / test_heal_csum_4k_dmj (push) Successful in 6m56s Details Test / test_scrub_zero_osd_2 (push) Successful in 49s Details Test / test_heal_csum_4k_dj (push) Successful in 6m1s Details Test / test_scrub_ec (push) Has been cancelled Details Test / test_heal_csum_4k (push) Has been cancelled Details Test / test_scrub_pg_size_6_pg_minsize_4_osd_count_6_ec (push) Has been cancelled Details Test / test_scrub_xor (push) Has been cancelled Details Test / test_scrub_pg_size_3 (push) Has been cancelled Details	2024-01-16 23:49:21 +03:00