From 2b863fb715071816bc91578720dd721668e47951 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sun, 7 Apr 2024 18:58:44 +0300 Subject: [PATCH] Add ENOSPC handling tests --- .gitea/workflows/test.yml | 72 ++++++++++++++++++++++++++++++++++++++ docs/config/client.en.md | 10 ++++++ docs/config/client.ru.md | 10 ++++++ docs/config/src/client.yml | 10 ++++++ mon/mon.js | 1 + src/cluster_client.cpp | 4 ++- src/cluster_client.h | 1 + tests/run_tests.sh | 5 +++ tests/test_enospc.sh | 30 ++++++++++++++++ 9 files changed, 142 insertions(+), 1 deletion(-) create mode 100755 tests/test_enospc.sh diff --git a/.gitea/workflows/test.yml b/.gitea/workflows/test.yml index 0635c3bd..d77d0946 100644 --- a/.gitea/workflows/test.yml +++ b/.gitea/workflows/test.yml @@ -748,6 +748,78 @@ jobs: echo "" done + test_enospc: + runs-on: ubuntu-latest + needs: build + container: ${{env.TEST_IMAGE}}:${{github.sha}} + steps: + - name: Run test + id: test + timeout-minutes: 3 + run: /root/vitastor/tests/test_enospc.sh + - name: Print logs + if: always() && steps.test.outcome == 'failure' + run: | + for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do + echo "-------- $i --------" + cat $i + echo "" + done + + test_enospc_xor: + runs-on: ubuntu-latest + needs: build + container: ${{env.TEST_IMAGE}}:${{github.sha}} + steps: + - name: Run test + id: test + timeout-minutes: 3 + run: SCHEME=xor /root/vitastor/tests/test_enospc.sh + - name: Print logs + if: always() && steps.test.outcome == 'failure' + run: | + for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do + echo "-------- $i --------" + cat $i + echo "" + done + + test_enospc_imm: + runs-on: ubuntu-latest + needs: build + container: ${{env.TEST_IMAGE}}:${{github.sha}} + steps: + - name: Run test + id: test + timeout-minutes: 3 + run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_enospc.sh + - name: Print logs + if: always() && steps.test.outcome == 'failure' + run: | + for i in 
/root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do + echo "-------- $i --------" + cat $i + echo "" + done + + test_enospc_imm_xor: + runs-on: ubuntu-latest + needs: build + container: ${{env.TEST_IMAGE}}:${{github.sha}} + steps: + - name: Run test + id: test + timeout-minutes: 3 + run: IMMEDIATE_COMMIT=1 SCHEME=xor /root/vitastor/tests/test_enospc.sh + - name: Print logs + if: always() && steps.test.outcome == 'failure' + run: | + for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do + echo "-------- $i --------" + cat $i + echo "" + done + test_scrub: runs-on: ubuntu-latest needs: build diff --git a/docs/config/client.en.md b/docs/config/client.en.md index 149f48ba..04d2565f 100644 --- a/docs/config/client.en.md +++ b/docs/config/client.en.md @@ -11,6 +11,7 @@ affect their interaction with the cluster. - [client_retry_interval](#client_retry_interval) - [client_eio_retry_interval](#client_eio_retry_interval) +- [client_retry_enospc](#client_retry_enospc) - [client_max_dirty_bytes](#client_max_dirty_bytes) - [client_max_dirty_ops](#client_max_dirty_ops) - [client_enable_writeback](#client_enable_writeback) @@ -41,6 +42,15 @@ Retry time for I/O requests failed due to data corruption or unfinished EC object deletions (has_incomplete PG state). 0 disables such retries and clients are not blocked and just get EIO error code instead. +## client_retry_enospc + +- Type: boolean +- Default: true +- Can be changed online: yes + +Retry writes on out of space errors to wait until some space is freed on +OSDs. 
+ ## client_max_dirty_bytes - Type: integer diff --git a/docs/config/client.ru.md b/docs/config/client.ru.md index e714fe31..97b0bbbc 100644 --- a/docs/config/client.ru.md +++ b/docs/config/client.ru.md @@ -11,6 +11,7 @@ - [client_retry_interval](#client_retry_interval) - [client_eio_retry_interval](#client_eio_retry_interval) +- [client_retry_enospc](#client_retry_enospc) - [client_max_dirty_bytes](#client_max_dirty_bytes) - [client_max_dirty_ops](#client_max_dirty_ops) - [client_enable_writeback](#client_enable_writeback) @@ -42,6 +43,15 @@ 0 отключает повторы таких запросов и клиенты не блокируются, а вместо этого просто получают код ошибки EIO. +## client_retry_enospc + +- Тип: булево (да/нет) +- Значение по умолчанию: true +- Можно менять на лету: да + +Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е. +ожидать, пока на OSD не освободится место. + ## client_max_dirty_bytes - Тип: целое число diff --git a/docs/config/src/client.yml b/docs/config/src/client.yml index 7ca579cb..800ead04 100644 --- a/docs/config/src/client.yml +++ b/docs/config/src/client.yml @@ -22,6 +22,16 @@ или незавершённых удалений EC-объектов (состояния PG has_incomplete). 0 отключает повторы таких запросов и клиенты не блокируются, а вместо этого просто получают код ошибки EIO. +- name: client_retry_enospc + type: bool + default: true + online: true + info: | + Retry writes on out of space errors to wait until some space is freed on + OSDs. + info_ru: | + Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е. + ожидать, пока на OSD не освободится место. - name: client_max_dirty_bytes type: int default: 33554432 diff --git a/mon/mon.js b/mon/mon.js index 7d5a8d48..ef2331a1 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -91,6 +91,7 @@ const etcd_tree = { client_max_writeback_iodepth: 256, client_retry_interval: 50, // ms. 
min: 10 client_eio_retry_interval: 1000, // ms + client_retry_enospc: true, // client and osd - configurable online log_level: 0, peer_connect_interval: 5, // seconds. min: 1 diff --git a/src/cluster_client.cpp b/src/cluster_client.cpp index f04ed219..3a92031b 100644 --- a/src/cluster_client.cpp +++ b/src/cluster_client.cpp @@ -398,6 +398,8 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co client_eio_retry_interval = 10; } } + // client_retry_enospc + client_retry_enospc = config["client_retry_enospc"].is_null() ? true : config["client_retry_enospc"].bool_value(); // log_level log_level = config["log_level"].uint64_value(); msgr.parse_config(config); @@ -818,7 +820,7 @@ resume_2: return 1; } else if (op->retval != 0 && !(op->flags & OP_FLUSH_BUFFER) && - op->retval != -EPIPE && (op->retval != -EIO || !client_eio_retry_interval) && op->retval != -ENOSPC) + op->retval != -EPIPE && (op->retval != -EIO || !client_eio_retry_interval) && (op->retval != -ENOSPC || !client_retry_enospc)) { // Fatal error (neither -EPIPE, -EIO nor -ENOSPC) erase_op(op); diff --git a/src/cluster_client.h b/src/cluster_client.h index 7f9d8e3f..c77c328a 100644 --- a/src/cluster_client.h +++ b/src/cluster_client.h @@ -94,6 +94,7 @@ class cluster_client_t int log_level = 0; int client_retry_interval = 50; // ms int client_eio_retry_interval = 1000; // ms + bool client_retry_enospc = true; int retry_timeout_id = 0; int retry_timeout_duration = 0; diff --git a/tests/run_tests.sh b/tests/run_tests.sh index 5212792e..36775989 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -62,6 +62,11 @@ TEST_NAME=csum_4k_dmj OSD_ARGS="--data_csum_type crc32c --inmemory_metadata fal TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh +./test_enospc.sh +SCHEME=xor ./test_enospc.sh +IMMEDIATE_COMMIT=1 ./test_enospc.sh 
+IMMEDIATE_COMMIT=1 SCHEME=xor ./test_enospc.sh + ./test_scrub.sh ZERO_OSD=2 ./test_scrub.sh SCHEME=xor ./test_scrub.sh diff --git a/tests/test_enospc.sh b/tests/test_enospc.sh new file mode 100755 index 00000000..7f795236 --- /dev/null +++ b/tests/test_enospc.sh @@ -0,0 +1,30 @@ +#!/bin/bash -ex + +OSD_SIZE=200 +GLOBAL_CONFIG=',"client_retry_enospc":false' + +. `dirname $0`/run_3osds.sh + +export LD_PRELOAD="build/src/libfio_vitastor.so" + +# Should fail with ENOSPC: a 500M sequential write is expected to overfill the OSD_SIZE=200 OSDs, and client_retry_enospc=false makes the client return the error instead of retrying +if fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \ + -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=500M -cluster_log_level=10; then + format_error "Should get ENOSPC, but didn't" +fi + +# Should fail with ENOSPC too: the same write is repeated (with a deeper queue) so that it overwrites the first objects of the inode, the idea being to check their rollback when space runs out +if fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=32 \ + -rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=500M -cluster_log_level=10; then + format_error "Should get ENOSPC, but didn't" +fi + +# Should complete OK: 4096 random 4k writes within the first 100M of the same inode must still succeed after the ENOSPC failures above +if ! fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=4 \ + -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=100M -cluster_log_level=10 -number_ios=4096; then + format_error "Should do random writes over ENOSPC correctly, but got an error" +fi + +export -n LD_PRELOAD + +format_green OK