Add ENOSPC handling tests

master
Vitaliy Filippov 2024-04-07 18:58:44 +03:00
parent 3bf4dd5abd
commit 2b863fb715
9 changed files with 142 additions and 1 deletions

View File

@ -748,6 +748,78 @@ jobs:
echo ""
done
# CI job: runs tests/test_enospc.sh without extra environment variables.
# Starts after the `build` job, in the container image tagged with the current commit SHA.
test_enospc:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
# The test step is limited to 3 minutes
timeout-minutes: 3
run: /root/vitastor/tests/test_enospc.sh
# Runs only when the "test" step failed: prints every *.log and *.txt file from testdata
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
# CI job: runs tests/test_enospc.sh with SCHEME=xor.
# Starts after the `build` job, in the container image tagged with the current commit SHA.
test_enospc_xor:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
# The test step is limited to 3 minutes
timeout-minutes: 3
run: SCHEME=xor /root/vitastor/tests/test_enospc.sh
# Runs only when the "test" step failed: prints every *.log and *.txt file from testdata
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
# CI job: runs tests/test_enospc.sh with IMMEDIATE_COMMIT=1.
# Starts after the `build` job, in the container image tagged with the current commit SHA.
test_enospc_imm:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
# The test step is limited to 3 minutes
timeout-minutes: 3
run: IMMEDIATE_COMMIT=1 /root/vitastor/tests/test_enospc.sh
# Runs only when the "test" step failed: prints every *.log and *.txt file from testdata
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
# CI job: runs tests/test_enospc.sh with both IMMEDIATE_COMMIT=1 and SCHEME=xor.
# Starts after the `build` job, in the container image tagged with the current commit SHA.
test_enospc_imm_xor:
runs-on: ubuntu-latest
needs: build
container: ${{env.TEST_IMAGE}}:${{github.sha}}
steps:
- name: Run test
id: test
# The test step is limited to 3 minutes
timeout-minutes: 3
run: IMMEDIATE_COMMIT=1 SCHEME=xor /root/vitastor/tests/test_enospc.sh
# Runs only when the "test" step failed: prints every *.log and *.txt file from testdata
- name: Print logs
if: always() && steps.test.outcome == 'failure'
run: |
for i in /root/vitastor/testdata/*.log /root/vitastor/testdata/*.txt; do
echo "-------- $i --------"
cat $i
echo ""
done
test_scrub:
runs-on: ubuntu-latest
needs: build

View File

@ -11,6 +11,7 @@ affect their interaction with the cluster.
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
@ -41,6 +42,15 @@ Retry time for I/O requests failed due to data corruption or unfinished
EC object deletions (has_incomplete PG state). 0 disables such retries
and clients are not blocked and just get EIO error code instead.
## client_retry_enospc
- Type: boolean
- Default: true
- Can be changed online: yes
Retry writes that fail with out-of-space (ENOSPC) errors, i.e. wait until
some space is freed on OSDs.
## client_max_dirty_bytes
- Type: integer

View File

@ -11,6 +11,7 @@
- [client_retry_interval](#client_retry_interval)
- [client_eio_retry_interval](#client_eio_retry_interval)
- [client_retry_enospc](#client_retry_enospc)
- [client_max_dirty_bytes](#client_max_dirty_bytes)
- [client_max_dirty_ops](#client_max_dirty_ops)
- [client_enable_writeback](#client_enable_writeback)
@ -42,6 +43,15 @@
0 отключает повторы таких запросов и клиенты не блокируются, а вместо
этого просто получают код ошибки EIO.
## client_retry_enospc
- Тип: булево (да/нет)
- Значение по умолчанию: true
- Можно менять на лету: да
Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
ожидать, пока на OSD не освободится место.
## client_max_dirty_bytes
- Тип: целое число

View File

@ -22,6 +22,16 @@
или незавершённых удалений EC-объектов (состояния PG has_incomplete).
0 отключает повторы таких запросов и клиенты не блокируются, а вместо
этого просто получают код ошибки EIO.
- name: client_retry_enospc
type: bool
default: true
online: true
info: |
Retry writes that fail with out-of-space (ENOSPC) errors, i.e. wait until
some space is freed on OSDs.
info_ru: |
Повторять запросы записи, завершившиеся с ошибками нехватки места, т.е.
ожидать, пока на OSD не освободится место.
- name: client_max_dirty_bytes
type: int
default: 33554432

View File

@ -91,6 +91,7 @@ const etcd_tree = {
client_max_writeback_iodepth: 256,
client_retry_interval: 50, // ms. min: 10
client_eio_retry_interval: 1000, // ms
client_retry_enospc: true,
// client and osd - configurable online
log_level: 0,
peer_connect_interval: 5, // seconds. min: 1

View File

@ -398,6 +398,8 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & etcd_global_co
client_eio_retry_interval = 10;
}
}
// client_retry_enospc
client_retry_enospc = config["client_retry_enospc"].is_null() ? true : config["client_retry_enospc"].bool_value();
// log_level
log_level = config["log_level"].uint64_value();
msgr.parse_config(config);
@ -818,7 +820,7 @@ resume_2:
return 1;
}
else if (op->retval != 0 && !(op->flags & OP_FLUSH_BUFFER) &&
op->retval != -EPIPE && (op->retval != -EIO || !client_eio_retry_interval) && op->retval != -ENOSPC)
op->retval != -EPIPE && (op->retval != -EIO || !client_eio_retry_interval) && (op->retval != -ENOSPC || !client_retry_enospc))
{
// Fatal error (not -EPIPE, and not an -EIO or -ENOSPC for which retries are enabled)
erase_op(op);

View File

@ -94,6 +94,7 @@ class cluster_client_t
int log_level = 0;
int client_retry_interval = 50; // ms
int client_eio_retry_interval = 1000; // ms
bool client_retry_enospc = true;
int retry_timeout_id = 0;
int retry_timeout_duration = 0;

View File

@ -62,6 +62,11 @@ TEST_NAME=csum_4k_dmj OSD_ARGS="--data_csum_type crc32c --inmemory_metadata fal
TEST_NAME=csum_4k_dj OSD_ARGS="--data_csum_type crc32c --inmemory_journal false" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
TEST_NAME=csum_4k OSD_ARGS="--data_csum_type crc32c" OFFSET_ARGS=$OSD_ARGS ./test_heal.sh
./test_enospc.sh
SCHEME=xor ./test_enospc.sh
IMMEDIATE_COMMIT=1 ./test_enospc.sh
IMMEDIATE_COMMIT=1 SCHEME=xor ./test_enospc.sh
./test_scrub.sh
ZERO_OSD=2 ./test_scrub.sh
SCHEME=xor ./test_scrub.sh

30
tests/test_enospc.sh Executable file
View File

@ -0,0 +1,30 @@
#!/bin/bash -ex
# ENOSPC handling test.
# Overfills the pool twice with sequential writes (both runs must fail), then
# checks that small random writes over the same image still complete.

# Both variables are assigned before run_3osds.sh is sourced
# (presumably that script reads them - verify against run_3osds.sh).
# The extra global config turns client_retry_enospc off.
GLOBAL_CONFIG=',"client_retry_enospc":false'
OSD_SIZE=200

. $(dirname $0)/run_3osds.sh

# Runs fio with the vitastor ioengine against pool 1, inode 1.
# Arguments: <block size> <iodepth> <rw mode> <size> [extra fio options...]
# The exit status is fio's exit status.
vitastor_fio_write()
{
    local blk=$1 depth=$2 mode=$3 total=$4
    shift 4
    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=$blk -direct=1 -iodepth=$depth \
        -rw=$mode -etcd=$ETCD_URL -pool=1 -inode=1 -size=$total -cluster_log_level=10 "$@"
}

# Keep the fio engine library preloaded for the commands below
LD_PRELOAD="build/src/libfio_vitastor.so"
export LD_PRELOAD

# 1) A 500M sequential write is expected to fail
if vitastor_fio_write 1M 4 write 500M; then
    format_error "Should get ENOSPC, but didn't"
fi

# 2) Same write again with a deeper queue: it is expected to fail as well.
#    It starts by overwriting the first objects, which exercises their rollback.
if vitastor_fio_write 1M 32 write 500M; then
    format_error "Should get ENOSPC, but didn't"
fi

# 3) 4096 random 4k writes are expected to succeed
if ! vitastor_fio_write 4k 4 randwrite 100M -number_ios=4096; then
    format_error "Should do random writes over ENOSPC correctly, but got an error"
fi

# Stop exporting LD_PRELOAD to child processes
export -n LD_PRELOAD

format_green OK