Merge pull request #14838 from serathius/linearizability-docs
tests: Document robustness testsstorage-doc
commit
5223d09d41
|
@ -33,18 +33,19 @@ jobs:
|
||||||
|
|
||||||
case "${GITHUB_REF}" in
|
case "${GITHUB_REF}" in
|
||||||
release-3.5)
|
release-3.5)
|
||||||
make build-failpoints-release-3.5
|
make /tmp/etcd-release-3.5-failpoints/bin/etcd
|
||||||
./bin/etcd --version
|
cp /tmp/etcd-release-3.5-failpoints/bin/etcd bin/etcd
|
||||||
;;
|
;;
|
||||||
release-3.4)
|
release-3.4)
|
||||||
make build-failpoints-release-3.4
|
make /tmp/etcd-release-3.4-failpoints/bin/etcd
|
||||||
./bin/etcd --version
|
cp /tmp/etcd-release-3.4-failpoints/bin/etcd bin/etcd
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
make gofail-enable
|
make gofail-enable
|
||||||
make build
|
make build
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
./bin/etcd --version
|
||||||
- name: test-robustness
|
- name: test-robustness
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
50
Makefile
50
Makefile
|
@ -1,3 +1,6 @@
|
||||||
|
all: build
|
||||||
|
include tests/robustness/makefile.mk
|
||||||
|
|
||||||
.PHONY: build
|
.PHONY: build
|
||||||
build:
|
build:
|
||||||
GO_BUILD_FLAGS="${GO_BUILD_FLAGS} -v" ./scripts/build.sh
|
GO_BUILD_FLAGS="${GO_BUILD_FLAGS} -v" ./scripts/build.sh
|
||||||
|
@ -113,53 +116,6 @@ verify-genproto:
|
||||||
verify-goimport:
|
verify-goimport:
|
||||||
PASSES="goimport" ./scripts/test.sh
|
PASSES="goimport" ./scripts/test.sh
|
||||||
|
|
||||||
# Failpoints
|
|
||||||
|
|
||||||
GOFAIL_VERSION = $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/gofail)
|
|
||||||
|
|
||||||
.PHONY: gofail-enable
|
|
||||||
gofail-enable: install-gofail
|
|
||||||
gofail enable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/
|
|
||||||
cd ./server && go get go.etcd.io/gofail@${GOFAIL_VERSION}
|
|
||||||
cd ./etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
|
|
||||||
cd ./etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
|
|
||||||
cd ./tests && go get go.etcd.io/gofail@${GOFAIL_VERSION}
|
|
||||||
|
|
||||||
.PHONY: gofail-disable
|
|
||||||
gofail-disable: install-gofail
|
|
||||||
gofail disable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/
|
|
||||||
cd ./server && go mod tidy
|
|
||||||
cd ./etcdutl && go mod tidy
|
|
||||||
cd ./etcdctl && go mod tidy
|
|
||||||
cd ./tests && go mod tidy
|
|
||||||
|
|
||||||
.PHONY: install-gofail
|
|
||||||
install-gofail:
|
|
||||||
cd tools/mod; go install go.etcd.io/gofail@${GOFAIL_VERSION}
|
|
||||||
|
|
||||||
build-failpoints-release-3.5:
|
|
||||||
rm -rf /tmp/etcd-release-3.5/
|
|
||||||
mkdir -p /tmp/etcd-release-3.5/
|
|
||||||
cd /tmp/etcd-release-3.5/; \
|
|
||||||
git clone --depth 1 --branch release-3.5 https://github.com/etcd-io/etcd.git .; \
|
|
||||||
go get go.etcd.io/gofail@${GOFAIL_VERSION}; \
|
|
||||||
(cd server; go get go.etcd.io/gofail@${GOFAIL_VERSION}); \
|
|
||||||
(cd etcdctl; go get go.etcd.io/gofail@${GOFAIL_VERSION}); \
|
|
||||||
(cd etcdutl; go get go.etcd.io/gofail@${GOFAIL_VERSION}); \
|
|
||||||
FAILPOINTS=true ./build;
|
|
||||||
mkdir -p ./bin
|
|
||||||
cp /tmp/etcd-release-3.5/bin/etcd ./bin/etcd
|
|
||||||
|
|
||||||
build-failpoints-release-3.4:
|
|
||||||
rm -rf /tmp/etcd-release-3.4/
|
|
||||||
mkdir -p /tmp/etcd-release-3.4/
|
|
||||||
cd /tmp/etcd-release-3.4/; \
|
|
||||||
git clone --depth 1 --branch release-3.4 https://github.com/etcd-io/etcd.git .; \
|
|
||||||
go get go.etcd.io/gofail@${GOFAIL_VERSION}; \
|
|
||||||
FAILPOINTS=true ./build;
|
|
||||||
mkdir -p ./bin
|
|
||||||
cp /tmp/etcd-release-3.4/bin/etcd ./bin/etcd
|
|
||||||
|
|
||||||
# Cleanup
|
# Cleanup
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
|
|
|
@ -129,7 +129,7 @@ function e2e_pass {
|
||||||
|
|
||||||
function robustness_pass {
|
function robustness_pass {
|
||||||
# e2e tests are running pre-build binary. Settings like --race,-cover,-cpu does not have any impact.
|
# e2e tests are running pre-build binary. Settings like --race,-cover,-cpu does not have any impact.
|
||||||
run_for_module "tests" go_test "./robustness/..." "keep_going" : -timeout="${TIMEOUT:-30m}" "${RUN_ARG[@]}" "$@"
|
run_for_module "tests" go_test "./robustness" "keep_going" : -timeout="${TIMEOUT:-30m}" "${RUN_ARG[@]}" "$@"
|
||||||
}
|
}
|
||||||
|
|
||||||
function integration_e2e_pass {
|
function integration_e2e_pass {
|
||||||
|
|
|
@ -315,7 +315,7 @@ function go_test {
|
||||||
additional_flags=$(${flags_for_package_func} ${pkg})
|
additional_flags=$(${flags_for_package_func} ${pkg})
|
||||||
|
|
||||||
# shellcheck disable=SC2206
|
# shellcheck disable=SC2206
|
||||||
local cmd=( go test ${goTestFlags} ${additional_flags} "$@" ${pkg} )
|
local cmd=( go test ${goTestFlags} ${additional_flags} ${pkg} "$@" )
|
||||||
|
|
||||||
# shellcheck disable=SC2086
|
# shellcheck disable=SC2086
|
||||||
if ! run env ${goTestEnv} ETCD_VERIFY="${ETCD_VERIFY}" "${cmd[@]}" | tee ${junit_filename_prefix:+"${junit_filename_prefix}.stdout"} | grep --binary-files=text "${go_test_grep_pattern}" ; then
|
if ! run env ${goTestEnv} ETCD_VERIFY="${ETCD_VERIFY}" "${cmd[@]}" | tee ${junit_filename_prefix:+"${junit_filename_prefix}.stdout"} | grep --binary-files=text "${go_test_grep_pattern}" ; then
|
||||||
|
|
|
@ -0,0 +1,71 @@
|
||||||
|
# etcd Robustness Testing
|
||||||
|
|
||||||
|
The purpose of etcd robustness tests is to validate that etcd upholds
|
||||||
|
[API guarantees] and [watch guarantees] under any condition or failure.
|
||||||
|
|
||||||
|
Robustness tests achieve that by comparing etcd cluster behavior against a simplified model.
|
||||||
|
Multiple tests encompass different etcd cluster setups, client traffic types and failures experienced by the cluster.
|
||||||
|
During a single test we create a cluster and inject failures while sending and recording client traffic.
|
||||||
|
Correctness is validated by running collected history of client operations against the etcd model and a set of validators.
|
||||||
|
Upon failure, tests generate a report that can be used to determine whether the failure was caused by a bug in etcd or in the test framework.
|
||||||
|
|
||||||
|
[API guarantees]: https://etcd.io/docs/latest/learning/api_guarantees/
|
||||||
|
[watch guarantees]: https://etcd.io/docs/latest/learning/api/#watch-streams
|
||||||
|
|
||||||
|
## Running locally
|
||||||
|
|
||||||
|
1. Build etcd with failpoints
|
||||||
|
```bash
|
||||||
|
make gofail-enable
|
||||||
|
make build
|
||||||
|
make gofail-disable
|
||||||
|
```
|
||||||
|
2. Run the tests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make test-robustness
|
||||||
|
```
|
||||||
|
|
||||||
|
Optionally you can pass environment variables:
|
||||||
|
* `GO_TEST_FLAGS` - to pass additional arguments to `go test`.
|
||||||
|
It is recommended to run tests multiple times with failfast enabled. This can be done by setting `GO_TEST_FLAGS='--count=100 --failfast'`.
|
||||||
|
* `EXPECT_DEBUG=true` - to get logs from the cluster.
|
||||||
|
* `RESULTS_DIR` - to change location where results report will be saved.
|
||||||
|
|
||||||
|
## Analysing failure
|
||||||
|
|
||||||
|
If robustness tests fail, we want to analyse the report to confirm whether the issue is on the etcd side. The location of this report
|
||||||
|
is included in the test logs. The relevant log lines should look like:
|
||||||
|
```
|
||||||
|
history.go:34: Model is not linearizable
|
||||||
|
logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving member data dir {"member": "TestRobustnessIssue14370-test-0", "path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0"}
|
||||||
|
logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving watch responses {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/responses.json"}
|
||||||
|
logger.go:130: 2023-03-18T12:18:03.247+0100 INFO Saving watch events {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/events.json"}
|
||||||
|
logger.go:130: 2023-03-18T12:18:03.248+0100 INFO Saving operation history {"path": "/tmp/TestRobustness_Issue14370/full-history.json"}
|
||||||
|
logger.go:130: 2023-03-18T12:18:03.252+0100 INFO Saving operation history {"path": "/tmp/TestRobustness_Issue14370/patched-history.json"}
|
||||||
|
logger.go:130: 2023-03-18T12:18:03.256+0100 INFO Saving visualization {"path": "/tmp/TestRobustness_Issue14370/history.html"}
|
||||||
|
```
|
||||||
|
|
||||||
|
Report includes multiple types of files:
|
||||||
|
* Member db files, can be used to verify disk/memory corruption.
|
||||||
|
* Watch responses saved as json, can be used to validate [watch guarantees].
|
||||||
|
* Operation history saved as both html visualization and a json, can be used to validate [API guarantees].
|
||||||
|
|
||||||
|
### Example analysis of linearization issue
|
||||||
|
|
||||||
|
Let's analyse issue [#14370].
|
||||||
|
To reproduce the issue yourself, run `make test-robustness-issue14370`.
|
||||||
|
After a couple of tries, robustness tests should report `Model is not linearizable` and save the report locally.
|
||||||
|
Linearization issues are easiest to analyse via the history visualization.
|
||||||
|
Open `/tmp/TestRobustness_Issue14370/history.html` file in your browser.
|
||||||
|
Jump to the error in linearization by clicking `[ jump to first error ]` on the top of the page.
|
||||||
|
|
||||||
|
You should see a graph similar to the one on the image below.
|
||||||
|
![issue14370](./issue14370.png)
|
||||||
|
|
||||||
|
The last correct request (connected with a grey line) is a `Put` request that succeeded and got revision `168`.
|
||||||
|
All following requests are invalid (connected with a red line) as they have revision `167`.
|
||||||
|
etcd guarantees that the revision is non-decreasing, so this shows a bug in etcd, as the revision should never decrease.
|
||||||
|
This is consistent with the root cause of [#14370], as it was an issue with a process crash causing the last write to be lost.
|
||||||
|
|
||||||
|
[#14370]: https://github.com/etcd-io/etcd/issues/14370
|
Binary file not shown.
After Width: | Height: | Size: 301 KiB |
|
@ -0,0 +1,92 @@
|
||||||
|
# Reproduce historical issues
|
||||||
|
|
||||||
|
# Try to reproduce etcd issue #14370 with the robustness tests.
# Runs TestRobustness/Issue14370 with --count 100 --failfast against the
# binaries in /tmp/etcd-v3.5.4-failpoints/bin, which is a prerequisite of this
# target.
# A failing test run is the expected outcome: the recipe prints
# "Successful reproduction" when the nested test run fails and
# "Failed to reproduce" when it passes. Either way the recipe's exit status is
# that of the final echo, not of the tests.
# The nested build is invoked through $(MAKE) so that flags such as -n and the
# jobserver are passed down to it.
.PHONY: test-robustness-issue14370
test-robustness-issue14370: /tmp/etcd-v3.5.4-failpoints/bin
	GO_TEST_FLAGS='-v --run=TestRobustness/Issue14370 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.4-failpoints/bin' $(MAKE) test-robustness && \
	  echo "Failed to reproduce" || echo "Successful reproduction"
|
||||||
|
|
||||||
|
# Try to reproduce etcd issue #13766 with the robustness tests.
# Runs TestRobustness/Issue13766 with --count 100 --failfast against the
# binaries in /tmp/etcd-v3.5.2-failpoints/bin, which is a prerequisite of this
# target.
# A failing test run is the expected outcome: the recipe prints
# "Successful reproduction" when the nested test run fails and
# "Failed to reproduce" when it passes. Either way the recipe's exit status is
# that of the final echo, not of the tests.
# The nested build is invoked through $(MAKE) so that flags such as -n and the
# jobserver are passed down to it.
.PHONY: test-robustness-issue13766
test-robustness-issue13766: /tmp/etcd-v3.5.2-failpoints/bin
	GO_TEST_FLAGS='-v --run=TestRobustness/Issue13766 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.2-failpoints/bin' $(MAKE) test-robustness && \
	  echo "Failed to reproduce" || echo "Successful reproduction"
|
||||||
|
|
||||||
|
# Try to reproduce etcd issue #14685 with the robustness tests.
# Runs TestRobustness/Issue14685 with --count 100 --failfast against the
# binaries in /tmp/etcd-v3.5.5-failpoints/bin, which is a prerequisite of this
# target.
# A failing test run is the expected outcome: the recipe prints
# "Successful reproduction" when the nested test run fails and
# "Failed to reproduce" when it passes. Either way the recipe's exit status is
# that of the final echo, not of the tests.
# The nested build is invoked through $(MAKE) so that flags such as -n and the
# jobserver are passed down to it.
.PHONY: test-robustness-issue14685
test-robustness-issue14685: /tmp/etcd-v3.5.5-failpoints/bin
	GO_TEST_FLAGS='-v --run=TestRobustness/Issue14685 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.5-failpoints/bin' $(MAKE) test-robustness && \
	  echo "Failed to reproduce" || echo "Successful reproduction"
|
||||||
|
|
||||||
|
# Failpoints
|
||||||
|
|
||||||
|
# Version of go.etcd.io/gofail selected by the tools/mod module, as printed by
# `go list -m`.
# Simply expanded (:=) so the `go list` command runs once while the makefile is
# parsed, rather than once for every ${GOFAIL_VERSION} reference in a recipe.
GOFAIL_VERSION := $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/gofail)
|
||||||
|
|
||||||
|
# Switch failpoints on in the working tree.
# First runs `gofail enable` over the four server packages listed in the
# recipe, then adds go.etcd.io/gofail@${GOFAIL_VERSION} with `go get` to the
# server, etcdutl, etcdctl and tests modules, in that order.
# `set -e` stops the loop at the first module whose `go get` fails.
.PHONY: gofail-enable
gofail-enable: install-gofail
	gofail enable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/
	set -e; for module in server etcdutl etcdctl tests; do \
	  (cd ./$$module && go get go.etcd.io/gofail@${GOFAIL_VERSION}); \
	done
|
||||||
|
|
||||||
|
# Switch failpoints off in the working tree.
# First runs `gofail disable` over the four server packages listed in the
# recipe, then runs `go mod tidy` in the server, etcdutl, etcdctl and tests
# modules, in that order.
# `set -e` stops the loop at the first module whose `go mod tidy` fails.
.PHONY: gofail-disable
gofail-disable: install-gofail
	gofail disable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/
	set -e; for module in server etcdutl etcdctl tests; do \
	  (cd ./$$module && go mod tidy); \
	done
|
||||||
|
|
||||||
|
# Install the gofail tool at ${GOFAIL_VERSION} with `go install`, run from
# tools/mod.
# The two commands are joined with && so that a failed `cd` aborts the recipe
# instead of running `go install` from the wrong directory.
.PHONY: install-gofail
install-gofail:
	cd tools/mod && go install go.etcd.io/gofail@${GOFAIL_VERSION}
|
||||||
|
|
||||||
|
# Build previous releases for robustness tests
|
||||||
|
|
||||||
|
# Build etcd from the main branch with failpoints enabled, in a checkout under
# /tmp/etcd-v3.6.0-failpoints/.
# Any previous checkout is removed first, so each run of this recipe starts
# from a fresh shallow clone.
# The steps are chained with && so that a failed clone or a failed
# `make gofail-enable` fails the recipe; previously only the status of the
# final `make build` was reported, so a binary without failpoints could be
# built and treated as success.
# NOTE(review): the nested builds are left as plain `make`. Using $(MAKE) would
# cause this whole command line, including the clone, to be executed under
# `make -n`.
/tmp/etcd-v3.6.0-failpoints/bin:
	rm -rf /tmp/etcd-v3.6.0-failpoints/
	mkdir -p /tmp/etcd-v3.6.0-failpoints/
	cd /tmp/etcd-v3.6.0-failpoints/ && \
	  git clone --depth 1 --branch main https://github.com/etcd-io/etcd.git . && \
	  make gofail-enable && \
	  make build
|
||||||
|
|
||||||
|
# Build a tagged v3.5.x release with failpoints enabled.
# The pattern stem ($*) is the patch number: /tmp/etcd-v3.5.4-failpoints/bin is
# built from tag v3.5.4 in a fresh shallow clone under
# /tmp/etcd-v3.5.4-failpoints/ (any previous checkout is removed first).
# The three recipe-less entries name the patch releases that the
# test-robustness-issue* targets list as prerequisites.
# go.etcd.io/gofail@${GOFAIL_VERSION} is added to the root, server, etcdctl and
# etcdutl modules before running the release's own ./build script with
# FAILPOINTS=true.
# The steps are chained with && so that the first failing step fails the
# recipe; previously only the status of ./build was reported.
/tmp/etcd-v3.5.2-failpoints/bin:
/tmp/etcd-v3.5.4-failpoints/bin:
/tmp/etcd-v3.5.5-failpoints/bin:
/tmp/etcd-v3.5.%-failpoints/bin:
	rm -rf /tmp/etcd-v3.5.$*-failpoints/
	mkdir -p /tmp/etcd-v3.5.$*-failpoints/
	cd /tmp/etcd-v3.5.$*-failpoints/ && \
	  git clone --depth 1 --branch v3.5.$* https://github.com/etcd-io/etcd.git . && \
	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
	  (cd server && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  (cd etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  (cd etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  FAILPOINTS=true ./build
|
||||||
|
|
||||||
|
# Build the tip of the release-3.5 branch with failpoints enabled, in a fresh
# shallow clone under /tmp/etcd-release-3.5-failpoints/ (any previous checkout
# is removed first).
# go.etcd.io/gofail@${GOFAIL_VERSION} is added to the root, server, etcdctl and
# etcdutl modules before running the branch's own ./build script with
# FAILPOINTS=true.
# The steps are chained with && so that the first failing step fails the
# recipe; previously only the status of ./build was reported.
# NOTE(review): the target is a regular file with no prerequisites, so once it
# exists make will not rebuild it even if the branch has moved on.
/tmp/etcd-release-3.5-failpoints/bin/etcd:
	rm -rf /tmp/etcd-release-3.5-failpoints/
	mkdir -p /tmp/etcd-release-3.5-failpoints/
	cd /tmp/etcd-release-3.5-failpoints/ && \
	  git clone --depth 1 --branch release-3.5 https://github.com/etcd-io/etcd.git . && \
	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
	  (cd server && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  (cd etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  (cd etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  FAILPOINTS=true ./build
|
||||||
|
|
||||||
|
# Build a tagged v3.4.x release with failpoints enabled.
# The pattern stem ($*) is the patch number: /tmp/etcd-v3.4.23-failpoints/bin
# is built from tag v3.4.23 in a fresh shallow clone under
# /tmp/etcd-v3.4.23-failpoints/ (any previous checkout is removed first).
# The recipe-less entry names one concrete patch release (v3.4.23); what
# depends on it is not visible in this file.
# go.etcd.io/gofail@${GOFAIL_VERSION} is added to the root module only, then
# the release's own ./build script is run with FAILPOINTS=true.
# The steps are chained with && so that the first failing step fails the
# recipe; previously only the status of ./build was reported.
/tmp/etcd-v3.4.23-failpoints/bin:
/tmp/etcd-v3.4.%-failpoints/bin:
	rm -rf /tmp/etcd-v3.4.$*-failpoints/
	mkdir -p /tmp/etcd-v3.4.$*-failpoints/
	cd /tmp/etcd-v3.4.$*-failpoints/ && \
	  git clone --depth 1 --branch v3.4.$* https://github.com/etcd-io/etcd.git . && \
	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
	  FAILPOINTS=true ./build
|
||||||
|
|
||||||
|
# Build the tip of the release-3.4 branch with failpoints enabled, in a fresh
# shallow clone under /tmp/etcd-release-3.4-failpoints/ (any previous checkout
# is removed first).
# go.etcd.io/gofail@${GOFAIL_VERSION} is added to the root module only, then
# the branch's own ./build script is run with FAILPOINTS=true.
# The steps are chained with && so that the first failing step fails the
# recipe; previously only the status of ./build was reported.
# NOTE(review): the target is a regular file with no prerequisites, so once it
# exists make will not rebuild it even if the branch has moved on.
/tmp/etcd-release-3.4-failpoints/bin/etcd:
	rm -rf /tmp/etcd-release-3.4-failpoints/
	mkdir -p /tmp/etcd-release-3.4-failpoints/
	cd /tmp/etcd-release-3.4-failpoints/ && \
	  git clone --depth 1 --branch release-3.4 https://github.com/etcd-io/etcd.git . && \
	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
	  FAILPOINTS=true ./build
|
Loading…
Reference in New Issue