Merge pull request #14838 from serathius/linearizability-docs

tests: Document robustness tests
storage-doc
Marek Siarkowicz 2023-03-28 16:22:09 +02:00 committed by GitHub
commit 5223d09d41
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 173 additions and 53 deletions

View File

@ -33,18 +33,19 @@ jobs:
case "${GITHUB_REF}" in
release-3.5)
make build-failpoints-release-3.5
./bin/etcd --version
make /tmp/etcd-release-3.5-failpoints/bin/etcd
cp /tmp/etcd-release-3.5-failpoints/bin/etcd bin/etcd
;;
release-3.4)
make build-failpoints-release-3.4
./bin/etcd --version
make /tmp/etcd-release-3.4-failpoints/bin/etcd
cp /tmp/etcd-release-3.4-failpoints/bin/etcd bin/etcd
;;
*)
make gofail-enable
make build
;;
esac
./bin/etcd --version
- name: test-robustness
run: |
set -euo pipefail

View File

@ -1,3 +1,6 @@
all: build
include tests/robustness/makefile.mk
.PHONY: build
build:
GO_BUILD_FLAGS="${GO_BUILD_FLAGS} -v" ./scripts/build.sh
@ -113,53 +116,6 @@ verify-genproto:
verify-goimport:
PASSES="goimport" ./scripts/test.sh
# Failpoints
GOFAIL_VERSION = $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/gofail)
.PHONY: gofail-enable
gofail-enable: install-gofail
gofail enable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/
cd ./server && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./tests && go get go.etcd.io/gofail@${GOFAIL_VERSION}
.PHONY: gofail-disable
gofail-disable: install-gofail
gofail disable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/
cd ./server && go mod tidy
cd ./etcdutl && go mod tidy
cd ./etcdctl && go mod tidy
cd ./tests && go mod tidy
.PHONY: install-gofail
install-gofail:
cd tools/mod; go install go.etcd.io/gofail@${GOFAIL_VERSION}
build-failpoints-release-3.5:
rm -rf /tmp/etcd-release-3.5/
mkdir -p /tmp/etcd-release-3.5/
cd /tmp/etcd-release-3.5/; \
git clone --depth 1 --branch release-3.5 https://github.com/etcd-io/etcd.git .; \
go get go.etcd.io/gofail@${GOFAIL_VERSION}; \
(cd server; go get go.etcd.io/gofail@${GOFAIL_VERSION}); \
(cd etcdctl; go get go.etcd.io/gofail@${GOFAIL_VERSION}); \
(cd etcdutl; go get go.etcd.io/gofail@${GOFAIL_VERSION}); \
FAILPOINTS=true ./build;
mkdir -p ./bin
cp /tmp/etcd-release-3.5/bin/etcd ./bin/etcd
build-failpoints-release-3.4:
rm -rf /tmp/etcd-release-3.4/
mkdir -p /tmp/etcd-release-3.4/
cd /tmp/etcd-release-3.4/; \
git clone --depth 1 --branch release-3.4 https://github.com/etcd-io/etcd.git .; \
go get go.etcd.io/gofail@${GOFAIL_VERSION}; \
FAILPOINTS=true ./build;
mkdir -p ./bin
cp /tmp/etcd-release-3.4/bin/etcd ./bin/etcd
# Cleanup
clean:

View File

@ -129,7 +129,7 @@ function e2e_pass {
function robustness_pass {
# e2e tests are running pre-build binary. Settings like --race,-cover,-cpu does not have any impact.
run_for_module "tests" go_test "./robustness/..." "keep_going" : -timeout="${TIMEOUT:-30m}" "${RUN_ARG[@]}" "$@"
run_for_module "tests" go_test "./robustness" "keep_going" : -timeout="${TIMEOUT:-30m}" "${RUN_ARG[@]}" "$@"
}
function integration_e2e_pass {

View File

@ -315,7 +315,7 @@ function go_test {
additional_flags=$(${flags_for_package_func} ${pkg})
# shellcheck disable=SC2206
local cmd=( go test ${goTestFlags} ${additional_flags} "$@" ${pkg} )
local cmd=( go test ${goTestFlags} ${additional_flags} ${pkg} "$@" )
# shellcheck disable=SC2086
if ! run env ${goTestEnv} ETCD_VERIFY="${ETCD_VERIFY}" "${cmd[@]}" | tee ${junit_filename_prefix:+"${junit_filename_prefix}.stdout"} | grep --binary-files=text "${go_test_grep_pattern}" ; then

View File

@ -0,0 +1,71 @@
# etcd Robustness Testing
The purpose of the etcd robustness tests is to validate that etcd upholds
[API guarantees] and [watch guarantees] under any condition or failure.
Robustness tests achieve that by comparing etcd cluster behavior against a simplified model.
Multiple tests encompass different etcd cluster setups, client traffic types and failures experienced by the cluster.
During a single test we create a cluster and inject failures while sending and recording client traffic.
Correctness is validated by running the collected history of client operations against the etcd model and a set of validators.
Upon failure, the tests generate a report that can be used to determine whether the failure was caused by a bug in etcd or in the test framework.
[API guarantees]: https://etcd.io/docs/latest/learning/api_guarantees/
[watch guarantees]: https://etcd.io/docs/latest/learning/api/#watch-streams
## Running locally
1. Build etcd with failpoints
```bash
make gofail-enable
make build
make gofail-disable
```
2. Run the tests
```bash
make test-robustness
```
Optionally you can pass environment variables:
* `GO_TEST_FLAGS` - to pass additional arguments to `go test`.
It is recommended to run tests multiple times with failfast enabled. This can be done by setting `GO_TEST_FLAGS='--count=100 --failfast'`.
* `EXPECT_DEBUG=true` - to get logs from the cluster.
* `RESULTS_DIR` - to change location where results report will be saved.
## Analysing failure
If the robustness tests fail, we want to analyse the report to confirm whether the issue is on the etcd side. The location of this report
is included in the test logs. Some of the log lines should look like:
```
history.go:34: Model is not linearizable
logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving member data dir {"member": "TestRobustnessIssue14370-test-0", "path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0"}
logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving watch responses {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/responses.json"}
logger.go:130: 2023-03-18T12:18:03.247+0100 INFO Saving watch events {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/events.json"}
logger.go:130: 2023-03-18T12:18:03.248+0100 INFO Saving operation history {"path": "/tmp/TestRobustness_Issue14370/full-history.json"}
logger.go:130: 2023-03-18T12:18:03.252+0100 INFO Saving operation history {"path": "/tmp/TestRobustness_Issue14370/patched-history.json"}
logger.go:130: 2023-03-18T12:18:03.256+0100 INFO Saving visualization {"path": "/tmp/TestRobustness_Issue14370/history.html"}
```
The report includes multiple types of files:
* Member db files, which can be used to verify disk/memory corruption.
* Watch responses saved as json, which can be used to validate [watch guarantees].
* Operation history saved as both an html visualization and a json file, which can be used to validate [API guarantees].
### Example analysis of linearization issue
Let's analyse issue [#14370].
To reproduce the issue yourself, run `make test-robustness-issue14370`.
After a couple of tries, the robustness tests should report `Model is not linearizable` and save the report locally.
Linearization issues are easiest to analyse via the history visualization.
Open the `/tmp/TestRobustness_Issue14370/history.html` file in your browser.
Jump to the error in linearization by clicking `[ jump to first error ]` at the top of the page.
You should see a graph similar to the one in the image below.
![issue14370](./issue14370.png)
The last correct request (connected with a grey line) is a `Put` request that succeeded and got revision `168`.
All following requests are invalid (connected with a red line) as they have revision `167`.
etcd guarantees that the revision is non-decreasing, so this shows a bug in etcd, as there is no way the revision should decrease.
This is consistent with the root cause of [#14370], which was a process crash causing the last write to be lost.
[#14370]: https://github.com/etcd-io/etcd/issues/14370

Binary file not shown.

After

Width:  |  Height:  |  Size: 301 KiB

View File

@ -0,0 +1,92 @@
# Reproduce historical issues
# Tries to reproduce issue 14370: runs the TestRobustness/Issue14370 scenario
# with `--count 100 --failfast` against the binaries in
# /tmp/etcd-v3.5.4-failpoints/bin.
# A failing test run is the desired outcome here, so the messages are inverted:
# passing tests print "Failed to reproduce". Because the command list always
# ends in an `echo`, the recipe itself exits successfully either way.
# The nested run goes through $(MAKE) so that flags such as -n and the
# jobserver are handed down to the sub-make.
.PHONY: test-robustness-issue14370
test-robustness-issue14370: /tmp/etcd-v3.5.4-failpoints/bin
	GO_TEST_FLAGS='-v --run=TestRobustness/Issue14370 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.4-failpoints/bin' $(MAKE) test-robustness && \
	  echo "Failed to reproduce" || echo "Successful reproduction"
# Tries to reproduce issue 13766: runs the TestRobustness/Issue13766 scenario
# with `--count 100 --failfast` against the binaries in
# /tmp/etcd-v3.5.2-failpoints/bin.
# A failing test run is the desired outcome here, so the messages are inverted:
# passing tests print "Failed to reproduce". Because the command list always
# ends in an `echo`, the recipe itself exits successfully either way.
# The nested run goes through $(MAKE) so that flags such as -n and the
# jobserver are handed down to the sub-make.
.PHONY: test-robustness-issue13766
test-robustness-issue13766: /tmp/etcd-v3.5.2-failpoints/bin
	GO_TEST_FLAGS='-v --run=TestRobustness/Issue13766 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.2-failpoints/bin' $(MAKE) test-robustness && \
	  echo "Failed to reproduce" || echo "Successful reproduction"
# Tries to reproduce issue 14685: runs the TestRobustness/Issue14685 scenario
# with `--count 100 --failfast` against the binaries in
# /tmp/etcd-v3.5.5-failpoints/bin.
# A failing test run is the desired outcome here, so the messages are inverted:
# passing tests print "Failed to reproduce". Because the command list always
# ends in an `echo`, the recipe itself exits successfully either way.
# The nested run goes through $(MAKE) so that flags such as -n and the
# jobserver are handed down to the sub-make.
.PHONY: test-robustness-issue14685
test-robustness-issue14685: /tmp/etcd-v3.5.5-failpoints/bin
	GO_TEST_FLAGS='-v --run=TestRobustness/Issue14685 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.5-failpoints/bin' $(MAKE) test-robustness && \
	  echo "Failed to reproduce" || echo "Successful reproduction"
# Failpoints
# Version of go.etcd.io/gofail as reported by `go list -m` when run inside
# tools/mod.
# The assignment is recursively expanded (`=`), so the `go list` command is not
# run while the makefile is parsed; it runs each time the variable is expanded.
GOFAIL_VERSION = $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/gofail)
# Runs `gofail enable` on the listed server packages, then runs `go get` for
# gofail@GOFAIL_VERSION inside the server, etcdutl, etcdctl and tests
# directories. The loop stops at the first directory whose command fails and
# propagates that exit status.
.PHONY: gofail-enable
gofail-enable: install-gofail
	gofail enable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/
	for module in server etcdutl etcdctl tests; do \
	  (cd ./$$module && go get go.etcd.io/gofail@${GOFAIL_VERSION}) || exit $$?; \
	done
# Runs `gofail disable` on the listed server packages, then runs `go mod tidy`
# inside the server, etcdutl, etcdctl and tests directories. The loop stops at
# the first directory whose command fails and propagates that exit status.
.PHONY: gofail-disable
gofail-disable: install-gofail
	gofail disable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/
	for module in server etcdutl etcdctl tests; do \
	  (cd ./$$module && go mod tidy) || exit $$?; \
	done
# Installs the gofail tool at GOFAIL_VERSION, running `go install` from inside
# tools/mod.
# `&&` (rather than `;`) ensures that a failed `cd` aborts the recipe instead of
# running `go install` from the wrong directory.
.PHONY: install-gofail
install-gofail:
	cd tools/mod && go install go.etcd.io/gofail@${GOFAIL_VERSION}
# Build previous releases for robustness tests
# Clones the `main` branch of etcd into a fresh /tmp/etcd-v3.6.0-failpoints/
# checkout and runs its `gofail-enable` and `build` targets there.
# Every step is its own recipe line, so the first failure (e.g. a failed clone)
# aborts the rule instead of being masked by a later command.
# The sub-makes use $(MAKE) -C so that make flags and the jobserver are
# propagated, and so that `make -n` does not execute the clone as a side effect
# of sharing a line with $(MAKE).
/tmp/etcd-v3.6.0-failpoints/bin:
	rm -rf /tmp/etcd-v3.6.0-failpoints/
	mkdir -p /tmp/etcd-v3.6.0-failpoints/
	git clone --depth 1 --branch main https://github.com/etcd-io/etcd.git /tmp/etcd-v3.6.0-failpoints/
	$(MAKE) -C /tmp/etcd-v3.6.0-failpoints/ gofail-enable
	$(MAKE) -C /tmp/etcd-v3.6.0-failpoints/ build
# Builds a v3.5.x etcd release with FAILPOINTS=true in
# /tmp/etcd-v3.5.<patch>-failpoints/. The stem `$*` is the patch number and
# selects the git tag that is cloned.
# The three explicit targets below carry no recipe of their own, so make falls
# back to implicit-rule search and builds them with the pattern rule.
# NOTE(review): presumably they are listed to name the versions that other
# targets depend on - confirm.
# Commands are chained with `&&` so that a failed clone, `go get` or `cd`
# aborts the recipe; with `;` only the exit status of the final ./build counted.
/tmp/etcd-v3.5.2-failpoints/bin:
/tmp/etcd-v3.5.4-failpoints/bin:
/tmp/etcd-v3.5.5-failpoints/bin:
/tmp/etcd-v3.5.%-failpoints/bin:
	rm -rf /tmp/etcd-v3.5.$*-failpoints/
	mkdir -p /tmp/etcd-v3.5.$*-failpoints/
	cd /tmp/etcd-v3.5.$*-failpoints/ && \
	  git clone --depth 1 --branch v3.5.$* https://github.com/etcd-io/etcd.git . && \
	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
	  (cd server && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  (cd etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  (cd etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  FAILPOINTS=true ./build
# Builds etcd from the tip of the `release-3.5` branch with FAILPOINTS=true in
# a fresh /tmp/etcd-release-3.5-failpoints/ checkout.
# Commands are chained with `&&` so that a failed clone, `go get` or `cd`
# aborts the recipe; with `;` only the exit status of the final ./build counted.
/tmp/etcd-release-3.5-failpoints/bin/etcd:
	rm -rf /tmp/etcd-release-3.5-failpoints/
	mkdir -p /tmp/etcd-release-3.5-failpoints/
	cd /tmp/etcd-release-3.5-failpoints/ && \
	  git clone --depth 1 --branch release-3.5 https://github.com/etcd-io/etcd.git . && \
	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
	  (cd server && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  (cd etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  (cd etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
	  FAILPOINTS=true ./build
# Builds a v3.4.x etcd release with FAILPOINTS=true in
# /tmp/etcd-v3.4.<patch>-failpoints/. The stem `$*` is the patch number and
# selects the git tag that is cloned.
# The explicit v3.4.23 target carries no recipe of its own, so make falls back
# to implicit-rule search and builds it with the pattern rule.
# Commands are chained with `&&` so that a failed clone or `go get` aborts the
# recipe; with `;` only the exit status of the final ./build counted.
/tmp/etcd-v3.4.23-failpoints/bin:
/tmp/etcd-v3.4.%-failpoints/bin:
	rm -rf /tmp/etcd-v3.4.$*-failpoints/
	mkdir -p /tmp/etcd-v3.4.$*-failpoints/
	cd /tmp/etcd-v3.4.$*-failpoints/ && \
	  git clone --depth 1 --branch v3.4.$* https://github.com/etcd-io/etcd.git . && \
	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
	  FAILPOINTS=true ./build
# Builds etcd from the tip of the `release-3.4` branch with FAILPOINTS=true in
# a fresh /tmp/etcd-release-3.4-failpoints/ checkout.
# Commands are chained with `&&` so that a failed clone or `go get` aborts the
# recipe; with `;` only the exit status of the final ./build counted.
/tmp/etcd-release-3.4-failpoints/bin/etcd:
	rm -rf /tmp/etcd-release-3.4-failpoints/
	mkdir -p /tmp/etcd-release-3.4-failpoints/
	cd /tmp/etcd-release-3.4-failpoints/ && \
	  git clone --depth 1 --branch release-3.4 https://github.com/etcd-io/etcd.git . && \
	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
	  FAILPOINTS=true ./build