Summary of the hunks below (text before the first "diff --git" header is ignored when the patch is applied):
  * robustness workflow: build release-3.4/3.5 failpoint binaries via the /tmp/etcd-release-*-failpoints/bin/etcd make targets and copy them to bin/etcd; bin/ is created first because the removed build-failpoints-release-* targets used to do "mkdir -p ./bin" and the new targets do not.
  * Makefile: declare "all: build" before including tests/robustness/makefile.mk, and drop the failpoint targets that now live in that include.
  * scripts: robustness_pass runs ./robustness only, and go_test puts caller-supplied arguments after the package.
diff --git a/.github/workflows/robustness-template.yaml b/.github/workflows/robustness-template.yaml index a5fe1fe2b..122552dd1 100644 --- a/.github/workflows/robustness-template.yaml +++ b/.github/workflows/robustness-template.yaml @@ -33,18 +33,21 @@ jobs: case "${GITHUB_REF}" in release-3.5) - make build-failpoints-release-3.5 - ./bin/etcd --version + make /tmp/etcd-release-3.5-failpoints/bin/etcd + mkdir -p bin + cp /tmp/etcd-release-3.5-failpoints/bin/etcd bin/etcd ;; release-3.4) - make build-failpoints-release-3.4 - ./bin/etcd --version + make /tmp/etcd-release-3.4-failpoints/bin/etcd + mkdir -p bin + cp /tmp/etcd-release-3.4-failpoints/bin/etcd bin/etcd ;; *) make gofail-enable make build ;; esac + ./bin/etcd --version - name: test-robustness run: | set -euo pipefail diff --git a/Makefile b/Makefile index cf9d51eb8..cae7e241f 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,6 @@ +all: build +include tests/robustness/makefile.mk + .PHONY: build build: GO_BUILD_FLAGS="${GO_BUILD_FLAGS} -v" ./scripts/build.sh @@ -113,53 +116,6 @@ verify-genproto: verify-goimport: PASSES="goimport" ./scripts/test.sh -# Failpoints - -GOFAIL_VERSION = $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/gofail) - -.PHONY: gofail-enable -gofail-enable: install-gofail - gofail enable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/ - cd ./server && go get go.etcd.io/gofail@${GOFAIL_VERSION} - cd ./etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION} - cd ./etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION} - cd ./tests && go get go.etcd.io/gofail@${GOFAIL_VERSION} - -.PHONY: gofail-disable -gofail-disable: install-gofail - gofail disable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/ - cd ./server && go mod tidy - cd ./etcdutl && go mod tidy - cd ./etcdctl && go mod tidy - cd ./tests && go mod tidy - -.PHONY: install-gofail -install-gofail: - cd tools/mod; go install go.etcd.io/gofail@${GOFAIL_VERSION} - 
-build-failpoints-release-3.5: - rm -rf /tmp/etcd-release-3.5/ - mkdir -p /tmp/etcd-release-3.5/ - cd /tmp/etcd-release-3.5/; \ - git clone --depth 1 --branch release-3.5 https://github.com/etcd-io/etcd.git .; \ - go get go.etcd.io/gofail@${GOFAIL_VERSION}; \ - (cd server; go get go.etcd.io/gofail@${GOFAIL_VERSION}); \ - (cd etcdctl; go get go.etcd.io/gofail@${GOFAIL_VERSION}); \ - (cd etcdutl; go get go.etcd.io/gofail@${GOFAIL_VERSION}); \ - FAILPOINTS=true ./build; - mkdir -p ./bin - cp /tmp/etcd-release-3.5/bin/etcd ./bin/etcd - -build-failpoints-release-3.4: - rm -rf /tmp/etcd-release-3.4/ - mkdir -p /tmp/etcd-release-3.4/ - cd /tmp/etcd-release-3.4/; \ - git clone --depth 1 --branch release-3.4 https://github.com/etcd-io/etcd.git .; \ - go get go.etcd.io/gofail@${GOFAIL_VERSION}; \ - FAILPOINTS=true ./build; - mkdir -p ./bin - cp /tmp/etcd-release-3.4/bin/etcd ./bin/etcd - # Cleanup clean: diff --git a/scripts/test.sh b/scripts/test.sh index 440b12039..991994f2a 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -129,7 +129,7 @@ function e2e_pass { function robustness_pass { # e2e tests are running pre-build binary. Settings like --race,-cover,-cpu does not have any impact. - run_for_module "tests" go_test "./robustness/..." "keep_going" : -timeout="${TIMEOUT:-30m}" "${RUN_ARG[@]}" "$@" + run_for_module "tests" go_test "./robustness" "keep_going" : -timeout="${TIMEOUT:-30m}" "${RUN_ARG[@]}" "$@" } function integration_e2e_pass { diff --git a/scripts/test_lib.sh b/scripts/test_lib.sh index 46e582f1c..5bdb19cad 100644 --- a/scripts/test_lib.sh +++ b/scripts/test_lib.sh @@ -315,7 +315,7 @@ function go_test { additional_flags=$(${flags_for_package_func} ${pkg}) # shellcheck disable=SC2206 - local cmd=( go test ${goTestFlags} ${additional_flags} "$@" ${pkg} ) + local cmd=( go test ${goTestFlags} ${additional_flags} ${pkg} "$@" ) # shellcheck disable=SC2086 if ! 
run env ${goTestEnv} ETCD_VERIFY="${ETCD_VERIFY}" "${cmd[@]}" | tee ${junit_filename_prefix:+"${junit_filename_prefix}.stdout"} | grep --binary-files=text "${go_test_grep_pattern}" ; then diff --git a/tests/robustness/README.md b/tests/robustness/README.md new file mode 100644 index 000000000..e92b0cdfb --- /dev/null +++ b/tests/robustness/README.md @@ -0,0 +1,71 @@ +# etcd Robustness Testing + +The purpose of etcd robustness tests is to validate that etcd upholds +[API guarantees] and [watch guarantees] under any condition or failure. + +Robustness tests achieve that by comparing etcd cluster behavior against a simplified model. +Multiple tests encompass different etcd cluster setups, client traffic types and failures experienced by the cluster. +During a single test we create a cluster and inject failures while sending and recording client traffic. +Correctness is validated by running the collected history of client operations against the etcd model and a set of validators. +Upon failure, tests generate a report that can be used to attribute whether the failure was caused by a bug in etcd or in the test framework. + +[API guarantees]: https://etcd.io/docs/latest/learning/api_guarantees/ +[watch guarantees]: https://etcd.io/docs/latest/learning/api/#watch-streams + +## Running locally + +1. Build etcd with failpoints + ```bash + make gofail-enable + make build + make gofail-disable + ``` +2. Run the tests + + ```bash + make test-robustness + ``` + + Optionally you can pass environment variables: + * `GO_TEST_FLAGS` - to pass additional arguments to `go test`. + It is recommended to run tests multiple times with failfast enabled. This can be done by setting `GO_TEST_FLAGS='--count=100 --failfast'`. + * `EXPECT_DEBUG=true` - to get logs from the cluster. + * `RESULTS_DIR` - to change the location where the results report will be saved. + +## Analysing failure + +If robustness tests fail we want to analyse the report to confirm if the issue is on etcd side. 
The location of this report +is included in the test logs. One of the log lines should look like: +``` + history.go:34: Model is not linearizable + logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving member data dir {"member": "TestRobustnessIssue14370-test-0", "path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0"} + logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving watch responses {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/responses.json"} + logger.go:130: 2023-03-18T12:18:03.247+0100 INFO Saving watch events {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/events.json"} + logger.go:130: 2023-03-18T12:18:03.248+0100 INFO Saving operation history {"path": "/tmp/TestRobustness_Issue14370/full-history.json"} + logger.go:130: 2023-03-18T12:18:03.252+0100 INFO Saving operation history {"path": "/tmp/TestRobustness_Issue14370/patched-history.json"} + logger.go:130: 2023-03-18T12:18:03.256+0100 INFO Saving visualization {"path": "/tmp/TestRobustness_Issue14370/history.html"} +``` + +The report includes multiple types of files: +* Member db files, can be used to verify disk/memory corruption. +* Watch responses saved as json, can be used to validate [watch guarantees]. +* Operation history saved as both html visualization and a json, can be used to validate [API guarantees]. + +### Example analysis of linearization issue + +Let's analyse issue [#14370]. +To reproduce the issue yourself, run `make test-robustness-issue14370`. +After a couple of tries robustness tests should report `Model is not linearizable` and save the report locally. +Linearization issues are easiest to analyse via history visualization. +Open `/tmp/TestRobustness_Issue14370/history.html` file in your browser. +Jump to the error in linearization by clicking `[ jump to first error ]` on the top of the page. + +You should see a graph similar to the one on the image below. 
+![issue14370](./issue14370.png)
+
+Last correct request (connected with grey line) is a `Put` request that succeeded and got revision `168`.
+All following requests are invalid (connected with red line) as they have revision `167`.
+Etcd guarantees that revision is non-decreasing, so this shows a bug in etcd as there is no way revision should decrease.
+This is consistent with the root cause of [#14370] as it was an issue with process crash causing last write to be lost.
+
+[#14370]: https://github.com/etcd-io/etcd/issues/14370
\ No newline at end of file
diff --git a/tests/robustness/issue14370.png b/tests/robustness/issue14370.png
new file mode 100644
index 000000000..2b418c486
Binary files /dev/null and b/tests/robustness/issue14370.png differ
diff --git a/tests/robustness/makefile.mk b/tests/robustness/makefile.mk
new file mode 100644
index 000000000..a75affb55
--- /dev/null
+++ b/tests/robustness/makefile.mk
@@ -0,0 +1,100 @@
+# Reproduce historical issues
+#
+# Each target runs one TestRobustness scenario with --count 100 --failfast
+# against the binaries of the etcd release named in its prerequisite.
+# A passing run means the bug did not show up, hence the inverted messages.
+
+.PHONY: test-robustness-issue14370
+test-robustness-issue14370: /tmp/etcd-v3.5.4-failpoints/bin
+	GO_TEST_FLAGS='-v --run=TestRobustness/Issue14370 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.4-failpoints/bin' make test-robustness && \
+	  echo "Failed to reproduce" || echo "Successful reproduction"
+
+.PHONY: test-robustness-issue13766
+test-robustness-issue13766: /tmp/etcd-v3.5.2-failpoints/bin
+	GO_TEST_FLAGS='-v --run=TestRobustness/Issue13766 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.2-failpoints/bin' make test-robustness && \
+	  echo "Failed to reproduce" || echo "Successful reproduction"
+
+.PHONY: test-robustness-issue14685
+test-robustness-issue14685: /tmp/etcd-v3.5.5-failpoints/bin
+	GO_TEST_FLAGS='-v --run=TestRobustness/Issue14685 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.5-failpoints/bin' make test-robustness && \
+	  echo "Failed to reproduce" || echo "Successful reproduction"
+
+# Failpoints
+
+# Recursively expanded ("="): `go list` in tools/mod only runs when a recipe uses the value.
+GOFAIL_VERSION = $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/gofail)
+
+.PHONY: gofail-enable
+gofail-enable: install-gofail
+	gofail enable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/
+	cd ./server && go get go.etcd.io/gofail@${GOFAIL_VERSION}
+	cd ./etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
+	cd ./etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
+	cd ./tests && go get go.etcd.io/gofail@${GOFAIL_VERSION}
+
+.PHONY: gofail-disable
+gofail-disable: install-gofail
+	gofail disable server/etcdserver/ server/storage/backend/ server/storage/mvcc/ server/storage/wal/
+	cd ./server && go mod tidy
+	cd ./etcdutl && go mod tidy
+	cd ./etcdctl && go mod tidy
+	cd ./tests && go mod tidy
+
+.PHONY: install-gofail
+install-gofail:
+	cd tools/mod && go install go.etcd.io/gofail@${GOFAIL_VERSION}
+
+# Build previous releases for robustness tests
+#
+# The checkout-and-build steps are chained with "&&" so that a failed cd, clone
+# or `go get` fails the target instead of being masked by a later command.
+
+/tmp/etcd-v3.6.0-failpoints/bin:
+	rm -rf /tmp/etcd-v3.6.0-failpoints/
+	mkdir -p /tmp/etcd-v3.6.0-failpoints/
+	cd /tmp/etcd-v3.6.0-failpoints/ && \
+	  git clone --depth 1 --branch main https://github.com/etcd-io/etcd.git . && \
+	  make gofail-enable && \
+	  make build
+
+/tmp/etcd-v3.5.2-failpoints/bin:
+/tmp/etcd-v3.5.4-failpoints/bin:
+/tmp/etcd-v3.5.5-failpoints/bin:
+/tmp/etcd-v3.5.%-failpoints/bin:
+	rm -rf /tmp/etcd-v3.5.$*-failpoints/
+	mkdir -p /tmp/etcd-v3.5.$*-failpoints/
+	cd /tmp/etcd-v3.5.$*-failpoints/ && \
+	  git clone --depth 1 --branch v3.5.$* https://github.com/etcd-io/etcd.git . && \
+	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
+	  (cd server && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
+	  (cd etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
+	  (cd etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
+	  FAILPOINTS=true ./build
+
+/tmp/etcd-release-3.5-failpoints/bin/etcd:
+	rm -rf /tmp/etcd-release-3.5-failpoints/
+	mkdir -p /tmp/etcd-release-3.5-failpoints/
+	cd /tmp/etcd-release-3.5-failpoints/ && \
+	  git clone --depth 1 --branch release-3.5 https://github.com/etcd-io/etcd.git . && \
+	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
+	  (cd server && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
+	  (cd etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
+	  (cd etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}) && \
+	  FAILPOINTS=true ./build
+
+/tmp/etcd-v3.4.23-failpoints/bin:
+/tmp/etcd-v3.4.%-failpoints/bin:
+	rm -rf /tmp/etcd-v3.4.$*-failpoints/
+	mkdir -p /tmp/etcd-v3.4.$*-failpoints/
+	cd /tmp/etcd-v3.4.$*-failpoints/ && \
+	  git clone --depth 1 --branch v3.4.$* https://github.com/etcd-io/etcd.git . && \
+	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
+	  FAILPOINTS=true ./build
+
+/tmp/etcd-release-3.4-failpoints/bin/etcd:
+	rm -rf /tmp/etcd-release-3.4-failpoints/
+	mkdir -p /tmp/etcd-release-3.4-failpoints/
+	cd /tmp/etcd-release-3.4-failpoints/ && \
+	  git clone --depth 1 --branch release-3.4 https://github.com/etcd-io/etcd.git . && \
+	  go get go.etcd.io/gofail@${GOFAIL_VERSION} && \
+	  FAILPOINTS=true ./build