Merge pull request #15284 from serathius/funtional-remove

tests: Remove functional testing as they were replaced by linearizabi…
Marek Siarkowicz 2023-02-13 12:32:04 +01:00 committed by GitHub
commit ed30d5415a
66 changed files with 8 additions and 15764 deletions


@@ -1,33 +0,0 @@
name: functional-arm64-tests
on:
  schedule:
  - cron: '0 0/4 * * *' # runs every 4 hours
permissions: read-all
jobs:
  test:
    runs-on: [Linux, ARM64]
    strategy:
      fail-fast: true
      matrix:
        target:
        - linux-arm64-functional
    steps:
    - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
    - uses: actions/setup-go@6edd4406fa81c3da01a34fa6f6343087c207a568 # v3.5.0
      with:
        ref: main
        go-version: "1.19.5"
    - run: date
    - env:
        TARGET: ${{ matrix.target }}
      run: |
        echo "${TARGET}"
        case "${TARGET}" in
          linux-arm64-functional)
            GO_BUILD_FLAGS='-v -mod=readonly' ./scripts/build.sh && GOARCH=arm64 PASSES='functional' ./scripts/test.sh
            ;;
          *)
            echo "Failed to find target"
            exit 1
            ;;
        esac


@@ -1,30 +0,0 @@
name: functional-tests
on: [push, pull_request]
permissions: read-all
jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: true
      matrix:
        target:
        - linux-amd64-functional
    steps:
    - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
    - uses: actions/setup-go@6edd4406fa81c3da01a34fa6f6343087c207a568 # v3.5.0
      with:
        go-version: "1.19.5"
    - run: date
    - env:
        TARGET: ${{ matrix.target }}
      run: |
        echo "${TARGET}"
        case "${TARGET}" in
          linux-amd64-functional)
            GO_BUILD_FLAGS='-v -mod=readonly' ./scripts/build.sh && GOARCH=amd64 PASSES='functional' ./scripts/test.sh
            ;;
          *)
            echo "Failed to find target"
            exit 1
            ;;
        esac


@@ -23,7 +23,7 @@ etcd is a distributed reliable key-value store for the most critical data of a d
etcd is written in Go and uses the [Raft][] consensus algorithm to manage a highly-available replicated log.
etcd is used [in production by many companies](./ADOPTERS.md), and the development team stands behind it in critical deployment scenarios, where etcd is frequently teamed with applications such as [Kubernetes][k8s], [locksmith][], [vulcand][], [Doorman][], and many others. Reliability is further ensured by [**rigorous testing**](https://github.com/etcd-io/etcd/tree/main/tests/functional).
etcd is used [in production by many companies](./ADOPTERS.md), and the development team stands behind it in critical deployment scenarios, where etcd is frequently teamed with applications such as [Kubernetes][k8s], [locksmith][], [vulcand][], [Doorman][], and many others. Reliability is further ensured by [**rigorous testing**](https://github.com/etcd-io/etcd/tree/main/tests/linearizability).
See [etcdctl][etcdctl] for a simple command line client.


@@ -78,15 +78,6 @@ tools_build() {
"-ldflags=${GO_LDFLAGS[*]}" \
-o="${out}/${tool}" "./${tool}" || return 2
done
tests_build "${@}"
}
tests_build() {
out=${BINDIR:-./bin}
out=$(readlink -f "$out")
out="${out}/functional/cmd"
mkdir -p "${out}"
BINDIR="${out}" run ./tests/functional/build.sh || return 2
}
run_build() {


@@ -36,7 +36,7 @@ echo " - raft-root: ${RAFT_ROOT}"
GOGOPROTO_PATH="${GOGOPROTO_ROOT}:${GOGOPROTO_ROOT}/protobuf"
# directories containing protos to be built
DIRS="./server/storage/wal/walpb ./api/etcdserverpb ./server/etcdserver/api/snap/snappb ./api/mvccpb ./server/lease/leasepb ./api/authpb ./server/etcdserver/api/v3lock/v3lockpb ./server/etcdserver/api/v3election/v3electionpb ./api/membershippb ./tests/functional ./api/versionpb"
DIRS="./server/storage/wal/walpb ./api/etcdserverpb ./server/etcdserver/api/snap/snappb ./api/mvccpb ./server/lease/leasepb ./api/authpb ./server/etcdserver/api/v3lock/v3lockpb ./server/etcdserver/api/v3election/v3electionpb ./api/membershippb ./api/versionpb"
log_callout -e "\\nRunning gofast (gogo) proto generation..."


@@ -151,57 +151,6 @@ function generic_checker {
fi
}
function killall_functional_test {
log_callout "Killing all etcd-agent and etcd processes..."
killall -9 etcd-agent
# When functional test is successful, the etcd processes have already been
# stopped by the agent, so we should ignore the error in this case.
killall -9 etcd || true
}
function functional_pass {
run ./tests/functional/build.sh || exit 1
# Clean up any data and logs from previous runs
rm -rf /tmp/etcd-functional-* /tmp/etcd-functional-*.backup
# TODO: These ports should be dynamically allocated instead of hard-coded.
for a in 1 2 3; do
./bin/etcd-agent --network tcp --address 127.0.0.1:${a}9027 < /dev/null &
done
for a in 1 2 3; do
log_callout "Waiting for 'etcd-agent' on ${a}9027..."
while ! nc -z localhost ${a}9027; do
sleep 1
done
done
trap killall_functional_test 0
log_callout "functional test START!"
run ./bin/etcd-tester --config ./tests/functional/functional.yaml -test.v && log_success "'etcd-tester' succeeded"
local etcd_tester_exit_code=$?
if [[ "${etcd_tester_exit_code}" -ne "0" ]]; then
log_error "ETCD_TESTER_EXIT_CODE:" ${etcd_tester_exit_code}
log_error -e "\\nFAILED! 'tail -100 /tmp/etcd-functional-1/etcd.log'"
tail -100 /tmp/etcd-functional-1/etcd.log
log_error -e "\\nFAILED! 'tail -100 /tmp/etcd-functional-2/etcd.log'"
tail -100 /tmp/etcd-functional-2/etcd.log
log_error -e "\\nFAILED! 'tail -100 /tmp/etcd-functional-3/etcd.log'"
tail -100 /tmp/etcd-functional-3/etcd.log
log_error "--- FAIL: exit code" ${etcd_tester_exit_code}
exit ${etcd_tester_exit_code}
fi
log_success "functional test PASS!"
}
function grpcproxy_pass {
run_pass "grpcproxy_integration" "${@}"
run_pass "grpcproxy_e2e" "${@}"


@@ -1,42 +0,0 @@
FROM fedora:35
RUN dnf check-update || true \
&& dnf install --assumeyes \
git curl wget mercurial meld gcc gcc-c++ which \
gcc automake autoconf dh-autoreconf libtool libtool-ltdl \
tar unzip gzip \
&& dnf check-update || true \
&& dnf upgrade --assumeyes || true \
&& dnf autoremove --assumeyes || true \
&& dnf clean all || true
ENV GOROOT /usr/local/go
ENV GOPATH /go
ENV PATH ${GOPATH}/bin:${GOROOT}/bin:${PATH}
ENV GO_VERSION 1.19.5
ENV GO_DOWNLOAD_URL https://storage.googleapis.com/golang
RUN rm -rf ${GOROOT} \
&& curl -s ${GO_DOWNLOAD_URL}/go${GO_VERSION}.linux-amd64.tar.gz | tar -v -C /usr/local/ -xz \
&& mkdir -p ${GOPATH}/src ${GOPATH}/bin \
&& go version
RUN mkdir -p ${GOPATH}/src/go.etcd.io/etcd
ADD . ${GOPATH}/src/go.etcd.io/etcd
ADD ./tests/functional/functional.yaml /functional.yaml
RUN go get -v go.etcd.io/gofail \
&& pushd ${GOPATH}/src/go.etcd.io/etcd \
&& GO_BUILD_FLAGS="-v" ./scripts/build.sh \
&& mkdir -p /bin \
&& cp ./bin/etcd /bin/etcd \
&& cp ./bin/etcdctl /bin/etcdctl \
&& GO_BUILD_FLAGS="-v" FAILPOINTS=1 ./scripts/build.sh \
&& cp ./bin/etcd /bin/etcd-failpoints \
&& ./tests/functional/build \
&& cp ./bin/etcd-agent /bin/etcd-agent \
&& cp ./bin/etcd-proxy /bin/etcd-proxy \
&& cp ./bin/etcd-runner /bin/etcd-runner \
&& cp ./bin/etcd-tester /bin/etcd-tester \
&& go build -v -o /bin/benchmark ./tools/benchmark \
&& popd \
&& rm -rf ${GOPATH}/src/go.etcd.io/etcd


@@ -1,14 +0,0 @@
s1: bin/etcd --name s1 --data-dir /tmp/etcd-proxy-data.s1 --listen-client-urls http://127.0.0.1:1379 --advertise-client-urls http://127.0.0.1:13790 --listen-peer-urls http://127.0.0.1:1380 --initial-advertise-peer-urls http://127.0.0.1:13800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new
s1-client-proxy: bin/etcd-proxy --from localhost:13790 --to localhost:1379 --http-port 1378
s1-peer-proxy: bin/etcd-proxy --from localhost:13800 --to localhost:1380 --http-port 1381
s2: bin/etcd --name s2 --data-dir /tmp/etcd-proxy-data.s2 --listen-client-urls http://127.0.0.1:2379 --advertise-client-urls http://127.0.0.1:23790 --listen-peer-urls http://127.0.0.1:2380 --initial-advertise-peer-urls http://127.0.0.1:23800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new
s2-client-proxy: bin/etcd-proxy --from localhost:23790 --to localhost:2379 --http-port 2378
s2-peer-proxy: bin/etcd-proxy --from localhost:23800 --to localhost:2380 --http-port 2381
s3: bin/etcd --name s3 --data-dir /tmp/etcd-proxy-data.s3 --listen-client-urls http://127.0.0.1:3379 --advertise-client-urls http://127.0.0.1:33790 --listen-peer-urls http://127.0.0.1:3380 --initial-advertise-peer-urls http://127.0.0.1:33800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new
s3-client-proxy: bin/etcd-proxy --from localhost:33790 --to localhost:3379 --http-port 3378
s3-peer-proxy: bin/etcd-proxy --from localhost:33800 --to localhost:3380 --http-port 3381


@@ -1,218 +0,0 @@
## etcd Functional Testing
[`functional`](https://godoc.org/github.com/coreos/etcd/functional) verifies the correct behavior of etcd under various system and network malfunctions. It sets up an etcd cluster under high-pressure load and continuously injects failures into it, expecting the cluster to recover within a few seconds. This has been extremely helpful in finding critical bugs.
See [`rpcpb.Case`](https://godoc.org/github.com/coreos/etcd/functional/rpcpb#Case) for all failure cases.
See [functional.yaml](https://github.com/etcd-io/etcd/blob/main/tests/functional/functional.yaml) for an example configuration.
### Run locally
```bash
PASSES=functional ./test
```
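Under the hood, `PASSES=functional` builds the functional test binaries and drives them roughly as follows. This is only a sketch mirroring `functional_pass` in `scripts/test.sh`; paths assume the repository root.

```bash
# build etcd itself, then etcd-agent, etcd-proxy, etcd-runner, and etcd-tester
./scripts/build.sh
./tests/functional/build.sh

# start one agent per member on the ports expected by functional.yaml,
# then run a single tester round against that configuration
for port in 19027 29027 39027; do
  ./bin/etcd-agent --network tcp --address 127.0.0.1:${port} &
done
./bin/etcd-tester --config ./tests/functional/functional.yaml -test.v
```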
### Run with Docker
```bash
pushd ..
make build-docker-functional push-docker-functional pull-docker-functional
popd
```
Then run the [example scripts](./scripts):
```bash
# run 3 agents for 3-node local etcd cluster
./functional/scripts/docker-local-agent.sh 1
./functional/scripts/docker-local-agent.sh 2
./functional/scripts/docker-local-agent.sh 3
# to run only 1 tester round
./functional/scripts/docker-local-tester.sh
```
## etcd Proxy
A proxy layer that simulates various network conditions.
Test locally
```bash
$ ./scripts/build.sh
$ ./bin/etcd
$ make build-functional
$ ./bin/etcd-proxy --help
$ ./bin/etcd-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose
$ ./bin/etcdctl --endpoints localhost:2379 put foo bar
$ ./bin/etcdctl --endpoints localhost:23790 put foo bar
```
Proxy overhead per request is under 500μs
```bash
$ go build -v -o ./bin/benchmark ./tools/benchmark
$ ./bin/benchmark \
--endpoints localhost:2379 \
--conns 5 \
--clients 15 \
put \
--key-size 48 \
--val-size 50000 \
--total 10000
<<COMMENT
Summary:
Total: 8.4611 secs.
Slowest: 0.1324 secs.
Fastest: 0.0011 secs.
Average: 0.0121 secs.
Stddev: 0.0125 secs.
Requests/sec: 1181.8758
Response time histogram:
0.0011 [1] |
0.0142 [7899] |∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
0.0273 [1339] |∎∎∎∎∎∎
0.0405 [543] |∎∎
0.0536 [67] |
0.0667 [49] |
0.0798 [9] |
0.0930 [15] |
0.1061 [42] |
0.1192 [21] |
0.1324 [15] |
Latency distribution:
10% in 0.0049 secs.
25% in 0.0064 secs.
50% in 0.0085 secs.
75% in 0.0126 secs.
90% in 0.0243 secs.
95% in 0.0307 secs.
99% in 0.0686 secs.
99.9% in 0.1294 secs.
COMMENT
$ ./bin/benchmark \
--endpoints localhost:23790 \
--conns 5 \
--clients 15 \
put \
--key-size 48 \
--val-size 50000 \
--total 10000
<<COMMENT
Summary:
Total: 9.1128 secs.
Slowest: 0.1363 secs.
Fastest: 0.0015 secs.
Average: 0.0131 secs.
Stddev: 0.0113 secs.
Requests/sec: 1097.3613
Response time histogram:
0.0015 [1] |
0.0150 [7407] |∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
0.0285 [2017] |∎∎∎∎∎∎∎∎∎∎
0.0419 [440] |∎∎
0.0554 [30] |
0.0689 [13] |
0.0824 [12] |
0.0959 [48] |
0.1093 [2] |
0.1228 [16] |
0.1363 [14] |
Latency distribution:
10% in 0.0054 secs.
25% in 0.0071 secs.
50% in 0.0100 secs.
75% in 0.0153 secs.
90% in 0.0241 secs.
95% in 0.0297 secs.
99% in 0.0584 secs.
99.9% in 0.1312 secs.
COMMENT
```
Delay client transmit
```bash
$ curl -L http://localhost:2378/delay-tx -X PUT \
-d "latency=5s&random-variable=100ms"
# added send latency 5s±100ms (current latency 4.92143955s)
$ curl -L http://localhost:2378/delay-tx
# current send latency 4.92143955s
$ ./bin/etcdctl \
--endpoints localhost:23790 \
--command-timeout=3s \
put foo bar
# Error: context deadline exceeded
$ curl -L http://localhost:2378/delay-tx -X DELETE
# removed latency 4.92143955s
$ curl -L http://localhost:2378/delay-tx
# current send latency 0s
$ ./bin/etcdctl \
--endpoints localhost:23790 \
--command-timeout=3s \
put foo bar
# OK
```
Pause client transmit
```bash
$ curl -L http://localhost:2378/pause-tx -X PUT
# paused forwarding [tcp://localhost:23790 -> tcp://localhost:2379]
$ ./bin/etcdctl \
--endpoints localhost:23790 \
put foo bar
# Error: context deadline exceeded
$ curl -L http://localhost:2378/pause-tx -X DELETE
# unpaused forwarding [tcp://localhost:23790 -> tcp://localhost:2379]
```
Drop client packets
```bash
$ curl -L http://localhost:2378/blackhole-tx -X PUT
# blackholed; dropping packets [tcp://localhost:23790 -> tcp://localhost:2379]
$ ./bin/etcdctl --endpoints localhost:23790 put foo bar
# Error: context deadline exceeded
$ curl -L http://localhost:2378/blackhole-tx -X DELETE
# unblackholed; restart forwarding [tcp://localhost:23790 -> tcp://localhost:2379]
```
Trigger leader election
```bash
$ ./scripts/build.sh
$ make build-functional
$ rm -rf /tmp/etcd-proxy-data.s*
$ goreman -f ./functional/Procfile-proxy start
$ ./bin/etcdctl \
--endpoints localhost:13790,localhost:23790,localhost:33790 \
member list
# isolate s1 when s1 is the current leader
$ curl -L http://localhost:1381/blackhole-tx -X PUT
$ curl -L http://localhost:1381/blackhole-rx -X PUT
# s1 becomes follower after election timeout
```


@@ -1,16 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package agent implements the functional-tester agent server.
package agent


@@ -1,702 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package agent
import (
"errors"
"fmt"
"net/url"
"os"
"os/exec"
"path/filepath"
"syscall"
"time"
"go.etcd.io/etcd/client/pkg/v3/fileutil"
"go.etcd.io/etcd/pkg/v3/proxy"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
)
// return an error for system errors (e.g. failing to create files);
// return a status error in the response for wrong configuration/operation (e.g. starting etcd twice)
func (srv *Server) handleTesterRequest(req *rpcpb.Request) (resp *rpcpb.Response, err error) {
defer func() {
if err == nil && req != nil {
srv.last = req.Operation
srv.lg.Info("handler success", zap.String("operation", req.Operation.String()))
}
}()
if req != nil {
srv.Member = req.Member
srv.Tester = req.Tester
}
switch req.Operation {
case rpcpb.Operation_INITIAL_START_ETCD:
return srv.handle_INITIAL_START_ETCD(req)
case rpcpb.Operation_RESTART_ETCD:
return srv.handle_RESTART_ETCD(req)
case rpcpb.Operation_SIGTERM_ETCD:
return srv.handle_SIGTERM_ETCD()
case rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA:
return srv.handle_SIGQUIT_ETCD_AND_REMOVE_DATA()
case rpcpb.Operation_SAVE_SNAPSHOT:
return srv.handle_SAVE_SNAPSHOT()
case rpcpb.Operation_RESTORE_RESTART_FROM_SNAPSHOT:
return srv.handle_RESTORE_RESTART_FROM_SNAPSHOT(req)
case rpcpb.Operation_RESTART_FROM_SNAPSHOT:
return srv.handle_RESTART_FROM_SNAPSHOT(req)
case rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA:
return srv.handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA()
case rpcpb.Operation_BLACKHOLE_PEER_PORT_TX_RX:
return srv.handle_BLACKHOLE_PEER_PORT_TX_RX(), nil
case rpcpb.Operation_UNBLACKHOLE_PEER_PORT_TX_RX:
return srv.handle_UNBLACKHOLE_PEER_PORT_TX_RX(), nil
case rpcpb.Operation_DELAY_PEER_PORT_TX_RX:
return srv.handle_DELAY_PEER_PORT_TX_RX(), nil
case rpcpb.Operation_UNDELAY_PEER_PORT_TX_RX:
return srv.handle_UNDELAY_PEER_PORT_TX_RX(), nil
default:
msg := fmt.Sprintf("operation not found (%v)", req.Operation)
return &rpcpb.Response{Success: false, Status: msg}, errors.New(msg)
}
}
// just archive the first file
func (srv *Server) createEtcdLogFile() error {
var err error
if srv.etcdLogFile, err = os.Create(srv.Member.Etcd.LogOutputs[0]); err != nil {
return err
}
srv.lg.Info("created etcd log file", zap.String("path", srv.Member.Etcd.LogOutputs[0]))
return nil
}
func (srv *Server) createEtcd(fromSnapshot bool, failpoints string) error {
if !fileutil.Exist(srv.Member.EtcdExec) {
return fmt.Errorf("unknown etcd exec path %q does not exist", srv.Member.EtcdExec)
}
etcdPath, etcdFlags := srv.Member.EtcdExec, srv.Member.Etcd.Flags()
if fromSnapshot {
etcdFlags = srv.Member.EtcdOnSnapshotRestore.Flags()
}
u, _ := url.Parse(srv.Member.FailpointHTTPAddr)
srv.lg.Info(
"creating etcd command",
zap.String("etcd-exec", etcdPath),
zap.Strings("etcd-flags", etcdFlags),
zap.String("GOFAIL_FAILPOINTS", failpoints),
zap.String("failpoint-http-addr", srv.Member.FailpointHTTPAddr),
zap.String("failpoint-addr", u.Host),
)
srv.etcdCmd = exec.Command(etcdPath, etcdFlags...)
srv.etcdCmd.Env = []string{"GOFAIL_HTTP=" + u.Host}
if failpoints != "" {
srv.etcdCmd.Env = append(srv.etcdCmd.Env, "GOFAIL_FAILPOINTS="+failpoints)
}
srv.etcdCmd.Stdout = srv.etcdLogFile
srv.etcdCmd.Stderr = srv.etcdLogFile
return nil
}
// start but do not wait for it to complete
func (srv *Server) runEtcd() error {
errc := make(chan error)
go func() {
time.Sleep(1 * time.Second)
// the server's advertised client/peer listeners must start
// before the proxy listener is set up
errc <- srv.startProxy()
}()
if srv.etcdCmd != nil {
srv.lg.Info(
"starting etcd command",
zap.String("command-path", srv.etcdCmd.Path),
)
err := srv.etcdCmd.Start()
srv.lg.Info(
"started etcd command",
zap.String("command-path", srv.etcdCmd.Path),
zap.Strings("command-args", srv.etcdCmd.Args),
zap.Strings("envs", srv.etcdCmd.Env),
zap.Error(err),
)
if err != nil {
return err
}
return <-errc
}
select {
case <-srv.etcdServer.Server.ReadyNotify():
srv.lg.Info("embedded etcd is ready")
case <-time.After(time.Minute):
srv.etcdServer.Close()
return fmt.Errorf("took too long to start %v", <-srv.etcdServer.Err())
}
return <-errc
}
// SIGQUIT to exit with stack trace
func (srv *Server) stopEtcd(sig os.Signal) error {
srv.stopProxy()
if srv.etcdCmd != nil {
srv.lg.Info(
"stopping etcd command",
zap.String("command-path", srv.etcdCmd.Path),
zap.String("signal", sig.String()),
)
if err := srv.etcdCmd.Process.Signal(sig); err != nil {
return err
}
errc := make(chan error)
go func() {
_, ew := srv.etcdCmd.Process.Wait()
errc <- ew
close(errc)
}()
select {
case <-time.After(5 * time.Second):
srv.etcdCmd.Process.Kill()
case e := <-errc:
return e
}
err := <-errc
srv.lg.Info(
"stopped etcd command",
zap.String("command-path", srv.etcdCmd.Path),
zap.String("signal", sig.String()),
zap.Error(err),
)
return err
}
srv.lg.Info("stopping embedded etcd")
srv.etcdServer.Server.HardStop()
srv.etcdServer.Close()
srv.lg.Info("stopped embedded etcd")
return nil
}
func (srv *Server) startProxy() error {
if srv.Member.EtcdClientProxy {
advertiseClientURL, advertiseClientURLPort, err := getURLAndPort(srv.Member.Etcd.AdvertiseClientURLs[0])
if err != nil {
return err
}
listenClientURL, _, err := getURLAndPort(srv.Member.Etcd.ListenClientURLs[0])
if err != nil {
return err
}
srv.lg.Info("Checking client target's connectivity", zap.String("target", listenClientURL.Host))
if err := checkTCPConnect(srv.lg, listenClientURL.Host); err != nil {
return fmt.Errorf("check client target failed, %w", err)
}
srv.lg.Info("starting proxy on client traffic", zap.String("url", advertiseClientURL.String()))
srv.advertiseClientPortToProxy[advertiseClientURLPort] = proxy.NewServer(proxy.ServerConfig{
Logger: srv.lg,
From: *advertiseClientURL,
To: *listenClientURL,
})
select {
case err = <-srv.advertiseClientPortToProxy[advertiseClientURLPort].Error():
srv.lg.Info("starting client proxy failed", zap.Error(err))
return err
case <-time.After(2 * time.Second):
srv.lg.Info("started proxy on client traffic", zap.String("url", advertiseClientURL.String()))
}
}
if srv.Member.EtcdPeerProxy {
advertisePeerURL, advertisePeerURLPort, err := getURLAndPort(srv.Member.Etcd.AdvertisePeerURLs[0])
if err != nil {
return err
}
listenPeerURL, _, err := getURLAndPort(srv.Member.Etcd.ListenPeerURLs[0])
if err != nil {
return err
}
srv.lg.Info("Checking peer target's connectivity", zap.String("target", listenPeerURL.Host))
if err := checkTCPConnect(srv.lg, listenPeerURL.Host); err != nil {
return fmt.Errorf("check peer target failed, %w", err)
}
srv.lg.Info("starting proxy on peer traffic", zap.String("url", advertisePeerURL.String()))
srv.advertisePeerPortToProxy[advertisePeerURLPort] = proxy.NewServer(proxy.ServerConfig{
Logger: srv.lg,
From: *advertisePeerURL,
To: *listenPeerURL,
})
select {
case err = <-srv.advertisePeerPortToProxy[advertisePeerURLPort].Error():
srv.lg.Info("starting peer proxy failed", zap.Error(err))
return err
case <-time.After(2 * time.Second):
srv.lg.Info("started proxy on peer traffic", zap.String("url", advertisePeerURL.String()))
}
}
return nil
}
func (srv *Server) stopProxy() {
if srv.Member.EtcdClientProxy && len(srv.advertiseClientPortToProxy) > 0 {
for port, px := range srv.advertiseClientPortToProxy {
if err := px.Close(); err != nil {
srv.lg.Warn("failed to close proxy", zap.Int("port", port))
continue
}
select {
case <-px.Done():
// enough time to release port
time.Sleep(time.Second)
case <-time.After(time.Second):
}
srv.lg.Info("closed proxy",
zap.Int("port", port),
zap.String("from", px.From()),
zap.String("to", px.To()),
)
}
srv.advertiseClientPortToProxy = make(map[int]proxy.Server)
}
if srv.Member.EtcdPeerProxy && len(srv.advertisePeerPortToProxy) > 0 {
for port, px := range srv.advertisePeerPortToProxy {
if err := px.Close(); err != nil {
srv.lg.Warn("failed to close proxy", zap.Int("port", port))
continue
}
select {
case <-px.Done():
// enough time to release port
time.Sleep(time.Second)
case <-time.After(time.Second):
}
srv.lg.Info("closed proxy",
zap.Int("port", port),
zap.String("from", px.From()),
zap.String("to", px.To()),
)
}
srv.advertisePeerPortToProxy = make(map[int]proxy.Server)
}
}
// if started with manual TLS, store TLS assets
// from the tester/client to disk before starting the etcd process
func (srv *Server) saveTLSAssets() error {
const defaultFileMode os.FileMode = 0644
if err := safeDataToFile(srv.Member.PeerCertPath, []byte(srv.Member.PeerCertData), defaultFileMode); err != nil {
return err
}
if err := safeDataToFile(srv.Member.PeerKeyPath, []byte(srv.Member.PeerKeyData), defaultFileMode); err != nil {
return err
}
if err := safeDataToFile(srv.Member.PeerTrustedCAPath, []byte(srv.Member.PeerTrustedCAData), defaultFileMode); err != nil {
return err
}
if srv.Member.PeerCertPath != "" &&
srv.Member.PeerKeyPath != "" &&
srv.Member.PeerTrustedCAPath != "" {
srv.lg.Info(
"wrote",
zap.String("peer-cert", srv.Member.PeerCertPath),
zap.String("peer-key", srv.Member.PeerKeyPath),
zap.String("peer-trusted-ca", srv.Member.PeerTrustedCAPath),
)
}
if err := safeDataToFile(srv.Member.ClientCertPath, []byte(srv.Member.ClientCertData), defaultFileMode); err != nil {
return err
}
if err := safeDataToFile(srv.Member.ClientKeyPath, []byte(srv.Member.ClientKeyData), defaultFileMode); err != nil {
return err
}
if err := safeDataToFile(srv.Member.ClientTrustedCAPath, []byte(srv.Member.ClientTrustedCAData), defaultFileMode); err != nil {
return err
}
if srv.Member.ClientCertPath != "" &&
srv.Member.ClientKeyPath != "" &&
srv.Member.ClientTrustedCAPath != "" {
srv.lg.Info(
"wrote",
zap.String("client-cert", srv.Member.ClientCertPath),
zap.String("client-key", srv.Member.ClientKeyPath),
zap.String("client-trusted-ca", srv.Member.ClientTrustedCAPath),
)
}
return nil
}
func (srv *Server) loadAutoTLSAssets() error {
if srv.Member.Etcd.PeerAutoTLS {
// in case of slow disk
time.Sleep(time.Second)
fdir := filepath.Join(srv.Member.Etcd.DataDir, "fixtures", "peer")
srv.lg.Info(
"loading peer auto TLS assets",
zap.String("dir", fdir),
zap.String("endpoint", srv.EtcdClientEndpoint),
)
// load peer cert.pem
certPath := filepath.Join(fdir, "cert.pem")
certData, err := loadFileData(certPath)
if err != nil {
return err
}
srv.Member.PeerCertData = string(certData)
// load peer key.pem
keyPath := filepath.Join(fdir, "key.pem")
keyData, err := loadFileData(keyPath)
if err != nil {
return err
}
srv.Member.PeerKeyData = string(keyData)
srv.lg.Info(
"loaded peer auto TLS assets",
zap.String("peer-cert-path", certPath),
zap.Int("peer-cert-length", len(certData)),
zap.String("peer-key-path", keyPath),
zap.Int("peer-key-length", len(keyData)),
)
}
if srv.Member.Etcd.ClientAutoTLS {
// in case of slow disk
time.Sleep(time.Second)
fdir := filepath.Join(srv.Member.Etcd.DataDir, "fixtures", "client")
srv.lg.Info(
"loading client TLS assets",
zap.String("dir", fdir),
zap.String("endpoint", srv.EtcdClientEndpoint),
)
// load client cert.pem
certPath := filepath.Join(fdir, "cert.pem")
certData, err := loadFileData(certPath)
if err != nil {
return err
}
srv.Member.ClientCertData = string(certData)
// load client key.pem
keyPath := filepath.Join(fdir, "key.pem")
keyData, err := loadFileData(keyPath)
if err != nil {
return err
}
srv.Member.ClientKeyData = string(keyData)
srv.lg.Info(
"loaded client TLS assets",
zap.String("client-cert-path", certPath),
zap.Int("client-cert-length", len(certData)),
zap.String("client-key-path", keyPath),
zap.Int("client-key-length", len(keyData)),
)
}
return nil
}
func (srv *Server) handle_INITIAL_START_ETCD(req *rpcpb.Request) (*rpcpb.Response, error) {
if srv.last != rpcpb.Operation_NOT_STARTED {
return &rpcpb.Response{
Success: false,
Status: fmt.Sprintf("%q is not valid; last server operation was %q", rpcpb.Operation_INITIAL_START_ETCD.String(), srv.last.String()),
Member: req.Member,
}, nil
}
if err := fileutil.TouchDirAll(srv.lg, srv.Member.BaseDir); err != nil {
return nil, err
}
srv.lg.Info("created base directory", zap.String("path", srv.Member.BaseDir))
if srv.etcdServer == nil {
if err := srv.createEtcdLogFile(); err != nil {
return nil, err
}
}
if err := srv.saveTLSAssets(); err != nil {
return nil, err
}
if err := srv.createEtcd(false, req.Member.Failpoints); err != nil {
return nil, err
}
if err := srv.runEtcd(); err != nil {
return nil, err
}
if err := srv.loadAutoTLSAssets(); err != nil {
return nil, err
}
return &rpcpb.Response{
Success: true,
Status: "start etcd PASS",
Member: srv.Member,
}, nil
}
func (srv *Server) handle_RESTART_ETCD(req *rpcpb.Request) (*rpcpb.Response, error) {
var err error
if !fileutil.Exist(srv.Member.BaseDir) {
if err = fileutil.TouchDirAll(srv.lg, srv.Member.BaseDir); err != nil {
return nil, err
}
}
if err = srv.saveTLSAssets(); err != nil {
return nil, err
}
if err = srv.createEtcd(false, req.Member.Failpoints); err != nil {
return nil, err
}
if err = srv.runEtcd(); err != nil {
return nil, err
}
if err = srv.loadAutoTLSAssets(); err != nil {
return nil, err
}
return &rpcpb.Response{
Success: true,
Status: "restart etcd PASS",
Member: srv.Member,
}, nil
}
func (srv *Server) handle_SIGTERM_ETCD() (*rpcpb.Response, error) {
if err := srv.stopEtcd(syscall.SIGTERM); err != nil {
return nil, err
}
if srv.etcdServer != nil {
srv.etcdServer.GetLogger().Sync()
} else {
srv.etcdLogFile.Sync()
}
return &rpcpb.Response{
Success: true,
Status: "killed etcd",
}, nil
}
func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA() (*rpcpb.Response, error) {
if err := srv.stopEtcd(syscall.SIGQUIT); err != nil {
return nil, err
}
if srv.etcdServer != nil {
srv.etcdServer.GetLogger().Sync()
} else {
srv.etcdLogFile.Sync()
srv.etcdLogFile.Close()
}
// for debugging purposes, rename instead of removing
if err := os.RemoveAll(srv.Member.BaseDir + ".backup"); err != nil {
return nil, err
}
if err := os.Rename(srv.Member.BaseDir, srv.Member.BaseDir+".backup"); err != nil {
return nil, err
}
srv.lg.Info(
"renamed",
zap.String("base-dir", srv.Member.BaseDir),
zap.String("new-dir", srv.Member.BaseDir+".backup"),
)
// create a new log file for next new member restart
if !fileutil.Exist(srv.Member.BaseDir) {
if err := fileutil.TouchDirAll(srv.lg, srv.Member.BaseDir); err != nil {
return nil, err
}
}
return &rpcpb.Response{
Success: true,
Status: "killed etcd and removed base directory",
}, nil
}
func (srv *Server) handle_SAVE_SNAPSHOT() (*rpcpb.Response, error) {
if err := srv.Member.SaveSnapshot(srv.lg); err != nil {
return nil, err
}
return &rpcpb.Response{
Success: true,
Status: "saved snapshot",
SnapshotInfo: srv.Member.SnapshotInfo,
}, nil
}
func (srv *Server) handle_RESTORE_RESTART_FROM_SNAPSHOT(req *rpcpb.Request) (resp *rpcpb.Response, err error) {
if err = srv.Member.RestoreSnapshot(srv.lg); err != nil {
return nil, err
}
resp, err = srv.handle_RESTART_FROM_SNAPSHOT(req)
if resp != nil && err == nil {
resp.Status = "restored snapshot and " + resp.Status
}
return resp, err
}
func (srv *Server) handle_RESTART_FROM_SNAPSHOT(req *rpcpb.Request) (resp *rpcpb.Response, err error) {
if err = srv.saveTLSAssets(); err != nil {
return nil, err
}
if err = srv.createEtcd(true, req.Member.Failpoints); err != nil {
return nil, err
}
if err = srv.runEtcd(); err != nil {
return nil, err
}
if err = srv.loadAutoTLSAssets(); err != nil {
return nil, err
}
return &rpcpb.Response{
Success: true,
Status: "restarted etcd from snapshot",
SnapshotInfo: srv.Member.SnapshotInfo,
}, nil
}
func (srv *Server) handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA() (*rpcpb.Response, error) {
if err := srv.stopEtcd(syscall.SIGQUIT); err != nil {
return nil, err
}
if srv.etcdServer != nil {
srv.etcdServer.GetLogger().Sync()
} else {
srv.etcdLogFile.Sync()
srv.etcdLogFile.Close()
}
// TODO: support separate WAL directory
if err := archive(srv.lg, srv.Member.BaseDir, srv.Member.Etcd.LogOutputs[0], srv.Member.Etcd.DataDir); err != nil {
return nil, err
}
srv.lg.Info("archived data", zap.String("base-dir", srv.Member.BaseDir))
if srv.etcdServer == nil {
if err := srv.createEtcdLogFile(); err != nil {
return nil, err
}
}
// TODO: Verify whether this cleaning of 'cache pages' is needed.
srv.lg.Info("cleaning up page cache")
if err := cleanPageCache(); err != nil {
srv.lg.Warn("failed to clean up page cache", zap.String("error", err.Error()))
}
srv.lg.Info("cleaned up page cache")
return &rpcpb.Response{
Success: true,
Status: "cleaned up etcd",
}, nil
}
func (srv *Server) handle_BLACKHOLE_PEER_PORT_TX_RX() *rpcpb.Response {
for port, px := range srv.advertisePeerPortToProxy {
srv.lg.Info("blackholing", zap.Int("peer-port", port))
px.BlackholeTx()
px.BlackholeRx()
srv.lg.Info("blackholed", zap.Int("peer-port", port))
}
return &rpcpb.Response{
Success: true,
Status: "blackholed peer port tx/rx",
}
}
func (srv *Server) handle_UNBLACKHOLE_PEER_PORT_TX_RX() *rpcpb.Response {
for port, px := range srv.advertisePeerPortToProxy {
srv.lg.Info("unblackholing", zap.Int("peer-port", port))
px.UnblackholeTx()
px.UnblackholeRx()
srv.lg.Info("unblackholed", zap.Int("peer-port", port))
}
return &rpcpb.Response{
Success: true,
Status: "unblackholed peer port tx/rx",
}
}
func (srv *Server) handle_DELAY_PEER_PORT_TX_RX() *rpcpb.Response {
lat := time.Duration(srv.Tester.UpdatedDelayLatencyMs) * time.Millisecond
rv := time.Duration(srv.Tester.DelayLatencyMsRv) * time.Millisecond
for port, px := range srv.advertisePeerPortToProxy {
srv.lg.Info("delaying",
zap.Int("peer-port", port),
zap.Duration("latency", lat),
zap.Duration("random-variable", rv),
)
px.DelayTx(lat, rv)
px.DelayRx(lat, rv)
srv.lg.Info("delayed",
zap.Int("peer-port", port),
zap.Duration("latency", lat),
zap.Duration("random-variable", rv),
)
}
return &rpcpb.Response{
Success: true,
Status: "delayed peer port tx/rx",
}
}
func (srv *Server) handle_UNDELAY_PEER_PORT_TX_RX() *rpcpb.Response {
for port, px := range srv.advertisePeerPortToProxy {
srv.lg.Info("undelaying", zap.Int("peer-port", port))
px.UndelayTx()
px.UndelayRx()
srv.lg.Info("undelayed", zap.Int("peer-port", port))
}
return &rpcpb.Response{
Success: true,
Status: "undelayed peer port tx/rx",
}
}


@@ -1,162 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package agent
import (
"math"
"net"
"os"
"os/exec"
"go.etcd.io/etcd/pkg/v3/proxy"
"go.etcd.io/etcd/server/v3/embed"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
"google.golang.org/grpc"
)
// Server implements "rpcpb.TransportServer"
// and other etcd operations as an agent.
// No need to lock fields, since request operations are
// serialized on the tester side.
type Server struct {
lg *zap.Logger
grpcServer *grpc.Server
network string
address string
ln net.Listener
rpcpb.TransportServer
last rpcpb.Operation
*rpcpb.Member
*rpcpb.Tester
etcdServer *embed.Etcd
etcdCmd *exec.Cmd
etcdLogFile *os.File
// forward incoming advertise URLs traffic to listen URLs
advertiseClientPortToProxy map[int]proxy.Server
advertisePeerPortToProxy map[int]proxy.Server
}
// NewServer returns a new agent server.
func NewServer(
lg *zap.Logger,
network string,
address string,
) *Server {
return &Server{
lg: lg,
network: network,
address: address,
last: rpcpb.Operation_NOT_STARTED,
advertiseClientPortToProxy: make(map[int]proxy.Server),
advertisePeerPortToProxy: make(map[int]proxy.Server),
}
}
const (
maxRequestBytes = 1.5 * 1024 * 1024
grpcOverheadBytes = 512 * 1024
maxStreams = math.MaxUint32
maxSendBytes = math.MaxInt32
)
// StartServe starts serving agent server.
func (srv *Server) StartServe() error {
var err error
srv.ln, err = net.Listen(srv.network, srv.address)
if err != nil {
return err
}
var opts []grpc.ServerOption
opts = append(opts, grpc.MaxRecvMsgSize(int(maxRequestBytes+grpcOverheadBytes)))
opts = append(opts, grpc.MaxSendMsgSize(maxSendBytes))
opts = append(opts, grpc.MaxConcurrentStreams(maxStreams))
srv.grpcServer = grpc.NewServer(opts...)
rpcpb.RegisterTransportServer(srv.grpcServer, srv)
srv.lg.Info(
"gRPC server started",
zap.String("address", srv.address),
zap.String("listener-address", srv.ln.Addr().String()),
)
err = srv.grpcServer.Serve(srv.ln)
if err != nil {
srv.lg.Warn(
"gRPC server is stopped with error",
zap.String("address", srv.address),
zap.Error(err),
)
} else {
srv.lg.Info(
"gRPC server is stopped",
zap.String("address", srv.address),
)
}
return err
}
// Stop stops serving gRPC server.
func (srv *Server) Stop() {
srv.lg.Info("gRPC server stopping", zap.String("address", srv.address))
srv.grpcServer.Stop()
srv.lg.Info("gRPC server stopped", zap.String("address", srv.address))
}
// Transport communicates with etcd tester.
func (srv *Server) Transport(stream rpcpb.Transport_TransportServer) (reterr error) {
errc := make(chan error, 1)
go func() {
for {
var req *rpcpb.Request
var err error
req, err = stream.Recv()
if err != nil {
errc <- err
// TODO: handle error and retry
return
}
var resp *rpcpb.Response
resp, err = srv.handleTesterRequest(req)
if err != nil {
errc <- err
// TODO: handle error and retry
return
}
if err = stream.Send(resp); err != nil {
errc <- err
// TODO: handle error and retry
return
}
}
}()
select {
case reterr = <-errc:
case <-stream.Context().Done():
reterr = stream.Context().Err()
}
return reterr
}


@@ -1,151 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package agent
import (
"fmt"
"io"
"net"
"net/url"
"os"
"os/exec"
"path/filepath"
"strconv"
"time"
"go.etcd.io/etcd/client/pkg/v3/fileutil"
"go.uber.org/zap"
)
// TODO: support separate WAL directory
func archive(lg *zap.Logger, baseDir, etcdLogPath, dataDir string) error {
dir := filepath.Join(baseDir, "etcd-failure-archive", time.Now().Format(time.RFC3339))
if existDir(dir) {
dir = filepath.Join(baseDir, "etcd-failure-archive", time.Now().Add(time.Second).Format(time.RFC3339))
}
if err := fileutil.TouchDirAll(lg, dir); err != nil {
return err
}
dst := filepath.Join(dir, "etcd.log")
if err := copyFile(etcdLogPath, dst); err != nil {
if !os.IsNotExist(err) {
return err
}
}
if err := os.Rename(dataDir, filepath.Join(dir, filepath.Base(dataDir))); err != nil {
if !os.IsNotExist(err) {
return err
}
}
return nil
}
func existDir(fpath string) bool {
st, err := os.Stat(fpath)
if err != nil {
if os.IsNotExist(err) {
return false
}
} else {
return st.IsDir()
}
return false
}
func getURLAndPort(addr string) (urlAddr *url.URL, port int, err error) {
urlAddr, err = url.Parse(addr)
if err != nil {
return nil, -1, err
}
var s string
_, s, err = net.SplitHostPort(urlAddr.Host)
if err != nil {
return nil, -1, err
}
port, err = strconv.Atoi(s)
if err != nil {
return nil, -1, err
}
return urlAddr, port, err
}
func copyFile(src, dst string) error {
f, err := os.Open(src)
if err != nil {
return err
}
defer f.Close()
w, err := os.Create(dst)
if err != nil {
return err
}
defer w.Close()
if _, err = io.Copy(w, f); err != nil {
return err
}
return w.Sync()
}
func safeDataToFile(filePath string, fileData []byte, mode os.FileMode) error {
if filePath != "" {
if len(fileData) == 0 {
return fmt.Errorf("got empty data for %q", filePath)
}
if err := os.WriteFile(filePath, fileData, mode); err != nil {
return fmt.Errorf("writing file %q failed, %w", filePath, err)
}
}
return nil
}
func loadFileData(filePath string) ([]byte, error) {
if !fileutil.Exist(filePath) {
return nil, fmt.Errorf("cannot find %q", filePath)
}
data, err := os.ReadFile(filePath)
if err != nil {
return nil, fmt.Errorf("read file %q failed, %w", filePath, err)
}
return data, nil
}
func checkTCPConnect(lg *zap.Logger, target string) error {
for i := 0; i < 10; i++ {
if conn, err := net.Dial("tcp", target); err != nil {
lg.Error("The target isn't reachable", zap.Int("retries", i), zap.String("target", target), zap.Error(err))
} else {
if conn != nil {
conn.Close()
lg.Info("The target is reachable", zap.Int("retries", i), zap.String("target", target))
return nil
}
lg.Error("The target isn't reachable due to the returned conn is nil", zap.Int("retries", i), zap.String("target", target))
}
time.Sleep(time.Second)
}
return fmt.Errorf("timed out waiting for the target (%s) to be reachable", target)
}
func cleanPageCache() error {
// https://www.kernel.org/doc/Documentation/sysctl/vm.txt
// https://github.com/torvalds/linux/blob/master/fs/drop_caches.c
cmd := exec.Command("/bin/sh", "-c", `echo "echo 1 > /proc/sys/vm/drop_caches" | sudo -s -n`)
return cmd.Run()
}


@@ -1,36 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package agent
import (
"net/url"
"reflect"
"testing"
)
func TestGetURLAndPort(t *testing.T) {
addr := "https://127.0.0.1:2379"
urlAddr, port, err := getURLAndPort(addr)
if err != nil {
t.Fatal(err)
}
exp := &url.URL{Scheme: "https", Host: "127.0.0.1:2379"}
if !reflect.DeepEqual(urlAddr, exp) {
t.Fatalf("expected %+v, got %+v", exp, urlAddr)
}
if port != 2379 {
t.Fatalf("port expected 2379, got %d", port)
}
}


@@ -1,16 +0,0 @@
#!/usr/bin/env bash
if ! [[ "$0" =~ "tests/functional/build" ]]; then
echo "must be run from repository root"
exit 255
fi
outdir="${BINDIR:-../bin}"
(
cd ./tests
CGO_ENABLED=0 go build -trimpath -v -installsuffix cgo -ldflags "-s -w" -o "${outdir}/etcd-agent" ./functional/cmd/etcd-agent
CGO_ENABLED=0 go build -trimpath -v -installsuffix cgo -ldflags "-s -w" -o "${outdir}/etcd-proxy" ./functional/cmd/etcd-proxy
CGO_ENABLED=0 go build -trimpath -v -installsuffix cgo -ldflags "-s -w" -o "${outdir}/etcd-runner" ./functional/cmd/etcd-runner
CGO_ENABLED=0 go test -v -installsuffix cgo -ldflags "-s -w" -c -o "${outdir}/etcd-tester" ./functional/cmd/etcd-tester
)


@@ -1,49 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// etcd-agent is a program that runs the functional-tester agent.
package main
import (
"flag"
"go.uber.org/zap/zapcore"
"go.etcd.io/etcd/tests/v3/functional/agent"
"go.uber.org/zap"
)
var logger *zap.Logger
func main() {
network := flag.String("network", "tcp", "network to serve agent server")
address := flag.String("address", "127.0.0.1:9027", "address to serve agent server")
flag.Parse()
lcfg := zap.NewDevelopmentConfig()
lcfg.Level = zap.NewAtomicLevelAt(zapcore.InfoLevel)
logger, err := lcfg.Build()
if err != nil {
panic(err)
}
logger = logger.Named("agent").With(zap.String("address", *address))
defer logger.Sync()
srv := agent.NewServer(logger, *network, *address)
err = srv.StartServe()
logger.Info("agent exiting", zap.Error(err))
}


@@ -1,233 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// etcd-proxy is a proxy layer that simulates various network conditions.
package main
import (
"context"
"flag"
"fmt"
"io"
"log"
"net/http"
"net/url"
"os"
"os/signal"
"syscall"
"time"
"go.etcd.io/etcd/pkg/v3/proxy"
"go.uber.org/zap"
)
var from string
var to string
var httpPort int
var verbose bool
func main() {
// TODO: support TLS
flag.StringVar(&from, "from", "localhost:23790", "Address URL to proxy from.")
flag.StringVar(&to, "to", "localhost:2379", "Address URL to forward.")
flag.IntVar(&httpPort, "http-port", 2378, "Port to serve etcd-proxy API.")
flag.BoolVar(&verbose, "verbose", false, "'true' to run proxy in verbose mode.")
flag.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage of %q:\n", os.Args[0])
fmt.Fprintln(os.Stderr, `
etcd-proxy simulates various network conditions for etcd testing purposes.
See README.md for more examples.
Example:
# build etcd
$ ./scripts/build.sh
$ ./bin/etcd
# build etcd-proxy
$ make build-etcd-proxy
# to test etcd with proxy layer
$ ./bin/etcd-proxy --help
$ ./bin/etcd-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose
$ ./bin/etcdctl --endpoints localhost:2379 put foo bar
$ ./bin/etcdctl --endpoints localhost:23790 put foo bar`)
flag.PrintDefaults()
}
flag.Parse()
cfg := proxy.ServerConfig{
From: url.URL{Scheme: "tcp", Host: from},
To: url.URL{Scheme: "tcp", Host: to},
}
if verbose {
var err error
cfg.Logger, err = zap.NewDevelopment()
if err != nil {
panic(err)
}
cfg.Logger = cfg.Logger.Named("proxy").With(
zap.String("from", from),
zap.String("to", to),
zap.Int("port", httpPort))
}
p := proxy.NewServer(cfg)
select {
case <-p.Ready():
case err := <-p.Error():
panic(err)
}
defer p.Close()
mux := http.NewServeMux()
mux.HandleFunc("/", func(w http.ResponseWriter, req *http.Request) {
w.Write([]byte(fmt.Sprintf("proxying [%s -> %s]\n", p.From(), p.To())))
})
mux.HandleFunc("/delay-tx", func(w http.ResponseWriter, req *http.Request) {
switch req.Method {
case http.MethodGet:
w.Write([]byte(fmt.Sprintf("current send latency %v\n", p.LatencyTx())))
case http.MethodPut, http.MethodPost:
if err := req.ParseForm(); err != nil {
w.Write([]byte(fmt.Sprintf("wrong form %q\n", err.Error())))
return
}
lat, err := time.ParseDuration(req.PostForm.Get("latency"))
if err != nil {
w.Write([]byte(fmt.Sprintf("wrong latency form %q\n", err.Error())))
return
}
rv, err := time.ParseDuration(req.PostForm.Get("random-variable"))
if err != nil {
w.Write([]byte(fmt.Sprintf("wrong random-variable form %q\n", err.Error())))
return
}
p.DelayTx(lat, rv)
w.Write([]byte(fmt.Sprintf("added send latency %v±%v (current latency %v)\n", lat, rv, p.LatencyTx())))
case http.MethodDelete:
lat := p.LatencyTx()
p.UndelayTx()
w.Write([]byte(fmt.Sprintf("removed latency %v\n", lat)))
default:
w.Write([]byte(fmt.Sprintf("unsupported method %q\n", req.Method)))
}
})
mux.HandleFunc("/delay-rx", func(w http.ResponseWriter, req *http.Request) {
switch req.Method {
case http.MethodGet:
w.Write([]byte(fmt.Sprintf("current receive latency %v\n", p.LatencyRx())))
case http.MethodPut, http.MethodPost:
if err := req.ParseForm(); err != nil {
w.Write([]byte(fmt.Sprintf("wrong form %q\n", err.Error())))
return
}
lat, err := time.ParseDuration(req.PostForm.Get("latency"))
if err != nil {
w.Write([]byte(fmt.Sprintf("wrong latency form %q\n", err.Error())))
return
}
rv, err := time.ParseDuration(req.PostForm.Get("random-variable"))
if err != nil {
w.Write([]byte(fmt.Sprintf("wrong random-variable form %q\n", err.Error())))
return
}
p.DelayRx(lat, rv)
w.Write([]byte(fmt.Sprintf("added receive latency %v±%v (current latency %v)\n", lat, rv, p.LatencyRx())))
case http.MethodDelete:
lat := p.LatencyRx()
p.UndelayRx()
w.Write([]byte(fmt.Sprintf("removed latency %v\n", lat)))
default:
w.Write([]byte(fmt.Sprintf("unsupported method %q\n", req.Method)))
}
})
mux.HandleFunc("/pause-tx", func(w http.ResponseWriter, req *http.Request) {
switch req.Method {
case http.MethodPut, http.MethodPost:
p.PauseTx()
w.Write([]byte(fmt.Sprintf("paused forwarding [%s -> %s]\n", p.From(), p.To())))
case http.MethodDelete:
p.UnpauseTx()
w.Write([]byte(fmt.Sprintf("unpaused forwarding [%s -> %s]\n", p.From(), p.To())))
default:
w.Write([]byte(fmt.Sprintf("unsupported method %q\n", req.Method)))
}
})
mux.HandleFunc("/pause-rx", func(w http.ResponseWriter, req *http.Request) {
switch req.Method {
case http.MethodPut, http.MethodPost:
p.PauseRx()
w.Write([]byte(fmt.Sprintf("paused forwarding [%s <- %s]\n", p.From(), p.To())))
case http.MethodDelete:
p.UnpauseRx()
w.Write([]byte(fmt.Sprintf("unpaused forwarding [%s <- %s]\n", p.From(), p.To())))
default:
w.Write([]byte(fmt.Sprintf("unsupported method %q\n", req.Method)))
}
})
mux.HandleFunc("/blackhole-tx", func(w http.ResponseWriter, req *http.Request) {
switch req.Method {
case http.MethodPut, http.MethodPost:
p.BlackholeTx()
w.Write([]byte(fmt.Sprintf("blackholed; dropping packets [%s -> %s]\n", p.From(), p.To())))
case http.MethodDelete:
p.UnblackholeTx()
w.Write([]byte(fmt.Sprintf("unblackholed; restart forwarding [%s -> %s]\n", p.From(), p.To())))
default:
w.Write([]byte(fmt.Sprintf("unsupported method %q\n", req.Method)))
}
})
mux.HandleFunc("/blackhole-rx", func(w http.ResponseWriter, req *http.Request) {
switch req.Method {
case http.MethodPut, http.MethodPost:
p.BlackholeRx()
w.Write([]byte(fmt.Sprintf("blackholed; dropping packets [%s <- %s]\n", p.From(), p.To())))
case http.MethodDelete:
p.UnblackholeRx()
w.Write([]byte(fmt.Sprintf("unblackholed; restart forwarding [%s <- %s]\n", p.From(), p.To())))
default:
w.Write([]byte(fmt.Sprintf("unsupported method %q\n", req.Method)))
}
})
srv := &http.Server{
Addr: fmt.Sprintf(":%d", httpPort),
Handler: mux,
ErrorLog: log.New(io.Discard, "net/http", 0),
}
defer srv.Close()
sig := make(chan os.Signal, 1)
signal.Notify(sig, os.Interrupt, syscall.SIGTERM)
defer signal.Stop(sig)
go func() {
s := <-sig
fmt.Printf("\n\nreceived signal %q, shutting down HTTP server\n\n", s)
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
err := srv.Shutdown(ctx)
cancel()
fmt.Printf("gracefully stopped HTTP server with %v\n\n", err)
os.Exit(0)
}()
fmt.Printf("\nserving HTTP server http://localhost:%d\n\n", httpPort)
err := srv.ListenAndServe()
fmt.Printf("HTTP server exit with error %v\n", err)
}


@@ -1,23 +0,0 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// etcd-runner is a program for testing etcd clientv3 features
// against a fault-injected cluster.
package main
import "go.etcd.io/etcd/tests/v3/functional/runner"
func main() {
runner.Start()
}


@@ -1,55 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// etcd-tester is a program that runs the functional-tester client.
package main
import (
"flag"
"testing"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"go.uber.org/zap/zaptest"
"go.etcd.io/etcd/client/pkg/v3/testutil"
"go.etcd.io/etcd/tests/v3/functional/tester"
)
var config = flag.String("config", "../../functional.yaml", "path to tester configuration")
func TestFunctional(t *testing.T) {
testutil.SkipTestIfShortMode(t, "functional tests are skipped in --short mode")
lg := zaptest.NewLogger(t, zaptest.Level(zapcore.InfoLevel)).Named("tester")
clus, err := tester.NewCluster(lg, *config)
if err != nil {
t.Fatalf("failed to create a cluster: %v", err)
}
if err = clus.Send_INITIAL_START_ETCD(); err != nil {
t.Fatal("Bootstrap failed", zap.Error(err))
}
t.Log("wait health after bootstrap")
if err = clus.WaitHealth(); err != nil {
t.Fatal("WaitHealth failed", zap.Error(err))
}
if err := clus.Run(t); err == nil {
// Only stop etcd and cleanup data when test is successful.
clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA()
}
}


@ -1,259 +0,0 @@
agent-configs:
- etcd-exec: ./bin/etcd
agent-addr: 127.0.0.1:19027
failpoint-http-addr: http://127.0.0.1:7381
base-dir: /tmp/etcd-functional-1
etcd-client-proxy: false
etcd-peer-proxy: true
etcd-client-endpoint: 127.0.0.1:1379
etcd:
name: s1
data-dir: /tmp/etcd-functional-1/etcd.data
wal-dir: /tmp/etcd-functional-1/etcd.data/member/wal
heartbeat-interval: 100
election-timeout: 1000
listen-client-urls: ["https://127.0.0.1:1379"]
advertise-client-urls: ["https://127.0.0.1:1379"]
auto-tls: true
client-cert-auth: false
cert-file: ""
key-file: ""
trusted-ca-file: ""
listen-peer-urls: ["https://127.0.0.1:1380"]
initial-advertise-peer-urls: ["https://127.0.0.1:1381"]
peer-auto-tls: true
peer-client-cert-auth: false
peer-cert-file: ""
peer-key-file: ""
peer-trusted-ca-file: ""
initial-cluster: s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381
initial-cluster-state: new
initial-cluster-token: tkn
snapshot-count: 2000
quota-backend-bytes: 10740000000 # 10 GiB
pre-vote: true
initial-corrupt-check: true
logger: zap
log-outputs: [/tmp/etcd-functional-1/etcd.log]
log-level: info
socket-reuse-address: true
socket-reuse-port: true
client-cert-data: ""
client-cert-path: ""
client-key-data: ""
client-key-path: ""
client-trusted-ca-data: ""
client-trusted-ca-path: ""
peer-cert-data: ""
peer-cert-path: ""
peer-key-data: ""
peer-key-path: ""
peer-trusted-ca-data: ""
peer-trusted-ca-path: ""
snapshot-path: /tmp/etcd-functional-1.snapshot.db
- etcd-exec: ./bin/etcd
agent-addr: 127.0.0.1:29027
failpoint-http-addr: http://127.0.0.1:7382
base-dir: /tmp/etcd-functional-2
etcd-client-proxy: false
etcd-peer-proxy: true
etcd-client-endpoint: 127.0.0.1:2379
etcd:
name: s2
data-dir: /tmp/etcd-functional-2/etcd.data
wal-dir: /tmp/etcd-functional-2/etcd.data/member/wal
heartbeat-interval: 100
election-timeout: 1000
listen-client-urls: ["https://127.0.0.1:2379"]
advertise-client-urls: ["https://127.0.0.1:2379"]
auto-tls: true
client-cert-auth: false
cert-file: ""
key-file: ""
trusted-ca-file: ""
listen-peer-urls: ["https://127.0.0.1:2380"]
initial-advertise-peer-urls: ["https://127.0.0.1:2381"]
peer-auto-tls: true
peer-client-cert-auth: false
peer-cert-file: ""
peer-key-file: ""
peer-trusted-ca-file: ""
initial-cluster: s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381
initial-cluster-state: new
initial-cluster-token: tkn
snapshot-count: 2000
quota-backend-bytes: 10740000000 # 10 GiB
pre-vote: true
initial-corrupt-check: true
logger: zap
log-outputs: [/tmp/etcd-functional-2/etcd.log]
log-level: info
socket-reuse-address: true
socket-reuse-port: true
client-cert-data: ""
client-cert-path: ""
client-key-data: ""
client-key-path: ""
client-trusted-ca-data: ""
client-trusted-ca-path: ""
peer-cert-data: ""
peer-cert-path: ""
peer-key-data: ""
peer-key-path: ""
peer-trusted-ca-data: ""
peer-trusted-ca-path: ""
snapshot-path: /tmp/etcd-functional-2.snapshot.db
- etcd-exec: ./bin/etcd
agent-addr: 127.0.0.1:39027
failpoint-http-addr: http://127.0.0.1:7383
base-dir: /tmp/etcd-functional-3
etcd-client-proxy: false
etcd-peer-proxy: true
etcd-client-endpoint: 127.0.0.1:3379
etcd:
name: s3
data-dir: /tmp/etcd-functional-3/etcd.data
wal-dir: /tmp/etcd-functional-3/etcd.data/member/wal
heartbeat-interval: 100
election-timeout: 1000
listen-client-urls: ["https://127.0.0.1:3379"]
advertise-client-urls: ["https://127.0.0.1:3379"]
auto-tls: true
client-cert-auth: false
cert-file: ""
key-file: ""
trusted-ca-file: ""
listen-peer-urls: ["https://127.0.0.1:3380"]
initial-advertise-peer-urls: ["https://127.0.0.1:3381"]
peer-auto-tls: true
peer-client-cert-auth: false
peer-cert-file: ""
peer-key-file: ""
peer-trusted-ca-file: ""
initial-cluster: s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381
initial-cluster-state: new
initial-cluster-token: tkn
snapshot-count: 2000
quota-backend-bytes: 10740000000 # 10 GiB
pre-vote: true
initial-corrupt-check: true
logger: zap
log-outputs: [/tmp/etcd-functional-3/etcd.log]
log-level: info
socket-reuse-address: true
socket-reuse-port: true
client-cert-data: ""
client-cert-path: ""
client-key-data: ""
client-key-path: ""
client-trusted-ca-data: ""
client-trusted-ca-path: ""
peer-cert-data: ""
peer-cert-path: ""
peer-key-data: ""
peer-key-path: ""
peer-trusted-ca-data: ""
peer-trusted-ca-path: ""
snapshot-path: /tmp/etcd-functional-3.snapshot.db
tester-config:
data-dir: /tmp/etcd-tester-data
network: tcp
addr: 127.0.0.1:9028
# slow enough to trigger election
delay-latency-ms: 5000
delay-latency-ms-rv: 500
round-limit: 1
exit-on-failure: true
enable-pprof: true
case-delay-ms: 7000
case-shuffle: true
# For full descriptions,
# https://pkg.go.dev/go.etcd.io/etcd/tests/v3/functional/rpcpb#Case
cases:
- SIGTERM_ONE_FOLLOWER
- SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_LEADER
- SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_QUORUM
- SIGTERM_ALL
- SIGQUIT_AND_REMOVE_ONE_FOLLOWER
- SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- BLACKHOLE_PEER_PORT_TX_RX_LEADER
- BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
- BLACKHOLE_PEER_PORT_TX_RX_QUORUM
- BLACKHOLE_PEER_PORT_TX_RX_ALL
- DELAY_PEER_PORT_TX_RX_LEADER
- RANDOM_DELAY_PEER_PORT_TX_RX_LEADER
- DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
- RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
- DELAY_PEER_PORT_TX_RX_QUORUM
- RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM
- DELAY_PEER_PORT_TX_RX_ALL
- RANDOM_DELAY_PEER_PORT_TX_RX_ALL
- NO_FAIL_WITH_STRESS
- NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS
# - FAILPOINTS_WITH_DISK_IO_LATENCY
# TODO: use iptables for discarding outbound rafthttp traffic to peer port
# - BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER
# - BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
# - DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
# - RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
# - DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
# - RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
# - SIGQUIT_AND_REMOVE_LEADER
# - SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT
# - SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH
failpoint-commands:
- panic("etcd-tester")
# - panic("etcd-tester"),1*sleep(1000)
# - sleep(3000)
runner-exec-path: ./bin/etcd-runner
external-exec-path: ""
# make up ±70% of workloads with writes
stressers:
- type: KV_WRITE_SMALL
weight: 0.35
- type: KV_WRITE_LARGE
weight: 0.002
- type: KV_READ_ONE_KEY
weight: 0.07
- type: KV_READ_RANGE
weight: 0.07
- type: KV_DELETE_ONE_KEY
weight: 0.07
- type: KV_DELETE_RANGE
weight: 0.07
- type: KV_TXN_WRITE_DELETE
weight: 0.35
- type: LEASE
weight: 0.0
# - ELECTION_RUNNER
# - WATCH_RUNNER
# - LOCK_RACER_RUNNER
# - LEASE_RUNNER
checkers:
- KV_HASH
- LEASE_EXPIRE
#- SHORT_TTL_LEASE_EXPIRE
stress-key-size: 100
stress-key-size-large: 32769
stress-key-suffix-range: 250000
stress-key-suffix-range-txn: 100
stress-key-txn-ops: 10
stress-clients: 100
stress-qps: 2000
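
A minimal Go sketch (not from the etcd source; it assumes the weights above are treated as relative proportions, and all names are hypothetical) of how such stresser weights could drive selection:

	package main

	import (
		"fmt"
		"math/rand"
	)

	type stresser struct {
		Type   string
		Weight float64
	}

	// pickStresser returns a stresser type with probability proportional
	// to its weight; weights need not sum to 1.
	func pickStresser(ss []stresser) string {
		total := 0.0
		for _, s := range ss {
			total += s.Weight
		}
		r := rand.Float64() * total
		for _, s := range ss {
			r -= s.Weight
			if r <= 0 {
				return s.Type
			}
		}
		return ss[len(ss)-1].Type // guard against floating-point drift
	}

	func main() {
		ss := []stresser{
			{"KV_WRITE_SMALL", 0.35},
			{"KV_TXN_WRITE_DELETE", 0.35},
			{"KV_READ_ONE_KEY", 0.07},
		}
		fmt.Println(pickStresser(ss))
	}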

View File

@ -1,106 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rpcpb
import (
"fmt"
"reflect"
"strings"
)
var etcdFields = []string{
"Name",
"DataDir",
"WALDir",
"HeartbeatIntervalMs",
"ElectionTimeoutMs",
"ListenClientURLs",
"AdvertiseClientURLs",
"ClientAutoTLS",
"ClientCertAuth",
"ClientCertFile",
"ClientKeyFile",
"ClientTrustedCAFile",
"ListenPeerURLs",
"AdvertisePeerURLs",
"PeerAutoTLS",
"PeerClientCertAuth",
"PeerCertFile",
"PeerKeyFile",
"PeerTrustedCAFile",
"InitialCluster",
"InitialClusterState",
"InitialClusterToken",
"SnapshotCount",
"QuotaBackendBytes",
"PreVote",
"InitialCorruptCheck",
"Logger",
"LogOutputs",
"LogLevel",
"SocketReuseAddress",
"SocketReusePort",
}
// Flags returns etcd flags in string slice.
func (e *Etcd) Flags() (fs []string) {
tp := reflect.TypeOf(*e)
vo := reflect.ValueOf(*e)
for _, name := range etcdFields {
field, ok := tp.FieldByName(name)
if !ok {
panic(fmt.Errorf("field %q not found", name))
}
fv := reflect.Indirect(vo).FieldByName(name)
var sv string
switch fv.Type().Kind() {
case reflect.String:
sv = fv.String()
case reflect.Slice:
n := fv.Len()
sl := make([]string, n)
for i := 0; i < n; i++ {
sl[i] = fv.Index(i).String()
}
sv = strings.Join(sl, ",")
case reflect.Int64:
sv = fmt.Sprintf("%d", fv.Int())
case reflect.Bool:
sv = fmt.Sprintf("%v", fv.Bool())
default:
panic(fmt.Errorf("field %q (%v) cannot be parsed", name, fv.Type().Kind()))
}
fname := field.Tag.Get("yaml")
// TODO: remove this
if fname == "initial-corrupt-check" {
fname = "experimental-" + fname
}
if sv != "" {
fs = append(fs, fmt.Sprintf("--%s=%s", fname, sv))
}
}
return fs
}
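
A brief usage sketch (illustrative only; the Etcd literal and the exec invocation below are assumptions, not part of this file):

	e := &Etcd{Name: "s1", DataDir: "/tmp/s1.data", Logger: "zap"}
	// args contains "--name=s1", "--data-dir=/tmp/s1.data" and "--logger=zap",
	// plus an entry for every numeric and boolean field: zero ints and false
	// bools still render to non-empty strings (see the test below).
	args := e.Flags()
	cmd := exec.Command("./bin/etcd", args...) // requires "os/exec"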

View File

@ -1,95 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rpcpb
import (
"reflect"
"testing"
)
func TestEtcd(t *testing.T) {
e := &Etcd{
Name: "s1",
DataDir: "/tmp/etcd-functional-1/etcd.data",
WALDir: "/tmp/etcd-functional-1/etcd.data/member/wal",
HeartbeatIntervalMs: 100,
ElectionTimeoutMs: 1000,
ListenClientURLs: []string{"https://127.0.0.1:1379"},
AdvertiseClientURLs: []string{"https://127.0.0.1:13790"},
ClientAutoTLS: true,
ClientCertAuth: false,
ClientCertFile: "",
ClientKeyFile: "",
ClientTrustedCAFile: "",
ListenPeerURLs: []string{"https://127.0.0.1:1380"},
AdvertisePeerURLs: []string{"https://127.0.0.1:13800"},
PeerAutoTLS: true,
PeerClientCertAuth: false,
PeerCertFile: "",
PeerKeyFile: "",
PeerTrustedCAFile: "",
InitialCluster: "s1=https://127.0.0.1:13800,s2=https://127.0.0.1:23800,s3=https://127.0.0.1:33800",
InitialClusterState: "new",
InitialClusterToken: "tkn",
SnapshotCount: 10000,
QuotaBackendBytes: 10740000000,
PreVote: true,
InitialCorruptCheck: true,
Logger: "zap",
LogOutputs: []string{"/tmp/etcd-functional-1/etcd.log"},
LogLevel: "info",
SocketReuseAddress: true,
SocketReusePort: true,
}
exps := []string{
"--name=s1",
"--data-dir=/tmp/etcd-functionl-1/etcd.data",
"--wal-dir=/tmp/etcd-functionl-1/etcd.data/member/wal",
"--heartbeat-interval=100",
"--election-timeout=1000",
"--listen-client-urls=https://127.0.0.1:1379",
"--advertise-client-urls=https://127.0.0.1:13790",
"--auto-tls=true",
"--client-cert-auth=false",
"--listen-peer-urls=https://127.0.0.1:1380",
"--initial-advertise-peer-urls=https://127.0.0.1:13800",
"--peer-auto-tls=true",
"--peer-client-cert-auth=false",
"--initial-cluster=s1=https://127.0.0.1:13800,s2=https://127.0.0.1:23800,s3=https://127.0.0.1:33800",
"--initial-cluster-state=new",
"--initial-cluster-token=tkn",
"--snapshot-count=10000",
"--quota-backend-bytes=10740000000",
"--pre-vote=true",
"--experimental-initial-corrupt-check=true",
"--logger=zap",
"--log-outputs=/tmp/etcd-functional-1/etcd.log",
"--log-level=info",
"--socket-reuse-address=true",
"--socket-reuse-port=true",
}
fs := e.Flags()
if !reflect.DeepEqual(exps, fs) {
t.Fatalf("expected %q, got %q", exps, fs)
}
}

View File

@ -1,375 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rpcpb
import (
"context"
"crypto/tls"
"fmt"
"net/url"
"os"
"time"
pb "go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/client/pkg/v3/logutil"
"go.etcd.io/etcd/client/pkg/v3/transport"
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/etcdutl/v3/snapshot"
"github.com/dustin/go-humanize"
"go.uber.org/zap"
grpc "google.golang.org/grpc"
"google.golang.org/grpc/credentials"
)
// ElectionTimeout returns an election timeout duration.
func (m *Member) ElectionTimeout() time.Duration {
return time.Duration(m.Etcd.ElectionTimeoutMs) * time.Millisecond
}
// DialEtcdGRPCServer creates a raw gRPC connection to an etcd member.
func (m *Member) DialEtcdGRPCServer(opts ...grpc.DialOption) (*grpc.ClientConn, error) {
dialOpts := []grpc.DialOption{
grpc.WithTimeout(5 * time.Second),
grpc.WithBlock(),
}
secure := false
for _, cu := range m.Etcd.AdvertiseClientURLs {
u, err := url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme == "https" { // TODO: handle unix
secure = true
}
}
if secure {
// assume TLS assets are already stored on disk
tlsInfo := transport.TLSInfo{
CertFile: m.ClientCertPath,
KeyFile: m.ClientKeyPath,
TrustedCAFile: m.ClientTrustedCAPath,
// TODO: remove this with generated certs
// only need it for auto TLS
InsecureSkipVerify: true,
}
tlsConfig, err := tlsInfo.ClientConfig()
if err != nil {
return nil, err
}
creds := credentials.NewTLS(tlsConfig)
dialOpts = append(dialOpts, grpc.WithTransportCredentials(creds))
} else {
dialOpts = append(dialOpts, grpc.WithInsecure())
}
dialOpts = append(dialOpts, opts...)
return grpc.Dial(m.EtcdClientEndpoint, dialOpts...)
}
// CreateEtcdClientConfig creates a client configuration from member.
func (m *Member) CreateEtcdClientConfig(opts ...grpc.DialOption) (cfg *clientv3.Config, err error) {
secure := false
for _, cu := range m.Etcd.AdvertiseClientURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme == "https" { // TODO: handle unix
secure = true
}
}
// TODO: make this configurable
level := "error"
if os.Getenv("ETCD_CLIENT_DEBUG") != "" {
level = "debug"
}
lcfg := logutil.DefaultZapLoggerConfig
lcfg.Level = zap.NewAtomicLevelAt(logutil.ConvertToZapLevel(level))
cfg = &clientv3.Config{
Endpoints: []string{m.EtcdClientEndpoint},
DialTimeout: 10 * time.Second,
DialOptions: opts,
LogConfig: &lcfg,
}
if secure {
// assume TLS assets are already stored on disk
tlsInfo := transport.TLSInfo{
CertFile: m.ClientCertPath,
KeyFile: m.ClientKeyPath,
TrustedCAFile: m.ClientTrustedCAPath,
// TODO: remove this with generated certs
// only need it for auto TLS
InsecureSkipVerify: true,
}
var tlsConfig *tls.Config
tlsConfig, err = tlsInfo.ClientConfig()
if err != nil {
return nil, err
}
cfg.TLS = tlsConfig
}
return cfg, err
}
// CreateEtcdClient creates a client from member.
func (m *Member) CreateEtcdClient(opts ...grpc.DialOption) (*clientv3.Client, error) {
cfg, err := m.CreateEtcdClientConfig(opts...)
if err != nil {
return nil, err
}
return clientv3.New(*cfg)
}
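// A minimal usage sketch (added for illustration; not part of the original
// file) of creating a client from a member and issuing a request.
// WriteHealthKey below follows the same pattern:
//
//	cli, err := m.CreateEtcdClient()
//	if err != nil {
//		return fmt.Errorf("%v (%q)", err, m.EtcdClientEndpoint)
//	}
//	defer cli.Close()
//	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
//	_, err = cli.Put(ctx, "health", "good")
//	cancel()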
// CheckCompact ensures that historical data before given revision has been compacted.
func (m *Member) CheckCompact(rev int64) error {
cli, err := m.CreateEtcdClient()
if err != nil {
return fmt.Errorf("%v (%q)", err, m.EtcdClientEndpoint)
}
defer cli.Close()
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
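// Watching from rev-1 against a store already compacted at rev should fail
// immediately: the server cancels the watch and reports CompactRevision,
// which is what the assertion below relies on.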
wch := cli.Watch(ctx, "\x00", clientv3.WithFromKey(), clientv3.WithRev(rev-1))
wr, ok := <-wch
cancel()
if !ok {
return fmt.Errorf("watch channel terminated (endpoint %q)", m.EtcdClientEndpoint)
}
if wr.CompactRevision != rev {
return fmt.Errorf("got compact revision %v, wanted %v (endpoint %q)", wr.CompactRevision, rev, m.EtcdClientEndpoint)
}
return nil
}
// Defrag runs defragmentation on this member.
func (m *Member) Defrag() error {
cli, err := m.CreateEtcdClient()
if err != nil {
return fmt.Errorf("%v (%q)", err, m.EtcdClientEndpoint)
}
defer cli.Close()
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
_, err = cli.Defragment(ctx, m.EtcdClientEndpoint)
cancel()
return err
}
// RevHash fetches current revision and hash on this member.
func (m *Member) RevHash() (int64, int64, error) {
conn, err := m.DialEtcdGRPCServer()
if err != nil {
return 0, 0, err
}
defer conn.Close()
mt := pb.NewMaintenanceClient(conn)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
resp, err := mt.Hash(ctx, &pb.HashRequest{}, grpc.WaitForReady(true))
cancel()
if err != nil {
return 0, 0, err
}
return resp.Header.Revision, int64(resp.Hash), nil
}
// Rev fetches current revision on this member.
func (m *Member) Rev(ctx context.Context) (int64, error) {
cli, err := m.CreateEtcdClient()
if err != nil {
return 0, fmt.Errorf("%v (%q)", err, m.EtcdClientEndpoint)
}
defer cli.Close()
resp, err := cli.Status(ctx, m.EtcdClientEndpoint)
if err != nil {
return 0, err
}
return resp.Header.Revision, nil
}
// Compact compacts member storage with given revision.
// It blocks until it's physically done.
func (m *Member) Compact(rev int64, timeout time.Duration) error {
cli, err := m.CreateEtcdClient()
if err != nil {
return fmt.Errorf("%v (%q)", err, m.EtcdClientEndpoint)
}
defer cli.Close()
ctx, cancel := context.WithTimeout(context.Background(), timeout)
_, err = cli.Compact(ctx, rev, clientv3.WithCompactPhysical())
cancel()
return err
}
// IsLeader returns true if this member is the current cluster leader.
func (m *Member) IsLeader() (bool, error) {
cli, err := m.CreateEtcdClient()
if err != nil {
return false, fmt.Errorf("%v (%q)", err, m.EtcdClientEndpoint)
}
defer cli.Close()
resp, err := cli.Status(context.Background(), m.EtcdClientEndpoint)
if err != nil {
return false, err
}
return resp.Header.MemberId == resp.Leader, nil
}
// WriteHealthKey writes a health key to this member.
func (m *Member) WriteHealthKey() error {
cli, err := m.CreateEtcdClient()
if err != nil {
return fmt.Errorf("%v (%q)", err, m.EtcdClientEndpoint)
}
defer cli.Close()
// give enough time-out in case expensive requests (range/delete) are pending
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
_, err = cli.Put(ctx, "health", "good")
cancel()
if err != nil {
return fmt.Errorf("%v (%q)", err, m.EtcdClientEndpoint)
}
return nil
}
// SaveSnapshot downloads a snapshot file from this member to local disk.
// It's meant to be requested remotely, so that the local member can store
// the snapshot file on its local disk.
func (m *Member) SaveSnapshot(lg *zap.Logger) (err error) {
// remove existing snapshot first
if err = os.RemoveAll(m.SnapshotPath); err != nil {
return err
}
var ccfg *clientv3.Config
ccfg, err = m.CreateEtcdClientConfig()
if err != nil {
return fmt.Errorf("%v (%q)", err, m.EtcdClientEndpoint)
}
lg.Info(
"snapshot save START",
zap.String("member-name", m.Etcd.Name),
zap.Strings("member-client-urls", m.Etcd.AdvertiseClientURLs),
zap.String("snapshot-path", m.SnapshotPath),
)
now := time.Now()
mgr := snapshot.NewV3(lg)
version, err := mgr.Save(context.Background(), *ccfg, m.SnapshotPath)
if err != nil {
return err
}
took := time.Since(now)
var fi os.FileInfo
fi, err = os.Stat(m.SnapshotPath)
if err != nil {
return err
}
var st snapshot.Status
st, err = mgr.Status(m.SnapshotPath)
if err != nil {
return err
}
m.SnapshotInfo = &SnapshotInfo{
MemberName: m.Etcd.Name,
MemberClientURLs: m.Etcd.AdvertiseClientURLs,
SnapshotPath: m.SnapshotPath,
SnapshotFileSize: humanize.Bytes(uint64(fi.Size())),
SnapshotTotalSize: humanize.Bytes(uint64(st.TotalSize)),
SnapshotTotalKey: int64(st.TotalKey),
SnapshotHash: int64(st.Hash),
SnapshotRevision: st.Revision,
Took: fmt.Sprintf("%v", took),
Version: version,
}
lg.Info(
"snapshot save END",
zap.String("member-name", m.SnapshotInfo.MemberName),
zap.String("member-version", m.SnapshotInfo.Version),
zap.Strings("member-client-urls", m.SnapshotInfo.MemberClientURLs),
zap.String("snapshot-path", m.SnapshotPath),
zap.String("snapshot-file-size", m.SnapshotInfo.SnapshotFileSize),
zap.String("snapshot-total-size", m.SnapshotInfo.SnapshotTotalSize),
zap.Int64("snapshot-total-key", m.SnapshotInfo.SnapshotTotalKey),
zap.Int64("snapshot-hash", m.SnapshotInfo.SnapshotHash),
zap.Int64("snapshot-revision", m.SnapshotInfo.SnapshotRevision),
zap.String("took", m.SnapshotInfo.Took),
)
return nil
}
// RestoreSnapshot restores a cluster from a given snapshot file on disk.
// It's meant to be requested remotely, so that the local member can load
// the snapshot file from its local disk.
func (m *Member) RestoreSnapshot(lg *zap.Logger) (err error) {
if err = os.RemoveAll(m.EtcdOnSnapshotRestore.DataDir); err != nil {
return err
}
if err = os.RemoveAll(m.EtcdOnSnapshotRestore.WALDir); err != nil {
return err
}
lg.Info(
"snapshot restore START",
zap.String("member-name", m.Etcd.Name),
zap.Strings("member-client-urls", m.Etcd.AdvertiseClientURLs),
zap.String("snapshot-path", m.SnapshotPath),
)
now := time.Now()
mgr := snapshot.NewV3(lg)
err = mgr.Restore(snapshot.RestoreConfig{
SnapshotPath: m.SnapshotInfo.SnapshotPath,
Name: m.EtcdOnSnapshotRestore.Name,
OutputDataDir: m.EtcdOnSnapshotRestore.DataDir,
OutputWALDir: m.EtcdOnSnapshotRestore.WALDir,
PeerURLs: m.EtcdOnSnapshotRestore.AdvertisePeerURLs,
InitialCluster: m.EtcdOnSnapshotRestore.InitialCluster,
InitialClusterToken: m.EtcdOnSnapshotRestore.InitialClusterToken,
SkipHashCheck: false,
// TODO: set SkipHashCheck to true, to recover from an existing db file
})
took := time.Since(now)
lg.Info(
"snapshot restore END",
zap.String("member-name", m.SnapshotInfo.MemberName),
zap.String("member-version", m.SnapshotInfo.Version),
zap.Strings("member-client-urls", m.SnapshotInfo.MemberClientURLs),
zap.String("snapshot-path", m.SnapshotPath),
zap.String("snapshot-file-size", m.SnapshotInfo.SnapshotFileSize),
zap.String("snapshot-total-size", m.SnapshotInfo.SnapshotTotalSize),
zap.Int64("snapshot-total-key", m.SnapshotInfo.SnapshotTotalKey),
zap.Int64("snapshot-hash", m.SnapshotInfo.SnapshotHash),
zap.Int64("snapshot-revision", m.SnapshotInfo.SnapshotRevision),
zap.String("took", took.String()),
zap.Error(err),
)
return err
}
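// A sketch of the intended flow (illustrative; in the functional tester these
// methods are driven remotely via the SAVE_SNAPSHOT and
// RESTORE_RESTART_FROM_SNAPSHOT operations defined below, rather than called
// directly):
//
//	if err := m.SaveSnapshot(lg); err != nil {
//		return err
//	}
//	// ...later, on recovery, with m.SnapshotInfo populated:
//	if err := m.RestoreSnapshot(lg); err != nil {
//		return err
//	}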

File diff suppressed because it is too large

View File

@ -1,635 +0,0 @@
syntax = "proto3";
package rpcpb;
import "gogoproto/gogo.proto";
option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
option (gogoproto.goproto_getters_all) = false;
message Request {
Operation Operation = 1;
// Member contains the same Member object from tester configuration.
Member Member = 2;
// Tester contains tester configuration.
Tester Tester = 3;
}
// SnapshotInfo contains SAVE_SNAPSHOT request results.
message SnapshotInfo {
string MemberName = 1;
repeated string MemberClientURLs = 2;
string SnapshotPath = 3;
string SnapshotFileSize = 4;
string SnapshotTotalSize = 5;
int64 SnapshotTotalKey = 6;
int64 SnapshotHash = 7;
int64 SnapshotRevision = 8;
string Took = 9;
string Version = 10;
}
message Response {
bool Success = 1;
string Status = 2;
// Member contains the same Member object from tester request.
Member Member = 3;
// SnapshotInfo contains SAVE_SNAPSHOT request results.
SnapshotInfo SnapshotInfo = 4;
}
service Transport {
rpc Transport(stream Request) returns (stream Response) {}
}
message Member {
// EtcdExec is the executable etcd binary path in agent server.
string EtcdExec = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec\""];
// AgentAddr is the agent HTTP server address.
string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
// FailpointHTTPAddr is the agent's failpoints HTTP server address.
string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];
// BaseDir is the base directory where all logs and etcd data are stored.
string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
// EtcdClientProxy is true when client traffic needs to be proxied.
// If true, listen client URL port must be different than advertise client URL port.
bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
// EtcdPeerProxy is true when peer traffic needs to be proxied.
// If true, listen peer URL port must be different than advertise peer URL port.
bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];
// EtcdClientEndpoint is the etcd client endpoint.
string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
// Etcd defines etcd binary configuration flags.
Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];
// EtcdOnSnapshotRestore defines one-time use configuration during etcd
// snapshot recovery process.
Etcd EtcdOnSnapshotRestore = 303;
// ClientCertData contains cert file contents from this member's etcd server.
string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
// ClientKeyData contains key file contents from this member's etcd server.
string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
// ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];
// PeerCertData contains cert file contents from this member's etcd server.
string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
// PeerKeyData contains key file contents from this member's etcd server.
string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
// PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];
// SnapshotPath is the snapshot file path to store or restore from.
string SnapshotPath = 601 [(gogoproto.moretags) = "yaml:\"snapshot-path\""];
// SnapshotInfo contains last SAVE_SNAPSHOT request results.
SnapshotInfo SnapshotInfo = 602;
// Failpoints is the GOFAIL_FAILPOINTS environment variable value to use when starting etcd.
string Failpoints = 701 [(gogoproto.moretags) = "yaml:\"failpoints\""];
}
message Tester {
string DataDir = 1 [(gogoproto.moretags) = "yaml:\"data-dir\""];
string Network = 2 [(gogoproto.moretags) = "yaml:\"network\""];
string Addr = 3 [(gogoproto.moretags) = "yaml:\"addr\""];
// DelayLatencyMs is the delay latency in milliseconds,
// to inject into the simulated slow network.
uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
// DelayLatencyMsRv is the delay latency random variable in milliseconds.
uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
// UpdatedDelayLatencyMs is the updated delay latency in milliseconds,
// to inject into the simulated slow network. It's the final latency to apply,
// in case the latency numbers are randomly generated from given delay latency field.
uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];
// RoundLimit is the limit of rounds to run failure set (-1 to run without limits).
int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
// ExitOnCaseFail, when true, exits the tester on the first failure.
bool ExitOnCaseFail = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
// EnablePprof is true to enable profiler.
bool EnablePprof = 23 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];
// CaseDelayMs is the delay duration after failure is injected.
// Useful when triggering snapshot or no-op failure cases.
uint32 CaseDelayMs = 31 [(gogoproto.moretags) = "yaml:\"case-delay-ms\""];
// CaseShuffle is true to randomize failure injecting order.
bool CaseShuffle = 32 [(gogoproto.moretags) = "yaml:\"case-shuffle\""];
// Cases is the selected test cases to schedule.
// If empty, run all failure cases.
repeated string Cases = 33 [(gogoproto.moretags) = "yaml:\"cases\""];
// FailpointCommands is the list of "gofail" commands
// (e.g. panic("etcd-tester"),1*sleep(1000)).
repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];
// RunnerExecPath is a path of etcd-runner binary.
string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
// ExternalExecPath is a path of script for enabling/disabling an external fault injector.
string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];
// Stressers is the list of stresser types:
// KV, LEASE, ELECTION_RUNNER, WATCH_RUNNER, LOCK_RACER_RUNNER, LEASE_RUNNER.
repeated Stresser Stressers = 101 [(gogoproto.moretags) = "yaml:\"stressers\""];
// Checkers is the list of consistency checker types:
// KV_HASH, LEASE_EXPIRE, NO_CHECK, RUNNER.
// Leave empty to skip consistency checks.
repeated string Checkers = 102 [(gogoproto.moretags) = "yaml:\"checkers\""];
// StressKeySize is the size of each small key written into etcd.
int32 StressKeySize = 201 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
// StressKeySizeLarge is the size of each large key written into etcd.
int32 StressKeySizeLarge = 202 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
// StressKeySuffixRange is the count of key range written into etcd.
// Stress keys are created with fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)).
int32 StressKeySuffixRange = 203 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
// StressKeySuffixRangeTxn is the count of key range written into etcd txn (max 100).
// Stress keys are created with fmt.Sprintf("/k%03d", i).
int32 StressKeySuffixRangeTxn = 204 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
// StressKeyTxnOps is the number of operations per transaction (max 64).
int32 StressKeyTxnOps = 205 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];
// StressClients is the number of concurrent stressing clients
// with "one" shared TCP connection.
int32 StressClients = 301 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
// StressQPS is the maximum number of stresser requests per second.
int32 StressQPS = 302 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
}
enum StresserType {
KV_WRITE_SMALL = 0;
KV_WRITE_LARGE = 1;
KV_READ_ONE_KEY = 2;
KV_READ_RANGE = 3;
KV_DELETE_ONE_KEY = 4;
KV_DELETE_RANGE = 5;
KV_TXN_WRITE_DELETE = 6;
LEASE = 10;
ELECTION_RUNNER = 20;
WATCH_RUNNER = 31;
LOCK_RACER_RUNNER = 41;
LEASE_RUNNER = 51;
}
message Stresser {
string Type = 1 [(gogoproto.moretags) = "yaml:\"type\""];
double Weight = 2 [(gogoproto.moretags) = "yaml:\"weight\""];
}
enum Checker {
KV_HASH = 0;
LEASE_EXPIRE = 1;
RUNNER = 2;
NO_CHECK = 3;
SHORT_TTL_LEASE_EXPIRE = 4;
}
message Etcd {
string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];
// HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
// Default value is 100, which is 100ms.
int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
// ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
// Default value is 1000, which is 1s.
int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];
repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];
repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];
string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];
int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];
bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
string Logger = 71 [(gogoproto.moretags) = "yaml:\"logger\""];
// LogOutputs is the log file to store current etcd server logs.
repeated string LogOutputs = 72 [(gogoproto.moretags) = "yaml:\"log-outputs\""];
string LogLevel = 73 [(gogoproto.moretags) = "yaml:\"log-level\""];
bool SocketReuseAddress = 81 [(gogoproto.moretags) = "yaml:\"socket-reuse-address\""];
bool SocketReusePort = 82 [(gogoproto.moretags) = "yaml:\"socket-reuse-port\""];
}
enum Operation {
// NOT_STARTED is the agent status before etcd first start.
NOT_STARTED = 0;
// INITIAL_START_ETCD is only called to start etcd, the very first time.
INITIAL_START_ETCD = 10;
// RESTART_ETCD is sent to restart killed etcd.
RESTART_ETCD = 11;
// SIGTERM_ETCD pauses etcd process while keeping data directories
// and previous etcd configurations.
SIGTERM_ETCD = 20;
// SIGQUIT_ETCD_AND_REMOVE_DATA kills etcd process and removes all data
// directories to simulate destroying the whole machine.
SIGQUIT_ETCD_AND_REMOVE_DATA = 21;
// SAVE_SNAPSHOT is sent to trigger the local member to download its snapshot
// onto its local disk, at the path specified by the tester.
SAVE_SNAPSHOT = 30;
// RESTORE_RESTART_FROM_SNAPSHOT is sent to trigger local member to
// restore a cluster from an existing snapshot on disk, and restart
// an etcd instance from recovered data.
RESTORE_RESTART_FROM_SNAPSHOT = 31;
// RESTART_FROM_SNAPSHOT is sent to trigger local member to restart
// and join an existing cluster that has been recovered from a snapshot.
// Local member joins this cluster with fresh data.
RESTART_FROM_SNAPSHOT = 32;
// SIGQUIT_ETCD_AND_ARCHIVE_DATA is sent when the consistency check failed,
// thus the etcd data directories need to be archived.
SIGQUIT_ETCD_AND_ARCHIVE_DATA = 40;
// BLACKHOLE_PEER_PORT_TX_RX drops all outgoing/incoming packets from/to
// the target member's peer port.
BLACKHOLE_PEER_PORT_TX_RX = 100;
// UNBLACKHOLE_PEER_PORT_TX_RX removes outgoing/incoming packet dropping.
UNBLACKHOLE_PEER_PORT_TX_RX = 101;
// DELAY_PEER_PORT_TX_RX delays all outgoing/incoming packets from/to
// the target member's peer port.
DELAY_PEER_PORT_TX_RX = 200;
// UNDELAY_PEER_PORT_TX_RX removes all outgoing/incoming delays.
UNDELAY_PEER_PORT_TX_RX = 201;
}
// Case defines various system faults or test cases in distributed systems,
// in order to verify correct behavior of etcd servers and clients.
enum Case {
// SIGTERM_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
// but does not delete its data directories on disk for next restart.
// It waits "delay-ms" before recovering this failure.
// The expected behavior is that the follower comes back online
// and rejoins the cluster, and then each member continues to process
// client requests ('Put' request that requires Raft consensus).
SIGTERM_ONE_FOLLOWER = 0;
// SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
// follower but does not delete its data directories on disk for next
// restart. And waits until the most up-to-date node (leader) applies the
// snapshot count of entries since the stop operation.
// The expected behavior is that the follower comes back online and
// rejoins the cluster, and then active leader sends snapshot
// to the follower to force it to follow the leader's log.
// As always, after recovery, each member must be able to process
// client requests.
SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;
// SIGTERM_LEADER stops the active leader node but does not delete its
// data directories on disk for next restart. Then it waits "delay-ms"
// before recovering this failure, in order to trigger election timeouts.
// The expected behavior is that a new leader gets elected, and the
// old leader comes back online and rejoins the cluster as a follower.
// As always, after recovery, each member must be able to process
// client requests.
SIGTERM_LEADER = 2;
// SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
// but does not delete its data directories on disk for next restart.
// And waits until the most up-to-date node ("new" leader) applies the
// snapshot count of entries since the stop operation.
// The expected behavior is that cluster elects a new leader, and the
// old leader comes back online and rejoins the cluster as a follower.
// And it receives the snapshot from the new leader to overwrite its
// store. As always, after recovery, each member must be able to
// process client requests.
SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;
// SIGTERM_QUORUM stops majority number of nodes to make the whole cluster
// inoperable but does not delete data directories on stopped nodes
// for next restart. And it waits "delay-ms" before recovering failure.
// The expected behavior is that nodes come back online, thus cluster
// comes back operative as well. As always, after recovery, each member
// must be able to process client requests.
SIGTERM_QUORUM = 4;
// SIGTERM_ALL stops the whole cluster but does not delete data directories
// on disk for next restart. And it waits "delay-ms" before recovering
// this failure.
// The expected behavior is that nodes come back online, thus cluster
// comes back operative as well. As always, after recovery, each member
// must be able to process client requests.
SIGTERM_ALL = 5;
// SIGQUIT_AND_REMOVE_ONE_FOLLOWER stops a randomly chosen follower
// (non-leader), deletes its data directories on disk, and removes
// this member from cluster (membership reconfiguration). On recovery,
// tester adds a new member, and this member joins the existing cluster
// with fresh data. It waits "delay-ms" before recovering this
// failure. This simulates destroying one follower machine, where operator
// needs to add a new member from a fresh machine.
// The expected behavior is that a new member joins the existing cluster,
// and then each member continues to process client requests.
SIGQUIT_AND_REMOVE_ONE_FOLLOWER = 10;
// SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly
// chosen follower, deletes its data directories on disk, and removes
// this member from cluster (membership reconfiguration). On recovery,
// tester adds a new member, and this member joins the existing cluster
// with fresh data. On member remove, the cluster waits until the most
// up-to-date node (leader) applies the snapshot count of entries since
// the stop operation. This simulates destroying a follower machine, where
// the operator needs to add a new member from a fresh machine.
// The expected behavior is that a new member joins the existing cluster,
// and receives a snapshot from the active leader. As always, after
// recovery, each member must be able to process client requests.
SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 11;
// SIGQUIT_AND_REMOVE_LEADER stops the active leader node, deletes its
// data directories on disk, and removes this member from cluster.
// On recovery, tester adds a new member, and this member joins the
// existing cluster with fresh data. It waits "delay-ms" before
// recovering this failure. This simulates destroying a leader machine,
// where operator needs to add a new member from a fresh machine.
// The expected behavior is that a new member joins the existing cluster,
// and then each member continues to process client requests.
SIGQUIT_AND_REMOVE_LEADER = 12;
// SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader,
// deletes its data directories on disk, and removes this member from
// cluster (membership reconfiguration). On recovery, tester adds a new
// member, and this member joins the existing cluster with fresh data. On
// member remove, the cluster waits until the most up-to-date node (new
// leader) applies
// the snapshot count of entries since the stop operation. This simulates
// destroying a leader machine, where operator needs to add a new member
// from a fresh machine.
// The expected behavior is that on member remove, cluster elects a new
// leader, and a new member joins the existing cluster and receives a
// snapshot from the newly elected leader. As always, after recovery, each
// member must be able to process client requests.
SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT = 13;
// SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH first
// stops majority number of nodes, deletes data directories on those quorum
// nodes, to make the whole cluster inoperable. Now that quorum and their
// data are totally destroyed, cluster cannot even remove unavailable nodes
// (e.g. 2 out of 3 are lost, so no leader can be elected).
// Let's assume 3-node cluster of node A, B, and C. One day, node A and B
// are destroyed and all their data are gone. The only viable solution is
// to recover from C's latest snapshot.
//
// To simulate:
// 1. Assume node C is the current leader with most up-to-date data.
// 2. Download snapshot from node C, before destroying node A and B.
// 3. Destroy node A and B, and make the whole cluster inoperable.
// 4. Now node C cannot operate either.
// 5. SIGTERM node C and remove its data directories.
// 6. Restore a new seed member from node C's latest snapshot file.
// 7. Add another member to establish 2-node cluster.
// 8. Add another member to establish 3-node cluster.
// 9. Add more if any.
//
// The expected behavior is that etcd successfully recovers from such a
// disastrous situation where only 1 node survives out of a 3-node cluster:
// new members join the existing cluster, and previous data from the snapshot
// is still preserved after the recovery process. As always, after recovery,
// each member must be able to process client requests.
SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH = 14;
// BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
// packets from/to the peer port on a randomly chosen follower
// (non-leader), and waits for "delay-ms" until recovery.
// The expected behavior is that once dropping operation is undone,
// each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;
// BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
// all outgoing/incoming packets from/to the peer port on a randomly
// chosen follower (non-leader), and waits until the most up-to-date node
// (leader) applies the snapshot count of entries since the blackhole
// operation.
// The expected behavior is that once packet drop operation is undone,
// the slow follower tries to catch up, possibly receiving the snapshot
// from the active leader. As always, after recovery, each member must
// be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;
// BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
// from/to the peer port on the active leader (isolated), and waits for
// "delay-ms" until recovery, in order to trigger election timeout.
// The expected behavior is that after election timeout, a new leader gets
// elected, and once dropping operation is undone, the old leader comes
// back and rejoins the cluster as a follower. As always, after recovery,
// each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;
// BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
// outgoing/incoming packets from/to the peer port on the active leader,
// and waits until the most up-to-date node (leader) applies the snapshot
// count of entries since the blackhole operation.
// The expected behavior is that cluster elects a new leader, and once
// dropping operation is undone, the old leader comes back and rejoins
// the cluster as a follower. The slow follower tries to catch up, likely
// receiving the snapshot from the new active leader. As always, after
// recovery, each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;
// BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
// from/to the peer ports on majority nodes of cluster, thus losing its
// leader and cluster being inoperable. And it waits for "delay-ms"
// until recovery.
// The expected behavior is that once packet drop operation is undone,
// nodes come back online, thus cluster comes back operative. As always,
// after recovery, each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;
// BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
// from/to the peer ports on all nodes, thus making cluster totally
// inoperable. It waits for "delay-ms" until recovery.
// The expected behavior is that once packet drop operation is undone,
// nodes come back online, thus cluster comes back operative. As always,
// after recovery, each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;
// DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
// from/to the peer port on a randomly chosen follower (non-leader).
// It waits for "delay-ms" until recovery.
// The expected behavior is that once packet delay operation is undone,
// the follower comes back and tries to catch up with latest changes from
// cluster. And as always, after recovery, each member must be able to
// process client requests.
DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;
// RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
// packets from/to the peer port on a randomly chosen follower
// (non-leader) with a randomized time duration (thus isolated). It
// waits for "delay-ms" until recovery.
// The expected behavior is that once packet delay operation is undone,
// each member must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;
// DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on a randomly chosen
// follower (non-leader), and waits until the most up-to-date node (leader)
// applies the snapshot count of entries since the delay operation.
// The expected behavior is that the delayed follower gets isolated and
// falls behind the current active leader, and once the delay operation is
// undone, the slow follower comes back and catches up, possibly receiving
// a snapshot
// from the active leader. As always, after recovery, each member must be
// able to process client requests.
DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;
// RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on a randomly chosen
// follower (non-leader) with a randomized time duration, and waits until
// the most up-to-date node (leader) applies the snapshot count of entries
// since the delay operation.
// The expected behavior is that the delayed follower gets isolated and
// falls behind the current active leader, and once the delay operation is undone,
// the slow follower comes back and catches up, possibly receiving a
// snapshot from the active leader. As always, after recovery, each member
// must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;
// DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets from/to
// the peer port on the active leader. And waits for "delay-ms" until
// recovery.
// The expected behavior is that cluster may elect a new leader, and
// once packet delay operation is undone, the (old) leader comes back
// and tries to catch up with latest changes from cluster. As always,
// after recovery, each member must be able to process client requests.
DELAY_PEER_PORT_TX_RX_LEADER = 204;
// RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
// from/to the peer port on the active leader with a randomized time
// duration. And waits for "delay-ms" until recovery.
// The expected behavior is that cluster may elect a new leader, and
// once packet delay operation is undone, the (old) leader comes back
// and tries to catch up with latest changes from cluster. As always,
// after recovery, each member must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;
// DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on the active leader,
// and waits until the most up-to-date node (current or new leader) applies the
// snapshot count of entries since the delay operation.
// The expected behavior is that cluster may elect a new leader, and
// the old leader gets isolated and falls behind the current active leader,
// and once delay operation is undone, the slow follower comes back
// and catches up, likely receiving a snapshot from the active leader.
// As always, after recovery, each member must be able to process client
// requests.
DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;
// RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on the active leader,
// with a randomized time duration. And it waits until the most up-to-date
// node (current or new leader) applies the snapshot count of entries since the
// delay operation.
// The expected behavior is that cluster may elect a new leader, and
// the old leader gets isolated and falls behind the current active leader,
// and once delay operation is undone, the slow follower comes back
// and catches up, likely receiving a snapshot from the active leader.
// As always, after recovery, each member must be able to process client
// requests.
RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;
// DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets from/to
// the peer ports on majority nodes of cluster. And it waits for
// "delay-ms" until recovery, likely to trigger election timeouts.
// The expected behavior is that cluster may elect a new leader, while
// quorum of nodes struggle with slow networks, and once delay operation
// is undone, nodes come back and cluster comes back operative. As always,
// after recovery, each member must be able to process client requests.
DELAY_PEER_PORT_TX_RX_QUORUM = 208;
// RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
// from/to the peer ports on majority nodes of cluster, with randomized
// time durations. And it waits for "delay-ms" until recovery, likely
// to trigger election timeouts.
// The expected behavior is that cluster may elect a new leader, while
// quorum of nodes struggle with slow networks, and once delay operation
// is undone, nodes come back and cluster comes back operative. As always,
// after recovery, each member must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;
// DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to the
// peer ports on all nodes. And it waits for "delay-ms" until recovery,
// likely to trigger election timeouts.
// The expected behavior is that cluster may become totally inoperable,
// struggling with slow networks across the whole cluster. Once delay
// operation is undone, nodes come back and cluster comes back operative.
// As always, after recovery, each member must be able to process client
// requests.
DELAY_PEER_PORT_TX_RX_ALL = 210;
// RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
// from/to the peer ports on all nodes, with randomized time durations.
// And it waits for "delay-ms" until recovery, likely to trigger
// election timeouts.
// The expected behavior is that cluster may become totally inoperable,
// struggling with slow networks across the whole cluster. Once delay
// operation is undone, nodes come back and cluster comes back operative.
// As always, after recovery, each member must be able to process client
// requests.
RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;
// NO_FAIL_WITH_STRESS stops injecting failures while testing the
// consistency and correctness under pressure loads, for the duration of
// "delay-ms". Goal is to ensure cluster be still making progress
// on recovery, and verify system does not deadlock following a sequence
// of failure injections.
// The expected behavior is that cluster remains fully operative in healthy
// condition. As always, after recovery, each member must be able to process
// client requests.
NO_FAIL_WITH_STRESS = 300;
// NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS neither injects failures nor
// sends stressing client requests to the cluster, for the duration of
// "delay-ms". The goal is to ensure the cluster is still making progress
// on recovery, and to verify the system does not deadlock following a
// sequence of failure injections.
// The expected behavior is that cluster remains fully operative in healthy
// condition, and clients requests during liveness period succeed without
// errors.
// Note: this is how Google Chubby does failure injection testing
// https://static.googleusercontent.com/media/research.google.com/en//archive/paxos_made_live.pdf.
NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;
// FAILPOINTS injects failpoints to etcd server runtime, triggering panics
// in critical code paths.
FAILPOINTS = 400;
// FAILPOINTS_WITH_DISK_IO_LATENCY injects high disk I/O latency failure in raftAfterSave code paths.
FAILPOINTS_WITH_DISK_IO_LATENCY = 401;
// EXTERNAL runs external failure injection scripts.
EXTERNAL = 500;
}
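
A minimal sketch (assumptions: gRPC-Go stubs generated from this service into a rpcpb package; "context", "log" and "google.golang.org/grpc" imported; an agent listening on a hypothetical address; mem and tester values built from the tester configuration) of driving an agent over the bidirectional Transport stream:

	conn, err := grpc.Dial("127.0.0.1:19027", grpc.WithInsecure())
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()
	stream, err := rpcpb.NewTransportClient(conn).Transport(context.Background())
	if err != nil {
		log.Fatal(err)
	}
	// Ask the agent to start etcd for the very first time.
	if err := stream.Send(&rpcpb.Request{
		Operation: rpcpb.Operation_INITIAL_START_ETCD,
		Member:    mem,
		Tester:    tester,
	}); err != nil {
		log.Fatal(err)
	}
	resp, err := stream.Recv()
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("success=%v status=%q", resp.Success, resp.Status)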

View File

@ -1,144 +0,0 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package runner
import (
"context"
"errors"
"fmt"
"go.etcd.io/etcd/client/v3/concurrency"
"github.com/spf13/cobra"
)
// NewElectionCommand returns the cobra command for "election runner".
func NewElectionCommand() *cobra.Command {
cmd := &cobra.Command{
Use: "election [election name (defaults to 'elector')]",
Short: "Performs election operation",
Run: runElectionFunc,
}
cmd.Flags().IntVar(&totalClientConnections, "total-client-connections", 10, "total number of client connections")
return cmd
}
func runElectionFunc(cmd *cobra.Command, args []string) {
election := "elector"
if len(args) == 1 {
election = args[0]
}
if len(args) > 1 {
ExitWithError(ExitBadArgs, errors.New("election takes at most one argument"))
}
rcs := make([]roundClient, totalClientConnections)
validatec := make(chan struct{}, len(rcs))
// nextc closes when election is ready for next round.
nextc := make(chan struct{})
eps := endpointsFromFlag(cmd)
for i := range rcs {
v := fmt.Sprintf("%d", i)
observedLeader := ""
validateWaiters := 0
var rcNextc chan struct{}
setRcNextc := func() {
rcNextc = nextc
}
rcs[i].c = newClient(eps, dialTimeout)
var (
s *concurrency.Session
err error
)
for {
s, err = concurrency.NewSession(rcs[i].c)
if err == nil {
break
}
}
e := concurrency.NewElection(s, election)
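// acquire campaigns for leadership while a helper goroutine observes the
// election; if another candidate is observed as leader first, the campaign
// context is canceled so this client falls through to the follower path.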
rcs[i].acquire = func() (err error) {
ctx, cancel := context.WithCancel(context.Background())
donec := make(chan struct{})
go func() {
defer close(donec)
for ctx.Err() == nil {
if ol, ok := <-e.Observe(ctx); ok {
observedLeader = string(ol.Kvs[0].Value)
break
}
}
if observedLeader != v {
cancel()
}
}()
err = e.Campaign(ctx, v)
cancel()
<-donec
if err == nil {
observedLeader = v
}
if observedLeader == v {
validateWaiters = len(rcs)
}
select {
case <-ctx.Done():
return nil
default:
return err
}
}
rcs[i].validate = func() error {
l, err := e.Leader(context.TODO())
if err == nil && string(l.Kvs[0].Value) != observedLeader {
return fmt.Errorf("expected leader %q, got %q", observedLeader, l.Kvs[0].Value)
}
if err != nil {
return err
}
setRcNextc()
validatec <- struct{}{}
return nil
}
rcs[i].release = func() error {
for validateWaiters > 0 {
select {
case <-validatec:
validateWaiters--
default:
return fmt.Errorf("waiting on followers")
}
}
if err := e.Resign(context.TODO()); err != nil {
return err
}
if observedLeader == v {
oldNextc := nextc
nextc = make(chan struct{})
close(oldNextc)
}
<-rcNextc
observedLeader = ""
return nil
}
}
// Each client creates one key via Campaign() and deletes it via Resign(),
// so a round involves 2*len(rcs) requests.
doRounds(rcs, rounds, 2*len(rcs))
}
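// A hypothetical invocation (it assumes the --endpoints flag is registered
// on the root runner command, as endpointsFromFlag in global.go expects):
//
//	./bin/etcd-runner --endpoints=127.0.0.1:2379 election my-election \
//	    --total-client-connections=10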

View File

@ -1,42 +0,0 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package runner
import (
"fmt"
"os"
"go.etcd.io/etcd/client/v2"
)
const (
// http://tldp.org/LDP/abs/html/exitcodes.html
ExitSuccess = iota
ExitError
ExitBadConnection
ExitInvalidInput // for txn, watch command
ExitBadFeature // provided a valid flag with an unsupported value
ExitInterrupted
ExitIO
ExitBadArgs = 128
)
func ExitWithError(code int, err error) {
fmt.Fprintln(os.Stderr, "Error: ", err)
if cerr, ok := err.(*client.ClusterError); ok {
fmt.Fprintln(os.Stderr, cerr.Detail())
}
os.Exit(code)
}
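// Example (mirroring the election runner above):
//
//	ExitWithError(ExitBadArgs, errors.New("election takes at most one argument"))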

View File

@ -1,114 +0,0 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package runner
import (
"context"
"fmt"
"log"
"sync"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
"github.com/spf13/cobra"
"golang.org/x/time/rate"
)
// shared flags
var (
totalClientConnections int // total number of client connections to be made with server
endpoints []string
dialTimeout time.Duration
rounds int // total number of rounds to run; set to <= 0 to run forever.
reqRate int // maximum number of requests per second.
)
type roundClient struct {
c *clientv3.Client
progress int
acquire func() error
validate func() error
release func() error
}
func newClient(eps []string, timeout time.Duration) *clientv3.Client {
c, err := clientv3.New(clientv3.Config{
Endpoints: eps,
DialTimeout: timeout * time.Second, // assumes the flag stores a bare second count in the Duration; if it were parsed as a real Duration this would over-scale
})
if err != nil {
log.Fatal(err)
}
return c
}
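// doRounds drives every roundClient through acquire -> validate -> release
// for the given number of rounds (forever when rounds <= 0), throttling the
// aggregate request volume with a shared rate limiter and panicking if no
// client finishes a round for a full minute.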
func doRounds(rcs []roundClient, rounds int, requests int) {
var wg sync.WaitGroup
wg.Add(len(rcs))
finished := make(chan struct{})
limiter := rate.NewLimiter(rate.Limit(reqRate), reqRate)
for i := range rcs {
go func(rc *roundClient) {
defer wg.Done()
for rc.progress < rounds || rounds <= 0 {
if err := limiter.WaitN(context.Background(), requests/len(rcs)); err != nil {
log.Panicf("rate limiter error %v", err)
}
for rc.acquire() != nil { /* spin */
}
if err := rc.validate(); err != nil {
log.Fatal(err)
}
time.Sleep(10 * time.Millisecond)
rc.progress++
finished <- struct{}{}
for rc.release() != nil { /* spin */
}
}
}(&rcs[i])
}
start := time.Now()
for i := 1; i < len(rcs)*rounds+1 || rounds <= 0; i++ {
select {
case <-finished:
if i%100 == 0 {
fmt.Printf("finished %d, took %v\n", i, time.Since(start))
start = time.Now()
}
case <-time.After(time.Minute):
log.Panic("no progress after 1 minute!")
}
}
wg.Wait()
for _, rc := range rcs {
rc.c.Close()
}
}
func endpointsFromFlag(cmd *cobra.Command) []string {
eps, err := cmd.Flags().GetStringSlice("endpoints")
if err != nil {
ExitWithError(ExitError, err)
}
return eps
}
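
The shared limiter in doRounds spreads one process-wide request budget across every round client: each goroutine blocks on WaitN for its per-round share before doing any work. A minimal sketch of that pattern in isolation (the 30 req/sec budget and the worker and round counts are illustrative):

package main

import (
	"context"
	"log"
	"sync"

	"golang.org/x/time/rate"
)

func main() {
	const budget = 30 // whole-process requests per second
	limiter := rate.NewLimiter(rate.Limit(budget), budget)

	var wg sync.WaitGroup
	for w := 0; w < 3; w++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for round := 0; round < 5; round++ {
				// block until this worker's share of the budget is available
				if err := limiter.WaitN(context.Background(), budget/3); err != nil {
					log.Panicf("rate limiter error %v", err)
				}
				log.Printf("worker %d finished round %d", id, round)
			}
		}(w)
	}
	wg.Wait()
}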

View File

@ -1,175 +0,0 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// copied from https://github.com/rkt/rkt/blob/master/rkt/help.go
package runner
import (
"bytes"
"fmt"
"io"
"os"
"strings"
"text/tabwriter"
"text/template"
"go.etcd.io/etcd/api/v3/version"
"github.com/spf13/cobra"
"github.com/spf13/pflag"
)
var (
commandUsageTemplate *template.Template
templFuncs = template.FuncMap{
"descToLines": func(s string) []string {
// trim leading/trailing whitespace and split into slice of lines
return strings.Split(strings.Trim(s, "\n\t "), "\n")
},
"cmdName": func(cmd *cobra.Command, startCmd *cobra.Command) string {
parts := []string{cmd.Name()}
for cmd.HasParent() && cmd.Parent().Name() != startCmd.Name() {
cmd = cmd.Parent()
parts = append([]string{cmd.Name()}, parts...)
}
return strings.Join(parts, " ")
},
}
)
func init() {
commandUsage := `
{{ $cmd := .Cmd }}\
{{ $cmdname := cmdName .Cmd .Cmd.Root }}\
NAME:
{{ if not .Cmd.HasParent }}\
{{printf "\t%s - %s" .Cmd.Name .Cmd.Short}}
{{else}}\
{{printf "\t%s - %s" $cmdname .Cmd.Short}}
{{end}}\
USAGE:
{{printf "\t%s" .Cmd.UseLine}}
{{ if not .Cmd.HasParent }}\
VERSION:
{{printf "\t%s" .Version}}
{{end}}\
{{if .Cmd.HasSubCommands}}\
API VERSION:
{{printf "\t%s" .APIVersion}}
{{end}}\
{{if .Cmd.HasSubCommands}}\
COMMANDS:
{{range .SubCommands}}\
{{ $cmdname := cmdName . $cmd }}\
{{ if .Runnable }}\
{{printf "\t%s\t%s" $cmdname .Short}}
{{end}}\
{{end}}\
{{end}}\
{{ if .Cmd.Long }}\
DESCRIPTION:
{{range $line := descToLines .Cmd.Long}}{{printf "\t%s" $line}}
{{end}}\
{{end}}\
{{if .Cmd.HasLocalFlags}}\
OPTIONS:
{{.LocalFlags}}\
{{end}}\
{{if .Cmd.HasInheritedFlags}}\
GLOBAL OPTIONS:
{{.GlobalFlags}}\
{{end}}
`[1:]
commandUsageTemplate = template.Must(template.New("command_usage").Funcs(templFuncs).Parse(strings.ReplaceAll(commandUsage, "\\\n", "")))
}
func etcdFlagUsages(flagSet *pflag.FlagSet) string {
x := new(bytes.Buffer)
flagSet.VisitAll(func(flag *pflag.Flag) {
if len(flag.Deprecated) > 0 {
return
}
var format string
if len(flag.Shorthand) > 0 {
format = " -%s, --%s"
} else {
format = " %s --%s"
}
if len(flag.NoOptDefVal) > 0 {
format = format + "["
}
if flag.Value.Type() == "string" {
// put quotes on the value
format = format + "=%q"
} else {
format = format + "=%s"
}
if len(flag.NoOptDefVal) > 0 {
format = format + "]"
}
format = format + "\t%s\n"
shorthand := flag.Shorthand
fmt.Fprintf(x, format, shorthand, flag.Name, flag.DefValue, flag.Usage)
})
return x.String()
}
func getSubCommands(cmd *cobra.Command) []*cobra.Command {
var subCommands []*cobra.Command
for _, subCmd := range cmd.Commands() {
subCommands = append(subCommands, subCmd)
subCommands = append(subCommands, getSubCommands(subCmd)...)
}
return subCommands
}
func usageFunc(cmd *cobra.Command) error {
subCommands := getSubCommands(cmd)
tabOut := getTabOutWithWriter(os.Stdout)
commandUsageTemplate.Execute(tabOut, struct {
Cmd *cobra.Command
LocalFlags string
GlobalFlags string
SubCommands []*cobra.Command
Version string
APIVersion string
}{
cmd,
etcdFlagUsages(cmd.LocalFlags()),
etcdFlagUsages(cmd.InheritedFlags()),
subCommands,
version.Version,
version.APIVersion,
})
tabOut.Flush()
return nil
}
func getTabOutWithWriter(writer io.Writer) *tabwriter.Writer {
aTabOut := new(tabwriter.Writer)
aTabOut.Init(writer, 0, 8, 1, '\t', 0)
return aTabOut
}
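
The trailing backslashes inside commandUsage are a readability device: the ReplaceAll call above strips every backslash-plus-newline pair before parsing, so the template can be written one directive per line without leaking stray newlines into the output. A small sketch of the same trick:

package main

import (
	"log"
	"os"
	"strings"
	"text/template"
)

func main() {
	// each backslash+newline pair is stripped before parsing, so the
	// assignment directive and the greeting render as a single output line
	raw := "{{ $name := .Name }}\\\nHello, {{ $name }}!\n"
	tmpl := template.Must(template.New("demo").Parse(strings.ReplaceAll(raw, "\\\n", "")))
	if err := tmpl.Execute(os.Stdout, struct{ Name string }{"etcd"}); err != nil {
		log.Fatal(err)
	}
	// prints: Hello, etcd!
}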

View File

@ -1,85 +0,0 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package runner
import (
"context"
"errors"
"log"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
"github.com/spf13/cobra"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
var (
leaseTTL int64
)
// NewLeaseRenewerCommand returns the cobra command for "lease-renewer runner".
func NewLeaseRenewerCommand() *cobra.Command {
cmd := &cobra.Command{
Use: "lease-renewer",
Short: "Performs lease renew operation",
Run: runLeaseRenewerFunc,
}
cmd.Flags().Int64Var(&leaseTTL, "ttl", 5, "lease's ttl")
return cmd
}
func runLeaseRenewerFunc(cmd *cobra.Command, args []string) {
if len(args) > 0 {
ExitWithError(ExitBadArgs, errors.New("lease-renewer does not take any argument"))
}
eps := endpointsFromFlag(cmd)
c := newClient(eps, dialTimeout)
ctx := context.Background()
for {
var (
l *clientv3.LeaseGrantResponse
lk *clientv3.LeaseKeepAliveResponse
err error
)
for {
l, err = c.Lease.Grant(ctx, leaseTTL)
if err == nil {
break
}
}
expire := time.Now().Add(time.Duration(l.TTL-1) * time.Second)
for {
lk, err = c.Lease.KeepAliveOnce(ctx, l.ID)
if ev, ok := status.FromError(err); ok && ev.Code() == codes.NotFound {
if time.Since(expire) < 0 {
// a NotFound before the expected expiry means the renewal was lost early
log.Fatalf("bad renew! lease lost %v before expiry", -time.Since(expire))
}
log.Fatalf("lost lease %d, expire: %v\n", l.ID, expire)
}
if err != nil {
continue
}
expire = time.Now().Add(time.Duration(lk.TTL-1) * time.Second)
log.Printf("renewed lease %d, expire: %v\n", lk.ID, expire)
time.Sleep(time.Duration(lk.TTL-2) * time.Second)
}
}
}
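
The renewal loop above rests on two lease primitives: Grant creates a lease with a TTL, and KeepAliveOnce refreshes it and reports the remaining TTL. A minimal sketch, assuming a reachable etcd at 127.0.0.1:2379 (hypothetical endpoint):

package main

import (
	"context"
	"log"
	"time"

	clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"127.0.0.1:2379"},
		DialTimeout: 2 * time.Second,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer cli.Close()

	ctx := context.Background()
	lease, err := cli.Grant(ctx, 5) // 5-second TTL
	if err != nil {
		log.Fatal(err)
	}
	for i := 0; i < 3; i++ {
		ka, err := cli.KeepAliveOnce(ctx, lease.ID)
		if err != nil {
			log.Fatal(err)
		}
		log.Printf("renewed lease %d, TTL now %ds", ka.ID, ka.TTL)
		// renew comfortably before the TTL runs out, as the runner does
		time.Sleep(time.Duration(ka.TTL-2) * time.Second)
	}
}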

View File

@ -1,94 +0,0 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package runner
import (
"context"
"errors"
"fmt"
"sync"
"go.etcd.io/etcd/client/v3/concurrency"
"github.com/spf13/cobra"
)
// NewLockRacerCommand returns the cobra command for "lock-racer runner".
func NewLockRacerCommand() *cobra.Command {
cmd := &cobra.Command{
Use: "lock-racer [name of lock (defaults to 'racers')]",
Short: "Performs lock race operation",
Run: runRacerFunc,
}
cmd.Flags().IntVar(&totalClientConnections, "total-client-connections", 10, "total number of client connections")
return cmd
}
func runRacerFunc(cmd *cobra.Command, args []string) {
racers := "racers"
if len(args) == 1 {
racers = args[0]
}
if len(args) > 1 {
ExitWithError(ExitBadArgs, errors.New("lock-racer takes at most one argument"))
}
rcs := make([]roundClient, totalClientConnections)
ctx := context.Background()
// mu ensures validate and release funcs are atomic.
var mu sync.Mutex
cnt := 0
eps := endpointsFromFlag(cmd)
for i := range rcs {
var (
s *concurrency.Session
err error
)
rcs[i].c = newClient(eps, dialTimeout)
for {
s, err = concurrency.NewSession(rcs[i].c)
if err == nil {
break
}
}
m := concurrency.NewMutex(s, racers)
rcs[i].acquire = func() error { return m.Lock(ctx) }
rcs[i].validate = func() error {
mu.Lock()
defer mu.Unlock()
if cnt++; cnt != 1 {
return fmt.Errorf("bad lock; count: %d", cnt)
}
return nil
}
rcs[i].release = func() error {
mu.Lock()
defer mu.Unlock()
if err := m.Unlock(ctx); err != nil {
return err
}
cnt = 0
return nil
}
}
// each client creates 1 key via NewMutex()/Lock() and deletes it in Unlock();
// a round involves 2*len(rcs) requests.
doRounds(rcs, rounds, 2*len(rcs))
}
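
The racer's acquire/validate/release hooks wrap a distributed mutex; stripped of the round bookkeeping, the Lock/Unlock cycle looks like this (a sketch, reusing the "racers" prefix from above and assuming a local endpoint):

package main

import (
	"context"
	"log"
	"time"

	clientv3 "go.etcd.io/etcd/client/v3"
	"go.etcd.io/etcd/client/v3/concurrency"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"127.0.0.1:2379"},
		DialTimeout: 2 * time.Second,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer cli.Close()

	s, err := concurrency.NewSession(cli)
	if err != nil {
		log.Fatal(err)
	}
	defer s.Close()

	m := concurrency.NewMutex(s, "racers")
	ctx := context.Background()
	if err := m.Lock(ctx); err != nil { // creates one key under the prefix
		log.Fatal(err)
	}
	log.Println("holding the lock; at most one holder at a time")
	if err := m.Unlock(ctx); err != nil { // deletes the key
		log.Fatal(err)
	}
}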

View File

@ -1,70 +0,0 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package runner implements individual etcd-runner commands for the etcd-runner utility.
package runner
import (
"log"
"math/rand"
"time"
"github.com/spf13/cobra"
)
const (
cliName = "etcd-runner"
cliDescription = "Stress tests using clientv3 functionality.."
defaultDialTimeout = 2 * time.Second
)
var (
rootCmd = &cobra.Command{
Use: cliName,
Short: cliDescription,
SuggestFor: []string{"etcd-runner"},
}
)
func init() {
cobra.EnablePrefixMatching = true
rand.Seed(time.Now().UnixNano())
log.SetFlags(log.Lmicroseconds)
rootCmd.PersistentFlags().StringSliceVar(&endpoints, "endpoints", []string{"127.0.0.1:2379"}, "gRPC endpoints")
rootCmd.PersistentFlags().DurationVar(&dialTimeout, "dial-timeout", defaultDialTimeout, "dial timeout for client connections")
rootCmd.PersistentFlags().IntVar(&reqRate, "req-rate", 30, "maximum number of requests per second")
rootCmd.PersistentFlags().IntVar(&rounds, "rounds", 100, "number of rounds to run; 0 to run forever")
rootCmd.AddCommand(
NewElectionCommand(),
NewLeaseRenewerCommand(),
NewLockRacerCommand(),
NewWatchCommand(),
)
}
func Start() {
rootCmd.SetUsageFunc(usageFunc)
// Make help just show the usage
rootCmd.SetHelpTemplate(`{{.UsageString}}`)
if err := rootCmd.Execute(); err != nil {
ExitWithError(ExitError, err)
}
}

View File

@ -1,210 +0,0 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package runner
import (
"context"
"errors"
"fmt"
"log"
"sync"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/pkg/v3/stringutil"
"github.com/spf13/cobra"
"golang.org/x/time/rate"
)
var (
runningTime time.Duration // time for which operation should be performed
noOfPrefixes int // total number of prefixes which will be watched upon
watchPerPrefix int // number of watchers per prefix
watchPrefix string // prefix append to keys in watcher
totalKeys int // total number of keys for operation
)
// NewWatchCommand returns the cobra command for "watcher runner".
func NewWatchCommand() *cobra.Command {
cmd := &cobra.Command{
Use: "watcher",
Short: "Performs watch operation",
Run: runWatcherFunc,
}
cmd.Flags().DurationVar(&runningTime, "running-time", 60, "number of seconds to run")
cmd.Flags().StringVar(&watchPrefix, "prefix", "", "the prefix to append on all keys")
cmd.Flags().IntVar(&noOfPrefixes, "total-prefixes", 10, "total no of prefixes to use")
cmd.Flags().IntVar(&watchPerPrefix, "watch-per-prefix", 10, "number of watchers per prefix")
cmd.Flags().IntVar(&totalKeys, "total-keys", 1000, "total number of keys to watch")
return cmd
}
func runWatcherFunc(cmd *cobra.Command, args []string) {
if len(args) > 0 {
ExitWithError(ExitBadArgs, errors.New("watcher does not take any argument"))
}
ctx := context.Background()
for round := 0; round < rounds || rounds <= 0; round++ {
fmt.Println("round", round)
performWatchOnPrefixes(ctx, cmd, round)
}
}
func performWatchOnPrefixes(ctx context.Context, cmd *cobra.Command, round int) {
keyPerPrefix := totalKeys / noOfPrefixes
prefixes := stringutil.UniqueStrings(5, noOfPrefixes)
keys := stringutil.RandomStrings(10, keyPerPrefix)
roundPrefix := fmt.Sprintf("%16x", round)
eps := endpointsFromFlag(cmd)
var (
revision int64
wg sync.WaitGroup
gr *clientv3.GetResponse
err error
)
client := newClient(eps, dialTimeout)
defer client.Close()
gr, err = getKey(ctx, client, "non-existent")
if err != nil {
log.Fatalf("failed to get the initial revision: %v", err)
}
revision = gr.Header.Revision
ctxt, cancel := context.WithDeadline(ctx, time.Now().Add(runningTime))
defer cancel()
// generate and put keys in cluster
limiter := rate.NewLimiter(rate.Limit(reqRate), reqRate)
go func() {
for _, key := range keys {
for _, prefix := range prefixes {
if err = limiter.Wait(ctxt); err != nil {
return
}
if err = putKeyAtMostOnce(ctxt, client, watchPrefix+"-"+roundPrefix+"-"+prefix+"-"+key); err != nil {
log.Fatalf("failed to put key: %v", err)
return
}
}
}
}()
ctxc, cancelc := context.WithCancel(ctx)
wcs := make([]clientv3.WatchChan, 0)
rcs := make([]*clientv3.Client, 0)
for _, prefix := range prefixes {
for j := 0; j < watchPerPrefix; j++ {
rc := newClient(eps, dialTimeout)
rcs = append(rcs, rc)
wprefix := watchPrefix + "-" + roundPrefix + "-" + prefix
wc := rc.Watch(ctxc, wprefix, clientv3.WithPrefix(), clientv3.WithRev(revision))
wcs = append(wcs, wc)
wg.Add(1)
go func() {
defer wg.Done()
checkWatchResponse(wc, wprefix, keys)
}()
}
}
wg.Wait()
cancelc()
// verify all watch channels are closed
for e, wc := range wcs {
if _, ok := <-wc; ok {
log.Fatalf("expected wc to be closed, but received %v", e)
}
}
for _, rc := range rcs {
rc.Close()
}
if err = deletePrefix(ctx, client, watchPrefix); err != nil {
log.Fatalf("failed to clean up keys after test: %v", err)
}
}
func checkWatchResponse(wc clientv3.WatchChan, prefix string, keys []string) {
for n := 0; n < len(keys); {
wr, more := <-wc
if !more {
log.Fatalf("expect more keys (received %d/%d) for %s", n, len(keys), prefix)
}
for _, event := range wr.Events {
expectedKey := prefix + "-" + keys[n]
receivedKey := string(event.Kv.Key)
if expectedKey != receivedKey {
log.Fatalf("expected key %q, got %q for prefix : %q\n", expectedKey, receivedKey, prefix)
}
n++
}
}
}
func putKeyAtMostOnce(ctx context.Context, client *clientv3.Client, key string) error {
gr, err := getKey(ctx, client, key)
if err != nil {
return err
}
var modrev int64
if len(gr.Kvs) > 0 {
modrev = gr.Kvs[0].ModRevision
}
for ctx.Err() == nil {
_, err := client.Txn(ctx).If(clientv3.Compare(clientv3.ModRevision(key), "=", modrev)).Then(clientv3.OpPut(key, key)).Commit()
if err == nil {
return nil
}
}
return ctx.Err()
}
func deletePrefix(ctx context.Context, client *clientv3.Client, key string) error {
for ctx.Err() == nil {
if _, err := client.Delete(ctx, key, clientv3.WithPrefix()); err == nil {
return nil
}
}
return ctx.Err()
}
func getKey(ctx context.Context, client *clientv3.Client, key string) (*clientv3.GetResponse, error) {
for ctx.Err() == nil {
if gr, err := client.Get(ctx, key); err == nil {
return gr, nil
}
}
return nil, ctx.Err()
}
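
putKeyAtMostOnce relies on etcd's transactional compare-and-swap: the Put commits only if the key's ModRevision still matches what the caller last observed (0 for a key that has never been written). A minimal sketch of that guard on its own:

package main

import (
	"context"
	"log"
	"time"

	clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"127.0.0.1:2379"},
		DialTimeout: 2 * time.Second,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer cli.Close()

	ctx := context.Background()
	// ModRevision == 0 means "the key has never been written";
	// under concurrent writers this Put succeeds at most once.
	resp, err := cli.Txn(ctx).
		If(clientv3.Compare(clientv3.ModRevision("demo-key"), "=", 0)).
		Then(clientv3.OpPut("demo-key", "demo-value")).
		Commit()
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("txn succeeded: %v", resp.Succeeded)
}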

View File

@ -1,42 +0,0 @@
#!/usr/bin/env bash
<<COMMENT
# run 3 agents for 3-node local etcd cluster
./scripts/docker-local-agent.sh 1
./scripts/docker-local-agent.sh 2
./scripts/docker-local-agent.sh 3
COMMENT
if ! [[ "${0}" =~ "scripts/docker-local-agent.sh" ]]; then
echo "must be run from functional"
exit 255
fi
if [[ -z "${GO_VERSION}" ]]; then
GO_VERSION=1.14.3
fi
echo "Running with GO_VERSION:" ${GO_VERSION}
if [[ -z ${1} ]]; then
echo "Expected second argument: 1, 2, or 3"
exit 255
else
case ${1} in
1) ;;
2) ;;
3) ;;
*) echo "Expected second argument 1, 2, or 3, got" \"${1}\"
exit 255 ;;
esac
AGENT_NAME="agent-${1}"
AGENT_ADDR_FLAG="--network tcp --address 127.0.0.1:${1}9027"
fi
echo "AGENT_NAME:" ${AGENT_NAME}
echo "AGENT_ADDR_FLAG:" ${AGENT_ADDR_FLAG}
docker run \
--rm \
--net=host \
--name ${AGENT_NAME} \
gcr.io/etcd-development/etcd-functional:go${GO_VERSION} \
/bin/bash -c "./bin/etcd-agent ${AGENT_ADDR_FLAG}"

View File

@ -1,18 +0,0 @@
#!/usr/bin/env bash
if ! [[ "${0}" =~ "scripts/docker-local-tester.sh" ]]; then
echo "must be run from functional"
exit 255
fi
if [[ -z "${GO_VERSION}" ]]; then
GO_VERSION=1.14.3
fi
echo "Running with GO_VERSION:" ${GO_VERSION}
docker run \
--rm \
--net=host \
--name tester \
gcr.io/etcd-development/etcd-functional:go${GO_VERSION} \
/bin/bash -c "./bin/etcd-tester --config ./functional.yaml"

View File

@ -1,312 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"fmt"
"math/rand"
"time"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
)
// Case defines failure/test injection interface.
// To add a test case:
// 1. implement "Case" interface
// 2. define fail case name in "rpcpb.Case"
type Case interface {
// Inject injects the failure into the testing cluster at the given
// round. When the function is called, the cluster should be healthy.
Inject(clus *Cluster) error
// Recover recovers the injected failure caused by the injection of the
// given round and waits for the recovery of the testing cluster.
Recover(clus *Cluster) error
// Desc returns a description of the failure
Desc() string
// TestCase returns "rpcpb.Case" enum type.
TestCase() rpcpb.Case
}
type injectMemberFunc func(*Cluster, int) error
type recoverMemberFunc func(*Cluster, int) error
type caseByFunc struct {
desc string
rpcpbCase rpcpb.Case
injectMember injectMemberFunc
recoverMember recoverMemberFunc
}
func (c *caseByFunc) Desc() string {
if c.desc != "" {
return c.desc
}
return c.rpcpbCase.String()
}
func (c *caseByFunc) TestCase() rpcpb.Case {
return c.rpcpbCase
}
type caseFollower struct {
caseByFunc
last int
lead int
}
func (c *caseFollower) updateIndex(clus *Cluster) error {
lead, err := clus.GetLeader()
if err != nil {
return err
}
c.lead = lead
n := len(clus.Members)
if c.last == -1 { // first run
c.last = clus.rd % n
if c.last == c.lead {
c.last = (c.last + 1) % n
}
} else {
c.last = (c.last + 1) % n
if c.last == c.lead {
c.last = (c.last + 1) % n
}
}
return nil
}
func (c *caseFollower) Inject(clus *Cluster) error {
if err := c.updateIndex(clus); err != nil {
return err
}
return c.injectMember(clus, c.last)
}
func (c *caseFollower) Recover(clus *Cluster) error {
return c.recoverMember(clus, c.last)
}
type caseLeader struct {
caseByFunc
last int
lead int
}
func (c *caseLeader) updateIndex(clus *Cluster) error {
lead, err := clus.GetLeader()
if err != nil {
return err
}
c.lead = lead
c.last = lead
return nil
}
func (c *caseLeader) Inject(clus *Cluster) error {
if err := c.updateIndex(clus); err != nil {
return err
}
return c.injectMember(clus, c.last)
}
func (c *caseLeader) Recover(clus *Cluster) error {
return c.recoverMember(clus, c.last)
}
type caseQuorum struct {
caseByFunc
injected map[int]struct{}
}
func (c *caseQuorum) Inject(clus *Cluster) error {
c.injected = pickQuorum(len(clus.Members))
for idx := range c.injected {
if err := c.injectMember(clus, idx); err != nil {
return err
}
}
return nil
}
func (c *caseQuorum) Recover(clus *Cluster) error {
for idx := range c.injected {
if err := c.recoverMember(clus, idx); err != nil {
return err
}
}
return nil
}
func pickQuorum(size int) (picked map[int]struct{}) {
picked = make(map[int]struct{})
r := rand.New(rand.NewSource(time.Now().UnixNano()))
quorum := size/2 + 1
for len(picked) < quorum {
idx := r.Intn(size)
picked[idx] = struct{}{}
}
return picked
}
type caseAll caseByFunc
func (c *caseAll) Inject(clus *Cluster) error {
for i := range clus.Members {
if err := c.injectMember(clus, i); err != nil {
return err
}
}
return nil
}
func (c *caseAll) Recover(clus *Cluster) error {
for i := range clus.Members {
if err := c.recoverMember(clus, i); err != nil {
return err
}
}
return nil
}
func (c *caseAll) Desc() string {
if c.desc != "" {
return c.desc
}
return c.rpcpbCase.String()
}
func (c *caseAll) TestCase() rpcpb.Case {
return c.rpcpbCase
}
// caseUntilSnapshot injects a failure/test and waits for a snapshot event
type caseUntilSnapshot struct {
desc string
rpcpbCase rpcpb.Case
Case
}
// slowCases holds all delay failure cases except those whose injected
// latency exceeds the election timeout (those trigger a leader election,
// and the cluster keeps operating anyway)
var slowCases = map[rpcpb.Case]struct{}{
rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER: {},
rpcpb.Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT: {},
rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT: {},
rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER: {},
rpcpb.Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT: {},
rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT: {},
rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM: {},
rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ALL: {},
}
func (c *caseUntilSnapshot) Inject(clus *Cluster) error {
if err := c.Case.Inject(clus); err != nil {
return err
}
snapshotCount := clus.Members[0].Etcd.SnapshotCount
now := time.Now()
clus.lg.Info(
"trigger snapshot START",
zap.String("desc", c.Desc()),
zap.Int64("etcd-snapshot-count", snapshotCount),
)
// maxRev may fail right after the failure is injected; retry on error.
startRev, err := clus.maxRev()
for i := 0; i < 10 && startRev == 0; i++ {
startRev, err = clus.maxRev()
}
if startRev == 0 {
return err
}
lastRev := startRev
// a healthy cluster can accept at least 1000 req/sec; allow 3x that
// time for a snapshot to trigger (e.g. snapshotCount=10000 gives
// 30 retries here, or 150 for slow cases below).
retries := int(snapshotCount) / 1000 * 3
if _, ok := slowCases[c.TestCase()]; ok {
// slow network takes more retries
retries *= 5
}
for i := 0; i < retries; i++ {
lastRev, err = clus.maxRev()
if lastRev == 0 {
clus.lg.Info(
"trigger snapshot RETRY",
zap.Int("retries", i),
zap.Int64("etcd-snapshot-count", snapshotCount),
zap.Int64("start-revision", startRev),
zap.Error(err),
)
time.Sleep(3 * time.Second)
continue
}
// If the number of proposals committed is bigger than snapshot count,
// a new snapshot should have been created.
diff := lastRev - startRev
if diff > snapshotCount {
clus.lg.Info(
"trigger snapshot PASS",
zap.Int("retries", i),
zap.String("desc", c.Desc()),
zap.Int64("committed-entries", diff),
zap.Int64("etcd-snapshot-count", snapshotCount),
zap.Int64("start-revision", startRev),
zap.Int64("last-revision", lastRev),
zap.Duration("took", time.Since(now)),
)
return nil
}
clus.lg.Info(
"trigger snapshot RETRY",
zap.Int("retries", i),
zap.Int64("committed-entries", diff),
zap.Int64("etcd-snapshot-count", snapshotCount),
zap.Int64("start-revision", startRev),
zap.Int64("last-revision", lastRev),
zap.Duration("took", time.Since(now)),
zap.Error(err),
)
time.Sleep(time.Second)
if err != nil {
time.Sleep(2 * time.Second)
}
}
return fmt.Errorf("cluster too slow: only %d commits in %d retries", lastRev-startRev, retries)
}
func (c *caseUntilSnapshot) Desc() string {
if c.desc != "" {
return c.desc
}
if c.rpcpbCase.String() != "" {
return c.rpcpbCase.String()
}
return c.Case.Desc()
}
func (c *caseUntilSnapshot) TestCase() rpcpb.Case {
return c.rpcpbCase
}
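
As the interface comment above says, a new test case only needs to implement Case. A minimal, self-contained sketch of the shape, with stub Cluster and CaseID types standing in for the real tester and rpcpb definitions:

package main

import "fmt"

// stubs standing in for tester.Cluster and rpcpb.Case
type Cluster struct{ Members []string }
type CaseID int

type Case interface {
	Inject(clus *Cluster) error
	Recover(clus *Cluster) error
	Desc() string
	TestCase() CaseID
}

// killFirstMember is an example failure: stop one member, then restart it.
type killFirstMember struct{ id CaseID }

func (c *killFirstMember) Inject(clus *Cluster) error {
	fmt.Printf("stopping member %s\n", clus.Members[0])
	return nil // would send a SIGTERM-style operation here
}

func (c *killFirstMember) Recover(clus *Cluster) error {
	fmt.Printf("restarting member %s\n", clus.Members[0])
	return nil // would send a RESTART-style operation here
}

func (c *killFirstMember) Desc() string     { return "kill first member" }
func (c *killFirstMember) TestCase() CaseID { return c.id }

func main() {
	clus := &Cluster{Members: []string{"m0", "m1", "m2"}}
	var cs Case = &killFirstMember{id: 1}
	if err := cs.Inject(clus); err == nil {
		_ = cs.Recover(clus)
	}
}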

View File

@ -1,41 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"time"
"go.uber.org/zap"
)
type caseDelay struct {
Case
delayDuration time.Duration
}
func (c *caseDelay) Inject(clus *Cluster) error {
if err := c.Case.Inject(clus); err != nil {
return err
}
if c.delayDuration > 0 {
clus.lg.Info(
"wait after inject",
zap.Duration("delay", c.delayDuration),
zap.String("desc", c.Case.Desc()),
)
time.Sleep(c.delayDuration)
}
return nil
}

View File

@ -1,53 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"fmt"
"os/exec"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
)
type caseExternal struct {
desc string
rpcpbCase rpcpb.Case
scriptPath string
}
func (c *caseExternal) Inject(clus *Cluster) error {
return exec.Command(c.scriptPath, "enable", fmt.Sprintf("%d", clus.rd)).Run()
}
func (c *caseExternal) Recover(clus *Cluster) error {
return exec.Command(c.scriptPath, "disable", fmt.Sprintf("%d", clus.rd)).Run()
}
func (c *caseExternal) Desc() string {
return c.desc
}
func (c *caseExternal) TestCase() rpcpb.Case {
return c.rpcpbCase
}
func new_Case_EXTERNAL(scriptPath string) Case {
return &caseExternal{
desc: fmt.Sprintf("external fault injector (script: %q)", scriptPath),
rpcpbCase: rpcpb.Case_EXTERNAL,
scriptPath: scriptPath,
}
}

View File

@ -1,207 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"fmt"
"io"
"net/http"
"strings"
"sync"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
)
type failpointStats struct {
mu sync.Mutex
// crashes counts the number of crashes for a failpoint
crashes map[string]int
}
var fpStats failpointStats
func failpointFailures(clus *Cluster) (ret []Case, err error) {
var fps []string
fps, err = failpointPaths(clus.Members[0].FailpointHTTPAddr)
if err != nil {
return nil, err
}
// create failure objects for all failpoints
for _, fp := range fps {
if len(fp) == 0 {
continue
}
fpFails := casesFromFailpoint(fp, clus.Tester.FailpointCommands)
// wrap in delays so failpoint has time to trigger
for i, fpf := range fpFails {
if strings.Contains(fp, "Snap") {
// hack to trigger snapshot failpoints
fpFails[i] = &caseUntilSnapshot{
desc: fpf.Desc(),
rpcpbCase: rpcpb.Case_FAILPOINTS,
Case: fpf,
}
} else {
fpFails[i] = &caseDelay{
Case: fpf,
delayDuration: clus.GetCaseDelayDuration(),
}
}
}
ret = append(ret, fpFails...)
}
fpStats.crashes = make(map[string]int)
return ret, err
}
func failpointPaths(endpoint string) ([]string, error) {
resp, err := http.Get(endpoint)
if err != nil {
return nil, err
}
defer resp.Body.Close()
body, rerr := io.ReadAll(resp.Body)
if rerr != nil {
return nil, rerr
}
var fps []string
for _, l := range strings.Split(string(body), "\n") {
fp := strings.Split(l, "=")[0]
fps = append(fps, fp)
}
return fps, nil
}
// failpoint commands follow the FreeBSD FAIL_POINT syntax,
// e.g. panic("etcd-tester"),1*sleep(1000)->panic("etcd-tester")
func casesFromFailpoint(fp string, failpointCommands []string) (fs []Case) {
recov := makeRecoverFailpoint(fp)
for _, fcmd := range failpointCommands {
inject := makeInjectFailpoint(fp, fcmd)
fs = append(fs, []Case{
&caseFollower{
caseByFunc: caseByFunc{
desc: fmt.Sprintf("failpoint %q (one: %q)", fp, fcmd),
rpcpbCase: rpcpb.Case_FAILPOINTS,
injectMember: inject,
recoverMember: recov,
},
last: -1,
lead: -1,
},
&caseLeader{
caseByFunc: caseByFunc{
desc: fmt.Sprintf("failpoint %q (leader: %q)", fp, fcmd),
rpcpbCase: rpcpb.Case_FAILPOINTS,
injectMember: inject,
recoverMember: recov,
},
last: -1,
lead: -1,
},
&caseQuorum{
caseByFunc: caseByFunc{
desc: fmt.Sprintf("failpoint %q (quorum: %q)", fp, fcmd),
rpcpbCase: rpcpb.Case_FAILPOINTS,
injectMember: inject,
recoverMember: recov,
},
injected: make(map[int]struct{}),
},
&caseAll{
desc: fmt.Sprintf("failpoint %q (all: %q)", fp, fcmd),
rpcpbCase: rpcpb.Case_FAILPOINTS,
injectMember: inject,
recoverMember: recov,
},
}...)
}
return fs
}
func makeInjectFailpoint(fp, val string) injectMemberFunc {
return func(clus *Cluster, idx int) (err error) {
// Add the failpoint into the member's list of failpoints so that if the member is restarted, the
// failpoint state is persisted (via the GOFAIL_FAILPOINTS environment variable)
addFailpointToMemberList(clus.Members[idx], idx, fp)
// Enable the failpoint
return putFailpoint(clus.Members[idx].FailpointHTTPAddr, fp, val)
}
}
func makeRecoverFailpoint(fp string) recoverMemberFunc {
return func(clus *Cluster, idx int) error {
// Remove the failpoint from the member's list of failpoints.
removeFailpointFromMemberList(clus.Members[idx], idx, fp)
// Disable the failpoint
if err := delFailpoint(clus.Members[idx].FailpointHTTPAddr, fp); err == nil {
return nil
}
// node not responding, likely dead from fp panic; restart
fpStats.mu.Lock()
fpStats.crashes[fp]++
fpStats.mu.Unlock()
return recover_SIGTERM_ETCD(clus, idx)
}
}
func addFailpointToMemberList(member *rpcpb.Member, idx int, fp string) {
failpoints := strings.Split(member.Failpoints, ";")
failpoints = append(failpoints, fp)
member.Failpoints = strings.Join(failpoints, ";")
}
func removeFailpointFromMemberList(member *rpcpb.Member, idx int, fp string) {
failpoints := strings.Split(member.Failpoints, ";")
for i, f := range failpoints {
if f == fp {
failpoints = append(failpoints[:i], failpoints[i+1:]...)
break
}
}
member.Failpoints = strings.Join(failpoints, ";")
}
func putFailpoint(ep, fp, val string) error {
req, _ := http.NewRequest(http.MethodPut, ep+"/"+fp, strings.NewReader(val))
c := http.Client{}
resp, err := c.Do(req)
if err != nil {
return err
}
resp.Body.Close()
if resp.StatusCode/100 != 2 {
return fmt.Errorf("failed to PUT %s=%s at %s (%v)", fp, val, ep, resp.Status)
}
return nil
}
func delFailpoint(ep, fp string) error {
req, _ := http.NewRequest(http.MethodDelete, ep+"/"+fp, strings.NewReader(""))
c := http.Client{}
resp, err := c.Do(req)
if err != nil {
return err
}
resp.Body.Close()
if resp.StatusCode/100 != 2 {
return fmt.Errorf("failed to DELETE %s at %s (%v)", fp, ep, resp.Status)
}
return nil
}
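
putFailpoint and delFailpoint drive gofail's HTTP endpoint directly; the same toggle takes only a few lines of plain net/http. A sketch, with a hypothetical endpoint address and the raftAfterSave failpoint named elsewhere in this package:

package main

import (
	"fmt"
	"log"
	"net/http"
	"strings"
)

func setFailpoint(ep, fp, val string) error {
	// PUT <endpoint>/<failpoint> with the FAIL_POINT term as the body
	req, err := http.NewRequest(http.MethodPut, ep+"/"+fp, strings.NewReader(val))
	if err != nil {
		return err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode/100 != 2 {
		return fmt.Errorf("failed to PUT %s=%s at %s (%v)", fp, val, ep, resp.Status)
	}
	return nil
}

func main() {
	// hypothetical gofail endpoint and failpoint term
	if err := setFailpoint("http://127.0.0.1:2381", "raftAfterSave", `panic("etcd-tester")`); err != nil {
		log.Fatal(err)
	}
}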

View File

@ -1,71 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"fmt"
"strings"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
)
const (
diskIOFailpoint = "raftAfterSave"
)
func failpointDiskIOFailures(clus *Cluster) (ret []Case, err error) {
fps, err := failpointPaths(clus.Members[0].FailpointHTTPAddr)
if err != nil {
return nil, err
}
var detailDiskIOLatencyFailpointPath string
for i := 0; i < len(fps); i++ {
if strings.HasSuffix(fps[i], diskIOFailpoint) {
detailDiskIOLatencyFailpointPath = fps[i]
break
}
}
// create failure objects for diskIOFailpoint
fpFails := casesFromDiskIOFailpoint(detailDiskIOLatencyFailpointPath, clus.Tester.FailpointCommands)
// wrap in delays so failpoint has time to trigger
for i, fpf := range fpFails {
fpFails[i] = &caseDelay{
Case: fpf,
delayDuration: clus.GetCaseDelayDuration(),
}
}
ret = append(ret, fpFails...)
return ret, nil
}
func casesFromDiskIOFailpoint(fp string, failpointCommands []string) (fs []Case) {
recov := makeRecoverFailpoint(fp)
for _, fcmd := range failpointCommands {
inject := makeInjectFailpoint(fp, fcmd)
fs = append(fs, []Case{
&caseLeader{
caseByFunc: caseByFunc{
desc: fmt.Sprintf("failpoint %q (leader: %q)", fp, fcmd),
rpcpbCase: rpcpb.Case_FAILPOINTS,
injectMember: inject,
recoverMember: recov,
},
last: -1,
lead: -1,
},
}...)
}
return fs
}

View File

@ -1,104 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import "go.etcd.io/etcd/tests/v3/functional/rpcpb"
func inject_BLACKHOLE_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
return clus.sendOp(idx, rpcpb.Operation_BLACKHOLE_PEER_PORT_TX_RX)
}
func recover_BLACKHOLE_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
return clus.sendOp(idx, rpcpb.Operation_UNBLACKHOLE_PEER_PORT_TX_RX)
}
func new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus *Cluster) Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
c := &caseFollower{cc, -1, -1}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT() Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
c := &caseFollower{cc, -1, -1}
return &caseUntilSnapshot{
rpcpbCase: rpcpb.Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
Case: c,
}
}
func new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER(clus *Cluster) Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
c := &caseLeader{cc, -1, -1}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT() Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
c := &caseLeader{cc, -1, -1}
return &caseUntilSnapshot{
rpcpbCase: rpcpb.Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT,
Case: c,
}
}
func new_Case_BLACKHOLE_PEER_PORT_TX_RX_QUORUM(clus *Cluster) Case {
c := &caseQuorum{
caseByFunc: caseByFunc{
rpcpbCase: rpcpb.Case_BLACKHOLE_PEER_PORT_TX_RX_QUORUM,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
},
injected: make(map[int]struct{}),
}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_BLACKHOLE_PEER_PORT_TX_RX_ALL(clus *Cluster) Case {
c := &caseAll{
rpcpbCase: rpcpb.Case_BLACKHOLE_PEER_PORT_TX_RX_ALL,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}

View File

@ -1,156 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"time"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
)
const (
// Wait longer when recovering from a slow network, because the network
// layer needs extra time to propagate the traffic control (tc command)
// change. Otherwise, we get hash values that differ from the previous
// revision. For more detail, please see https://github.com/etcd-io/etcd/issues/5121.
waitRecover = 5 * time.Second
)
func inject_DELAY_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
clus.lg.Info(
"injecting delay latency",
zap.Duration("latency", time.Duration(clus.Tester.UpdatedDelayLatencyMs)*time.Millisecond),
zap.Duration("latency-rv", time.Duration(clus.Tester.DelayLatencyMsRv)*time.Millisecond),
zap.String("endpoint", clus.Members[idx].EtcdClientEndpoint),
)
return clus.sendOp(idx, rpcpb.Operation_DELAY_PEER_PORT_TX_RX)
}
func recover_DELAY_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
err := clus.sendOp(idx, rpcpb.Operation_UNDELAY_PEER_PORT_TX_RX)
time.Sleep(waitRecover)
return err
}
func new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus *Cluster, random bool) Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
cc.rpcpbCase = rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
}
c := &caseFollower{cc, -1, -1}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster, random bool) Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
cc.rpcpbCase = rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
}
c := &caseFollower{cc, -1, -1}
return &caseUntilSnapshot{
rpcpbCase: cc.rpcpbCase,
Case: c,
}
}
func new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus *Cluster, random bool) Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_DELAY_PEER_PORT_TX_RX_LEADER,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
cc.rpcpbCase = rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER
}
c := &caseLeader{cc, -1, -1}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster, random bool) Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
cc.rpcpbCase = rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
}
c := &caseLeader{cc, -1, -1}
return &caseUntilSnapshot{
rpcpbCase: cc.rpcpbCase,
Case: c,
}
}
func new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus *Cluster, random bool) Case {
c := &caseQuorum{
caseByFunc: caseByFunc{
rpcpbCase: rpcpb.Case_DELAY_PEER_PORT_TX_RX_QUORUM,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
},
injected: make(map[int]struct{}),
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
c.rpcpbCase = rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM
}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus *Cluster, random bool) Case {
c := &caseAll{
rpcpbCase: rpcpb.Case_DELAY_PEER_PORT_TX_RX_ALL,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
c.rpcpbCase = rpcpb.Case_RANDOM_DELAY_PEER_PORT_TX_RX_ALL
}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}

View File

@ -1,99 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"time"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
)
type caseNoFailWithStress caseByFunc
func (c *caseNoFailWithStress) Inject(clus *Cluster) error {
return nil
}
func (c *caseNoFailWithStress) Recover(clus *Cluster) error {
return nil
}
func (c *caseNoFailWithStress) Desc() string {
if c.desc != "" {
return c.desc
}
return c.rpcpbCase.String()
}
func (c *caseNoFailWithStress) TestCase() rpcpb.Case {
return c.rpcpbCase
}
func new_Case_NO_FAIL_WITH_STRESS(clus *Cluster) Case {
c := &caseNoFailWithStress{
rpcpbCase: rpcpb.Case_NO_FAIL_WITH_STRESS,
}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
type caseNoFailWithNoStressForLiveness caseByFunc
func (c *caseNoFailWithNoStressForLiveness) Inject(clus *Cluster) error {
clus.lg.Info(
"extra delay for liveness mode with no stresser",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.String("desc", c.Desc()),
)
time.Sleep(clus.GetCaseDelayDuration())
clus.lg.Info(
"wait health in liveness mode",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.String("desc", c.Desc()),
)
return clus.WaitHealth()
}
func (c *caseNoFailWithNoStressForLiveness) Recover(clus *Cluster) error {
return nil
}
func (c *caseNoFailWithNoStressForLiveness) Desc() string {
if c.desc != "" {
return c.desc
}
return c.rpcpbCase.String()
}
func (c *caseNoFailWithNoStressForLiveness) TestCase() rpcpb.Case {
return c.rpcpbCase
}
func new_Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS(clus *Cluster) Case {
c := &caseNoFailWithNoStressForLiveness{
rpcpbCase: rpcpb.Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS,
}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}

View File

@ -1,229 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"context"
"fmt"
"sort"
"strings"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
)
func inject_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error {
cli1, err := clus.Members[idx1].CreateEtcdClient()
if err != nil {
return err
}
defer cli1.Close()
var mresp *clientv3.MemberListResponse
mresp, err = cli1.MemberList(context.Background())
var mss []string
if err == nil && mresp != nil {
mss = describeMembers(mresp)
}
clus.lg.Info(
"member list before disastrous machine failure",
zap.String("request-to", clus.Members[idx1].EtcdClientEndpoint),
zap.Strings("members", mss),
zap.Error(err),
)
if err != nil {
return err
}
sresp, serr := cli1.Status(context.Background(), clus.Members[idx1].EtcdClientEndpoint)
if serr != nil {
return serr
}
id1 := sresp.Header.MemberId
is1 := fmt.Sprintf("%016x", id1)
clus.lg.Info(
"disastrous machine failure START",
zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
zap.String("target-member-id", is1),
zap.Error(err),
)
err = clus.sendOp(idx1, rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA)
clus.lg.Info(
"disastrous machine failure END",
zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
zap.String("target-member-id", is1),
zap.Error(err),
)
if err != nil {
return err
}
time.Sleep(2 * time.Second)
idx2 := (idx1 + 1) % len(clus.Members)
var cli2 *clientv3.Client
cli2, err = clus.Members[idx2].CreateEtcdClient()
if err != nil {
return err
}
defer cli2.Close()
// FIXME(bug): this may block forever during
// "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT"
// is the new leader too busy with snapshotting?
// is raft proposal dropped?
// enable client keepalive for failover?
clus.lg.Info(
"member remove after disaster START",
zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
zap.String("target-member-id", is1),
zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
)
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
_, err = cli2.MemberRemove(ctx, id1)
cancel()
clus.lg.Info(
"member remove after disaster END",
zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
zap.String("target-member-id", is1),
zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
zap.Error(err),
)
if err != nil {
return err
}
time.Sleep(2 * time.Second)
mresp, err = cli2.MemberList(context.Background())
mss = []string{}
if err == nil && mresp != nil {
mss = describeMembers(mresp)
}
clus.lg.Info(
"member list after member remove",
zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
zap.Strings("members", mss),
zap.Error(err),
)
return err
}
func recover_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error {
idx2 := (idx1 + 1) % len(clus.Members)
cli2, err := clus.Members[idx2].CreateEtcdClient()
if err != nil {
return err
}
defer cli2.Close()
_, err = cli2.MemberAdd(context.Background(), clus.Members[idx1].Etcd.AdvertisePeerURLs)
clus.lg.Info(
"member add before fresh restart",
zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
zap.Error(err),
)
if err != nil {
return err
}
time.Sleep(2 * time.Second)
clus.Members[idx1].Etcd.InitialClusterState = "existing"
err = clus.sendOp(idx1, rpcpb.Operation_RESTART_ETCD)
clus.lg.Info(
"fresh restart after member add",
zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
zap.Error(err),
)
if err != nil {
return err
}
time.Sleep(2 * time.Second)
var mresp *clientv3.MemberListResponse
mresp, err = cli2.MemberList(context.Background())
var mss []string
if err == nil && mresp != nil {
mss = describeMembers(mresp)
}
clus.lg.Info(
"member list after member add",
zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
zap.Strings("members", mss),
zap.Error(err),
)
return err
}
func new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus *Cluster) Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER,
injectMember: inject_SIGQUIT_ETCD_AND_REMOVE_DATA,
recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA,
}
c := &caseFollower{cc, -1, -1}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case {
return &caseUntilSnapshot{
rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
Case: new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus),
}
}
func new_Case_SIGQUIT_AND_REMOVE_LEADER(clus *Cluster) Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_LEADER,
injectMember: inject_SIGQUIT_ETCD_AND_REMOVE_DATA,
recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA,
}
c := &caseLeader{cc, -1, -1}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case {
return &caseUntilSnapshot{
rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT,
Case: new_Case_SIGQUIT_AND_REMOVE_LEADER(clus),
}
}
func describeMembers(mresp *clientv3.MemberListResponse) (ss []string) {
ss = make([]string, len(mresp.Members))
for i, m := range mresp.Members {
ss[i] = fmt.Sprintf("Name %s / ID %016x / ClientURLs %s / PeerURLs %s",
m.Name,
m.ID,
strings.Join(m.ClientURLs, ","),
strings.Join(m.PeerURLs, ","),
)
}
sort.Strings(ss)
return ss
}
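
Together, the inject/recover pair implements the standard dead-member replacement flow: remove the dead member through a surviving one, MemberAdd its peer URLs back, then restart the process with initial-cluster-state set to "existing". Condensed to the client calls involved (a sketch, assuming one healthy member is reachable and with placeholder URLs):

package main

import (
	"context"
	"log"
	"time"

	clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"127.0.0.1:2379"}, // a surviving, healthy member
		DialTimeout: 2 * time.Second,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer cli.Close()

	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	// 1. find the dead member's ID in the current membership
	mresp, err := cli.MemberList(ctx)
	if err != nil {
		log.Fatal(err)
	}
	deadID := mresp.Members[0].ID // placeholder: pick the failed member

	// 2. remove it, then re-add its peer URLs before restarting the process
	if _, err := cli.MemberRemove(ctx, deadID); err != nil {
		log.Fatal(err)
	}
	if _, err := cli.MemberAdd(ctx, []string{"http://127.0.0.1:12380"}); err != nil {
		log.Fatal(err)
	}
	log.Println("member replaced; restart it with --initial-cluster-state=existing")
}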

View File

@ -1,275 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"context"
"fmt"
"strings"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
)
type fetchSnapshotCaseQuorum struct {
desc string
rpcpbCase rpcpb.Case
injected map[int]struct{}
snapshotted int
}
func (c *fetchSnapshotCaseQuorum) Inject(clus *Cluster) error {
// 1. Assume node C is the current leader with the most up-to-date data.
lead, err := clus.GetLeader()
if err != nil {
return err
}
c.snapshotted = lead
// 2. Download a snapshot from node C before destroying nodes A and B.
clus.lg.Info(
"save snapshot on leader node START",
zap.String("target-endpoint", clus.Members[lead].EtcdClientEndpoint),
)
var resp *rpcpb.Response
resp, err = clus.sendOpWithResp(lead, rpcpb.Operation_SAVE_SNAPSHOT)
if err != nil || resp == nil || !resp.Success {
clus.lg.Info(
"save snapshot on leader node FAIL",
zap.String("target-endpoint", clus.Members[lead].EtcdClientEndpoint),
zap.Error(err),
)
return err
}
clus.lg.Info(
"save snapshot on leader node SUCCESS",
zap.String("target-endpoint", clus.Members[lead].EtcdClientEndpoint),
zap.String("member-name", resp.SnapshotInfo.MemberName),
zap.Strings("member-client-urls", resp.SnapshotInfo.MemberClientURLs),
zap.String("snapshot-path", resp.SnapshotInfo.SnapshotPath),
zap.String("snapshot-file-size", resp.SnapshotInfo.SnapshotFileSize),
zap.String("snapshot-total-size", resp.SnapshotInfo.SnapshotTotalSize),
zap.Int64("snapshot-total-key", resp.SnapshotInfo.SnapshotTotalKey),
zap.Int64("snapshot-hash", resp.SnapshotInfo.SnapshotHash),
zap.Int64("snapshot-revision", resp.SnapshotInfo.SnapshotRevision),
zap.String("took", resp.SnapshotInfo.Took),
zap.Error(err),
)
if err != nil {
return err
}
clus.Members[lead].SnapshotInfo = resp.SnapshotInfo
leaderc, err := clus.Members[lead].CreateEtcdClient()
if err != nil {
return err
}
defer leaderc.Close()
var mresp *clientv3.MemberListResponse
mresp, err = leaderc.MemberList(context.Background())
var mss []string
if err == nil && mresp != nil {
mss = describeMembers(mresp)
}
clus.lg.Info(
"member list before disastrous machine failure",
zap.String("request-to", clus.Members[lead].EtcdClientEndpoint),
zap.Strings("members", mss),
zap.Error(err),
)
if err != nil {
return err
}
// simulate real life; machine failures may happen
// some time after the last snapshot save
time.Sleep(time.Second)
// 3. Destroy nodes A and B, making the whole cluster inoperable.
for {
c.injected = pickQuorum(len(clus.Members))
if _, ok := c.injected[lead]; !ok {
break
}
}
for idx := range c.injected {
clus.lg.Info(
"disastrous machine failure to quorum START",
zap.String("target-endpoint", clus.Members[idx].EtcdClientEndpoint),
)
err = clus.sendOp(idx, rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA)
clus.lg.Info(
"disastrous machine failure to quorum END",
zap.String("target-endpoint", clus.Members[idx].EtcdClientEndpoint),
zap.Error(err),
)
if err != nil {
return err
}
}
// 4. Now node C cannot operate either.
// 5. SIGTERM node C and remove its data directories.
clus.lg.Info(
"disastrous machine failure to old leader START",
zap.String("target-endpoint", clus.Members[lead].EtcdClientEndpoint),
)
err = clus.sendOp(lead, rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA)
clus.lg.Info(
"disastrous machine failure to old leader END",
zap.String("target-endpoint", clus.Members[lead].EtcdClientEndpoint),
zap.Error(err),
)
return err
}
func (c *fetchSnapshotCaseQuorum) Recover(clus *Cluster) error {
// 6. Restore a new seed member from node C's latest snapshot file.
oldlead := c.snapshotted
// configuration on restart from recovered snapshot
// seed member's configuration is all the same as previous one
// except initial cluster string is now a single-node cluster
clus.Members[oldlead].EtcdOnSnapshotRestore = clus.Members[oldlead].Etcd
clus.Members[oldlead].EtcdOnSnapshotRestore.InitialClusterState = "existing"
name := clus.Members[oldlead].Etcd.Name
var initClus []string
for _, u := range clus.Members[oldlead].Etcd.AdvertisePeerURLs {
initClus = append(initClus, fmt.Sprintf("%s=%s", name, u))
}
clus.Members[oldlead].EtcdOnSnapshotRestore.InitialCluster = strings.Join(initClus, ",")
clus.lg.Info(
"restore snapshot and restart from snapshot request START",
zap.String("target-endpoint", clus.Members[oldlead].EtcdClientEndpoint),
zap.Strings("initial-cluster", initClus),
)
err := clus.sendOp(oldlead, rpcpb.Operation_RESTORE_RESTART_FROM_SNAPSHOT)
clus.lg.Info(
"restore snapshot and restart from snapshot request END",
zap.String("target-endpoint", clus.Members[oldlead].EtcdClientEndpoint),
zap.Strings("initial-cluster", initClus),
zap.Error(err),
)
if err != nil {
return err
}
leaderc, err := clus.Members[oldlead].CreateEtcdClient()
if err != nil {
return err
}
defer leaderc.Close()
// 7. Add another member to establish 2-node cluster.
// 8. Add another member to establish 3-node cluster.
// 9. Add more if any.
idxs := make([]int, 0, len(c.injected))
for idx := range c.injected {
idxs = append(idxs, idx)
}
clus.lg.Info("member add START", zap.Int("members-to-add", len(idxs)))
for i, idx := range idxs {
clus.lg.Info(
"member add request SENT",
zap.String("target-endpoint", clus.Members[idx].EtcdClientEndpoint),
zap.Strings("peer-urls", clus.Members[idx].Etcd.AdvertisePeerURLs),
)
ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
_, err := leaderc.MemberAdd(ctx, clus.Members[idx].Etcd.AdvertisePeerURLs)
cancel()
clus.lg.Info(
"member add request DONE",
zap.String("target-endpoint", clus.Members[idx].EtcdClientEndpoint),
zap.Strings("peer-urls", clus.Members[idx].Etcd.AdvertisePeerURLs),
zap.Error(err),
)
if err != nil {
return err
}
// start the added (new) member with fresh data
clus.Members[idx].EtcdOnSnapshotRestore = clus.Members[idx].Etcd
clus.Members[idx].EtcdOnSnapshotRestore.InitialClusterState = "existing"
name := clus.Members[idx].Etcd.Name
for _, u := range clus.Members[idx].Etcd.AdvertisePeerURLs {
initClus = append(initClus, fmt.Sprintf("%s=%s", name, u))
}
clus.Members[idx].EtcdOnSnapshotRestore.InitialCluster = strings.Join(initClus, ",")
clus.lg.Info(
"restart from snapshot request SENT",
zap.String("target-endpoint", clus.Members[idx].EtcdClientEndpoint),
zap.Strings("initial-cluster", initClus),
)
err = clus.sendOp(idx, rpcpb.Operation_RESTART_FROM_SNAPSHOT)
clus.lg.Info(
"restart from snapshot request DONE",
zap.String("target-endpoint", clus.Members[idx].EtcdClientEndpoint),
zap.Strings("initial-cluster", initClus),
zap.Error(err),
)
if err != nil {
return err
}
if i != len(c.injected)-1 {
// wait until membership reconfiguration entry gets applied
// TODO: test concurrent member add
dur := 5 * clus.Members[idx].ElectionTimeout()
clus.lg.Info(
"waiting after restart from snapshot request",
zap.Int("i", i),
zap.Int("idx", idx),
zap.Duration("sleep", dur),
)
time.Sleep(dur)
} else {
clus.lg.Info(
"restart from snapshot request ALL END",
zap.Int("i", i),
zap.Int("idx", idx),
)
}
}
return nil
}
func (c *fetchSnapshotCaseQuorum) Desc() string {
if c.desc != "" {
return c.desc
}
return c.rpcpbCase.String()
}
func (c *fetchSnapshotCaseQuorum) TestCase() rpcpb.Case {
return c.rpcpbCase
}
func new_Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH(clus *Cluster) Case {
c := &fetchSnapshotCaseQuorum{
rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH,
injected: make(map[int]struct{}),
snapshotted: -1,
}
// simulate real life; machine replacements may happen
// after some time since disaster
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}

View File

@ -1,92 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import "go.etcd.io/etcd/tests/v3/functional/rpcpb"
func inject_SIGTERM_ETCD(clus *Cluster, idx int) error {
return clus.sendOp(idx, rpcpb.Operation_SIGTERM_ETCD)
}
func recover_SIGTERM_ETCD(clus *Cluster, idx int) error {
return clus.sendOp(idx, rpcpb.Operation_RESTART_ETCD)
}
func new_Case_SIGTERM_ONE_FOLLOWER(clus *Cluster) Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_SIGTERM_ONE_FOLLOWER,
injectMember: inject_SIGTERM_ETCD,
recoverMember: recover_SIGTERM_ETCD,
}
c := &caseFollower{cc, -1, -1}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case {
return &caseUntilSnapshot{
rpcpbCase: rpcpb.Case_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
Case: new_Case_SIGTERM_ONE_FOLLOWER(clus),
}
}
func new_Case_SIGTERM_LEADER(clus *Cluster) Case {
cc := caseByFunc{
rpcpbCase: rpcpb.Case_SIGTERM_LEADER,
injectMember: inject_SIGTERM_ETCD,
recoverMember: recover_SIGTERM_ETCD,
}
c := &caseLeader{cc, -1, -1}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case {
return &caseUntilSnapshot{
rpcpbCase: rpcpb.Case_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT,
Case: new_Case_SIGTERM_LEADER(clus),
}
}
func new_Case_SIGTERM_QUORUM(clus *Cluster) Case {
c := &caseQuorum{
caseByFunc: caseByFunc{
rpcpbCase: rpcpb.Case_SIGTERM_QUORUM,
injectMember: inject_SIGTERM_ETCD,
recoverMember: recover_SIGTERM_ETCD,
},
injected: make(map[int]struct{}),
}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}
func new_Case_SIGTERM_ALL(clus *Cluster) Case {
c := &caseAll{
rpcpbCase: rpcpb.Case_SIGTERM_ALL,
injectMember: inject_SIGTERM_ETCD,
recoverMember: recover_SIGTERM_ETCD,
}
return &caseDelay{
Case: c,
delayDuration: clus.GetCaseDelayDuration(),
}
}

View File

@ -1,28 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import "go.etcd.io/etcd/tests/v3/functional/rpcpb"
// Checker checks cluster consistency.
type Checker interface {
// Type returns the checker type.
Type() rpcpb.Checker
// EtcdClientEndpoints returns the client endpoints of
// all checker target nodes.
EtcdClientEndpoints() []string
// Check returns an error if the system fails a consistency check.
Check() error
}
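Concrete implementations appear later in this diff: kvHashChecker compares revisions and hashes across members, leaseExpireChecker and shortTTLLeaseExpireChecker verify lease expiry, runnerChecker surfaces runner errors, and noCheck is the no-op. The cluster collects them in setStresserChecker and runs them after every case via runCheckers.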

View File

@ -1,89 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"fmt"
"time"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
)
const retries = 7
type kvHashChecker struct {
ctype rpcpb.Checker
clus *Cluster
}
func newKVHashChecker(clus *Cluster) Checker {
return &kvHashChecker{
ctype: rpcpb.Checker_KV_HASH,
clus: clus,
}
}
func (hc *kvHashChecker) checkRevAndHashes() (err error) {
var (
revs map[string]int64
hashes map[string]int64
)
// retry in case of transient failures or if the etcd cluster has not stabilized yet.
for i := 0; i < retries; i++ {
revs, hashes, err = hc.clus.getRevisionHash()
if err != nil {
hc.clus.lg.Warn(
"failed to get revision and hash",
zap.Int("retries", i),
zap.Error(err),
)
} else {
sameRev := getSameValue(revs)
sameHashes := getSameValue(hashes)
if sameRev && sameHashes {
return nil
}
hc.clus.lg.Warn(
"retrying; etcd cluster is not stable",
zap.Int("retries", i),
zap.Bool("same-revisions", sameRev),
zap.Bool("same-hashes", sameHashes),
zap.String("revisions", fmt.Sprintf("%+v", revs)),
zap.String("hashes", fmt.Sprintf("%+v", hashes)),
)
}
time.Sleep(time.Second)
}
if err != nil {
return fmt.Errorf("failed revision and hash check (%v)", err)
}
return fmt.Errorf("etcd cluster is not stable: [revisions: %v] and [hashes: %v]", revs, hashes)
}
func (hc *kvHashChecker) Type() rpcpb.Checker {
return hc.ctype
}
func (hc *kvHashChecker) EtcdClientEndpoints() []string {
return hc.clus.EtcdClientEndpoints()
}
func (hc *kvHashChecker) Check() error {
return hc.checkRevAndHashes()
}
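getSameValue is defined elsewhere in the tester package; a minimal sketch of the invariant it checks, assuming it simply verifies that every member reported the same value:
func getSameValue(vals map[string]int64) bool {
	// returns true when all values in the map are equal
	var rv int64
	for _, v := range vals {
		if rv == 0 {
			rv = v
		}
		if rv != v {
			return false
		}
	}
	return true
}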

View File

@ -1,239 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"context"
"fmt"
"time"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
"google.golang.org/grpc"
)
type leaseExpireChecker struct {
ctype rpcpb.Checker
lg *zap.Logger
m *rpcpb.Member
ls *leaseStresser
cli *clientv3.Client
}
func newLeaseExpireChecker(ls *leaseStresser) Checker {
return &leaseExpireChecker{
ctype: rpcpb.Checker_LEASE_EXPIRE,
lg: ls.lg,
m: ls.m,
ls: ls,
}
}
func (lc *leaseExpireChecker) Type() rpcpb.Checker {
return lc.ctype
}
func (lc *leaseExpireChecker) EtcdClientEndpoints() []string {
return []string{lc.m.EtcdClientEndpoint}
}
func (lc *leaseExpireChecker) Check() error {
if lc.ls == nil {
return nil
}
if lc.ls.revokedLeases == nil ||
lc.ls.aliveLeases == nil ||
lc.ls.shortLivedLeases == nil {
return nil
}
cli, err := lc.m.CreateEtcdClient(grpc.WithBackoffMaxDelay(time.Second))
if err != nil {
return fmt.Errorf("%v (%q)", err, lc.m.EtcdClientEndpoint)
}
defer func() {
if cli != nil {
cli.Close()
}
}()
lc.cli = cli
if err := check(lc.lg, lc.cli, true, lc.ls.revokedLeases.leases); err != nil {
return err
}
if err := check(lc.lg, lc.cli, false, lc.ls.aliveLeases.leases); err != nil {
return err
}
return lc.checkShortLivedLeases()
}
const leaseExpireCheckerTimeout = 10 * time.Second
// checkShortLivedLeases ensures leases expire.
func (lc *leaseExpireChecker) checkShortLivedLeases() error {
ctx, cancel := context.WithTimeout(context.Background(), leaseExpireCheckerTimeout)
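// errc is unbuffered: each goroutine blocks on send until the collection loop below receives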
errc := make(chan error)
defer cancel()
for leaseID := range lc.ls.shortLivedLeases.leases {
go func(id int64) {
errc <- lc.checkShortLivedLease(ctx, id)
}(leaseID)
}
var errs []error
for range lc.ls.shortLivedLeases.leases {
if err := <-errc; err != nil {
errs = append(errs, err)
}
}
return errsToError(errs)
}
func (lc *leaseExpireChecker) checkShortLivedLease(ctx context.Context, leaseID int64) (err error) {
// retry in case of a transient failure, or the lease expired but has not yet been revoked because the etcd cluster did not have enough time to delete it.
var resp *clientv3.LeaseTimeToLiveResponse
for i := 0; i < retries; i++ {
resp, err = getLeaseByID(ctx, lc.cli, leaseID)
// lease not found; for ~v3.1 compatibility, also check ErrLeaseNotFound
if (err == nil && resp.TTL == -1) || (err != nil && rpctypes.Error(err) == rpctypes.ErrLeaseNotFound) {
return nil
}
if err != nil {
lc.lg.Debug(
"retrying; Lease TimeToLive failed",
zap.Int("retries", i),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(err),
)
continue
}
if resp.TTL > 0 {
dur := time.Duration(resp.TTL) * time.Second
lc.lg.Debug(
"lease has not been expired, wait until expire",
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Int64("ttl", resp.TTL),
zap.Duration("wait-duration", dur),
)
time.Sleep(dur)
} else {
lc.lg.Debug(
"lease expired but not yet revoked",
zap.Int("retries", i),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Int64("ttl", resp.TTL),
zap.Duration("wait-duration", time.Second),
)
time.Sleep(time.Second)
}
if err = checkLease(ctx, lc.lg, lc.cli, false, leaseID); err != nil {
continue
}
return nil
}
return err
}
func checkLease(ctx context.Context, lg *zap.Logger, cli *clientv3.Client, expired bool, leaseID int64) error {
keysExpired, err := hasKeysAttachedToLeaseExpired(ctx, lg, cli, leaseID)
if err != nil {
lg.Warn(
"hasKeysAttachedToLeaseExpired failed",
zap.Any("endpoint", cli.Endpoints()),
zap.Error(err),
)
return err
}
leaseExpired, err := hasLeaseExpired(ctx, lg, cli, leaseID)
if err != nil {
lg.Warn(
"hasLeaseExpired failed",
zap.Any("endpoint", cli.Endpoints()),
zap.Error(err),
)
return err
}
if leaseExpired != keysExpired {
return fmt.Errorf("lease %v expiration mismatch (lease expired=%v, keys expired=%v)", leaseID, leaseExpired, keysExpired)
}
if leaseExpired != expired {
return fmt.Errorf("lease %v expected expired=%v, got %v", leaseID, expired, leaseExpired)
}
return nil
}
func check(lg *zap.Logger, cli *clientv3.Client, expired bool, leases map[int64]time.Time) error {
ctx, cancel := context.WithTimeout(context.Background(), leaseExpireCheckerTimeout)
defer cancel()
for leaseID := range leases {
if err := checkLease(ctx, lg, cli, expired, leaseID); err != nil {
return err
}
}
return nil
}
// TODO: handle failures from "grpc.WaitForReady(true)"
func getLeaseByID(ctx context.Context, cli *clientv3.Client, leaseID int64) (*clientv3.LeaseTimeToLiveResponse, error) {
return cli.TimeToLive(
ctx,
clientv3.LeaseID(leaseID),
clientv3.WithAttachedKeys(),
)
}
func hasLeaseExpired(ctx context.Context, lg *zap.Logger, cli *clientv3.Client, leaseID int64) (bool, error) {
// keep retrying until lease's state is known or ctx is being canceled
for ctx.Err() == nil {
resp, err := getLeaseByID(ctx, cli, leaseID)
if err != nil {
// for ~v3.1 compatibility
if rpctypes.Error(err) == rpctypes.ErrLeaseNotFound {
return true, nil
}
} else {
return resp.TTL == -1, nil
}
lg.Warn(
"hasLeaseExpired getLeaseByID failed",
zap.Any("endpoint", cli.Endpoints()),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(err),
)
}
return false, ctx.Err()
}
// The keys attached to a lease have the format "<leaseID>_<idx>", where idx is the order of key creation.
// Since the key format embeds the leaseID, fetching keys by the "<leaseID>" prefix
// determines whether the attached keys for a given leaseID have been deleted or not.
func hasKeysAttachedToLeaseExpired(ctx context.Context, lg *zap.Logger, cli *clientv3.Client, leaseID int64) (bool, error) {
resp, err := cli.Get(ctx, fmt.Sprintf("%d", leaseID), clientv3.WithPrefix())
if err != nil {
lg.Warn(
"hasKeysAttachedToLeaseExpired failed",
zap.Any("endpoint", cli.Endpoints()),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(err),
)
return false, err
}
return len(resp.Kvs) == 0, nil
}
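The writer side of this naming convention lives in the lease stresser, not in this file; a hypothetical sketch of how an attached key is created under the "<leaseID>_<idx>" scheme, reusing this file's imports:
// putLeaseKey is illustrative only: the key embeds the decimal leaseID,
// so the "<leaseID>" prefix Get above finds every attached key.
func putLeaseKey(ctx context.Context, cli *clientv3.Client, leaseID int64, idx int) error {
	key := fmt.Sprintf("%d_%d", leaseID, idx)
	_, err := cli.Put(ctx, key, "v", clientv3.WithLease(clientv3.LeaseID(leaseID)))
	return err
}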

View File

@ -1,24 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import "go.etcd.io/etcd/tests/v3/functional/rpcpb"
type noCheck struct{}
func newNoChecker() Checker { return &noCheck{} }
func (nc *noCheck) Type() rpcpb.Checker { return rpcpb.Checker_NO_CHECK }
func (nc *noCheck) EtcdClientEndpoints() []string { return nil }
func (nc *noCheck) Check() error { return nil }

View File

@ -1,48 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import "go.etcd.io/etcd/tests/v3/functional/rpcpb"
type runnerChecker struct {
ctype rpcpb.Checker
etcdClientEndpoint string
errc chan error
}
func newRunnerChecker(ep string, errc chan error) Checker {
return &runnerChecker{
ctype: rpcpb.Checker_RUNNER,
etcdClientEndpoint: ep,
errc: errc,
}
}
func (rc *runnerChecker) Type() rpcpb.Checker {
return rc.ctype
}
func (rc *runnerChecker) EtcdClientEndpoints() []string {
return []string{rc.etcdClientEndpoint}
}
func (rc *runnerChecker) Check() error {
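// non-blocking receive: report a queued runner error if any, otherwise pass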
select {
case err := <-rc.errc:
return err
default:
return nil
}
}

View File

@ -1,77 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"fmt"
"time"
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
"google.golang.org/grpc"
)
type shortTTLLeaseExpireChecker struct {
ctype rpcpb.Checker
lg *zap.Logger
m *rpcpb.Member
ls *leaseStresser
cli *clientv3.Client
}
func newShortTTLLeaseExpireChecker(ls *leaseStresser) Checker {
return &shortTTLLeaseExpireChecker{
ctype: rpcpb.Checker_SHORT_TTL_LEASE_EXPIRE,
lg: ls.lg,
m: ls.m,
ls: ls,
}
}
func (lc *shortTTLLeaseExpireChecker) Type() rpcpb.Checker {
return lc.ctype
}
func (lc *shortTTLLeaseExpireChecker) EtcdClientEndpoints() []string {
return []string{lc.m.EtcdClientEndpoint}
}
func (lc *shortTTLLeaseExpireChecker) Check() error {
if lc.ls == nil {
return nil
}
if lc.ls.alivedLeasesWithShortTTL == nil {
return nil
}
cli, err := lc.m.CreateEtcdClient(grpc.WithBackoffMaxDelay(time.Second))
if err != nil {
return fmt.Errorf("%v (%q)", err, lc.m.EtcdClientEndpoint)
}
defer func() {
if cli != nil {
cli.Close()
}
}()
lc.cli = cli
if err := check(lc.lg, lc.cli, false, lc.ls.alivedLeasesWithShortTTL.leases); err != nil {
lc.lg.Error("failed to check alivedLeasesWithShortTTL", zap.Error(err))
return err
}
lc.lg.Info("check alivedLeasesWithShortTTL succ", zap.Int("num", len(lc.ls.alivedLeasesWithShortTTL.leases)))
return nil
}

View File

@ -1,737 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"context"
"errors"
"fmt"
"io"
"log"
"math/rand"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"sync"
"time"
"go.etcd.io/etcd/client/pkg/v3/fileutil"
"go.etcd.io/etcd/pkg/v3/debugutil"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.uber.org/zap"
"golang.org/x/time/rate"
"google.golang.org/grpc"
)
// Cluster defines tester cluster.
type Cluster struct {
lg *zap.Logger
agentConns []*grpc.ClientConn
agentClients []rpcpb.TransportClient
agentStreams []rpcpb.Transport_TransportClient
testerHTTPServer *http.Server
Members []*rpcpb.Member `yaml:"agent-configs"`
Tester *rpcpb.Tester `yaml:"tester-config"`
cases []Case
rateLimiter *rate.Limiter
stresser Stresser
checkers []Checker
currentRevision int64
rd int
cs int
}
var dialOpts = []grpc.DialOption{
grpc.WithInsecure(),
grpc.WithTimeout(5 * time.Second),
grpc.WithBlock(),
}
// NewCluster creates a cluster from a tester configuration.
func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
clus, err := read(lg, fpath)
if err != nil {
return nil, err
}
clus.agentConns = make([]*grpc.ClientConn, len(clus.Members))
clus.agentClients = make([]rpcpb.TransportClient, len(clus.Members))
clus.agentStreams = make([]rpcpb.Transport_TransportClient, len(clus.Members))
clus.cases = make([]Case, 0)
lg.Info("creating members")
for i, ap := range clus.Members {
var err error
clus.agentConns[i], err = grpc.Dial(ap.AgentAddr, dialOpts...)
if err != nil {
return nil, fmt.Errorf("cannot dial agent %v: %v", ap.AgentAddr, err)
}
clus.agentClients[i] = rpcpb.NewTransportClient(clus.agentConns[i])
lg.Info("connected", zap.String("agent-address", ap.AgentAddr))
clus.agentStreams[i], err = clus.agentClients[i].Transport(context.Background())
if err != nil {
return nil, err
}
lg.Info("created stream", zap.String("agent-address", ap.AgentAddr))
}
lg.Info("agents configured.")
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.Handler())
if clus.Tester.EnablePprof {
for p, h := range debugutil.PProfHandlers() {
mux.Handle(p, h)
}
}
clus.testerHTTPServer = &http.Server{
Addr: clus.Tester.Addr,
Handler: mux,
ErrorLog: log.New(io.Discard, "net/http", 0),
}
go clus.serveTesterServer()
lg.Info("tester server started")
clus.rateLimiter = rate.NewLimiter(
rate.Limit(int(clus.Tester.StressQPS)),
int(clus.Tester.StressQPS),
)
clus.setStresserChecker()
return clus, nil
}
// EtcdClientEndpoints returns all etcd client endpoints.
func (clus *Cluster) EtcdClientEndpoints() (css []string) {
css = make([]string, len(clus.Members))
for i := range clus.Members {
css[i] = clus.Members[i].EtcdClientEndpoint
}
return css
}
func (clus *Cluster) serveTesterServer() {
clus.lg.Info(
"started tester HTTP server",
zap.String("tester-address", clus.Tester.Addr),
)
err := clus.testerHTTPServer.ListenAndServe()
clus.lg.Info(
"tester HTTP server returned",
zap.String("tester-address", clus.Tester.Addr),
zap.Error(err),
)
if err != nil && err != http.ErrServerClosed {
clus.lg.Fatal("tester HTTP errored", zap.Error(err))
}
}
func (clus *Cluster) updateCases() {
for _, cs := range clus.Tester.Cases {
switch cs {
case "SIGTERM_ONE_FOLLOWER":
clus.cases = append(clus.cases,
new_Case_SIGTERM_ONE_FOLLOWER(clus))
case "SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.cases = append(clus.cases,
new_Case_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus))
case "SIGTERM_LEADER":
clus.cases = append(clus.cases,
new_Case_SIGTERM_LEADER(clus))
case "SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.cases = append(clus.cases,
new_Case_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus))
case "SIGTERM_QUORUM":
clus.cases = append(clus.cases,
new_Case_SIGTERM_QUORUM(clus))
case "SIGTERM_ALL":
clus.cases = append(clus.cases,
new_Case_SIGTERM_ALL(clus))
case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER":
clus.cases = append(clus.cases,
new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus))
case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.cases = append(clus.cases,
new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus))
case "SIGQUIT_AND_REMOVE_LEADER":
clus.cases = append(clus.cases,
new_Case_SIGQUIT_AND_REMOVE_LEADER(clus))
case "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.cases = append(clus.cases,
new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus))
case "SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH":
clus.cases = append(clus.cases,
new_Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.cases = append(clus.cases,
new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.cases = append(clus.cases,
new_Case_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT())
case "BLACKHOLE_PEER_PORT_TX_RX_LEADER":
clus.cases = append(clus.cases,
new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.cases = append(clus.cases,
new_Case_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT())
case "BLACKHOLE_PEER_PORT_TX_RX_QUORUM":
clus.cases = append(clus.cases,
new_Case_BLACKHOLE_PEER_PORT_TX_RX_QUORUM(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ALL":
clus.cases = append(clus.cases,
new_Case_BLACKHOLE_PEER_PORT_TX_RX_ALL(clus))
case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, true))
case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, true))
case "DELAY_PEER_PORT_TX_RX_LEADER":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_LEADER(clus, true))
case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, true))
case "DELAY_PEER_PORT_TX_RX_QUORUM":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_QUORUM(clus, true))
case "DELAY_PEER_PORT_TX_RX_ALL":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL":
clus.cases = append(clus.cases,
new_Case_DELAY_PEER_PORT_TX_RX_ALL(clus, true))
case "NO_FAIL_WITH_STRESS":
clus.cases = append(clus.cases,
new_Case_NO_FAIL_WITH_STRESS(clus))
case "NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS":
clus.cases = append(clus.cases,
new_Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS(clus))
case "EXTERNAL":
clus.cases = append(clus.cases,
new_Case_EXTERNAL(clus.Tester.ExternalExecPath))
case "FAILPOINTS":
fpFailures, fperr := failpointFailures(clus)
if len(fpFailures) == 0 {
clus.lg.Info("no failpoints found!", zap.Error(fperr))
} else {
clus.cases = append(clus.cases, fpFailures...)
}
case "FAILPOINTS_WITH_DISK_IO_LATENCY":
fpFailures, fperr := failpointDiskIOFailures(clus)
if len(fpFailures) == 0 {
clus.lg.Info("no failpoints found!", zap.Error(fperr))
} else {
clus.cases = append(clus.cases, fpFailures...)
}
}
}
}
func (clus *Cluster) listCases() (css []string) {
css = make([]string, len(clus.cases))
for i := range clus.cases {
css[i] = clus.cases[i].Desc()
}
return css
}
// UpdateDelayLatencyMs updates the delay latency with a random value
// within the election timeout.
func (clus *Cluster) UpdateDelayLatencyMs() {
rand.Seed(time.Now().UnixNano())
clus.Tester.UpdatedDelayLatencyMs = uint32(rand.Int63n(clus.Members[0].Etcd.ElectionTimeoutMs))
minLatRv := clus.Tester.DelayLatencyMsRv + clus.Tester.DelayLatencyMsRv/5
if clus.Tester.UpdatedDelayLatencyMs <= minLatRv {
clus.Tester.UpdatedDelayLatencyMs += minLatRv
}
}
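// Worked example with hypothetical values: with ElectionTimeoutMs = 1000 and
// DelayLatencyMsRv = 500, minLatRv = 500 + 500/5 = 600 ms; a random draw of
// 400 ms is bumped to 1000 ms, while a draw of 800 ms is kept as-is.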
func (clus *Cluster) setStresserChecker() {
css := &compositeStresser{}
var lss []*leaseStresser
var rss []*runnerStresser
for _, m := range clus.Members {
sss := newStresser(clus, m)
css.stressers = append(css.stressers, &compositeStresser{sss})
for _, s := range sss {
if v, ok := s.(*leaseStresser); ok {
lss = append(lss, v)
clus.lg.Info("added lease stresser", zap.String("endpoint", m.EtcdClientEndpoint))
}
if v, ok := s.(*runnerStresser); ok {
rss = append(rss, v)
clus.lg.Info("added lease stresser", zap.String("endpoint", m.EtcdClientEndpoint))
}
}
}
clus.stresser = css
for _, cs := range clus.Tester.Checkers {
switch cs {
case "KV_HASH":
clus.checkers = append(clus.checkers, newKVHashChecker(clus))
case "LEASE_EXPIRE":
for _, ls := range lss {
clus.checkers = append(clus.checkers, newLeaseExpireChecker(ls))
}
case "RUNNER":
for _, rs := range rss {
clus.checkers = append(clus.checkers, newRunnerChecker(rs.etcdClientEndpoint, rs.errc))
}
case "NO_CHECK":
clus.checkers = append(clus.checkers, newNoChecker())
case "SHORT_TTL_LEASE_EXPIRE":
for _, ls := range lss {
clus.checkers = append(clus.checkers, newShortTTLLeaseExpireChecker(ls))
}
}
}
clus.lg.Info("updated stressers")
}
func (clus *Cluster) runCheckers(exceptions ...rpcpb.Checker) (err error) {
defer func() {
if err != nil {
return
}
if err = clus.updateRevision(); err != nil {
clus.lg.Warn(
"updateRevision failed",
zap.Error(err),
)
return
}
}()
exs := make(map[rpcpb.Checker]struct{})
for _, e := range exceptions {
exs[e] = struct{}{}
}
for _, chk := range clus.checkers {
clus.lg.Warn(
"consistency check START",
zap.String("checker", chk.Type().String()),
zap.Strings("client-endpoints", chk.EtcdClientEndpoints()),
)
err = chk.Check()
clus.lg.Warn(
"consistency check END",
zap.String("checker", chk.Type().String()),
zap.Strings("client-endpoints", chk.EtcdClientEndpoints()),
zap.Error(err),
)
if err != nil {
_, ok := exs[chk.Type()]
if !ok {
return err
}
clus.lg.Warn(
"consistency check SKIP FAIL",
zap.String("checker", chk.Type().String()),
zap.Strings("client-endpoints", chk.EtcdClientEndpoints()),
zap.Error(err),
)
}
}
return nil
}
// Send_INITIAL_START_ETCD bootstraps etcd cluster the very first time.
// After this, just continue to call kill/restart.
func (clus *Cluster) Send_INITIAL_START_ETCD() error {
// this is the only time that creates request from scratch
return clus.broadcast(rpcpb.Operation_INITIAL_START_ETCD)
}
// send_SIGQUIT_ETCD_AND_ARCHIVE_DATA sends "Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA" operation.
func (clus *Cluster) send_SIGQUIT_ETCD_AND_ARCHIVE_DATA() error {
return clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA)
}
// Send_SIGQUIT_ETCD_AND_REMOVE_DATA sends "Operation_SIGQUIT_ETCD_AND_REMOVE_DATA" operation.
func (clus *Cluster) Send_SIGQUIT_ETCD_AND_REMOVE_DATA() error {
return clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA)
}
// send_RESTART_ETCD sends restart operation.
func (clus *Cluster) send_RESTART_ETCD() error {
return clus.broadcast(rpcpb.Operation_RESTART_ETCD)
}
func (clus *Cluster) broadcast(op rpcpb.Operation) error {
var wg sync.WaitGroup
wg.Add(len(clus.agentStreams))
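// one buffered slot per agent so sends never block and wg.Wait cannot deadlock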
errc := make(chan error, len(clus.agentStreams))
for i := range clus.agentStreams {
go func(idx int, o rpcpb.Operation) {
defer wg.Done()
errc <- clus.sendOp(idx, o)
}(i, op)
}
wg.Wait()
close(errc)
var errs []string
for err := range errc {
if err == nil {
continue
}
errs = append(errs, err.Error())
}
if len(errs) == 0 {
return nil
}
return errors.New(strings.Join(errs, ", "))
}
func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error {
_, err := clus.sendOpWithResp(idx, op)
return err
}
func (clus *Cluster) sendOpWithResp(idx int, op rpcpb.Operation) (*rpcpb.Response, error) {
// maintain the initial member object
// throughout the test time
req := &rpcpb.Request{
Operation: op,
Member: clus.Members[idx],
Tester: clus.Tester,
}
err := clus.agentStreams[idx].Send(req)
clus.lg.Info(
"sent request",
zap.String("operation", op.String()),
zap.String("to", clus.Members[idx].EtcdClientEndpoint),
zap.Error(err),
)
if err != nil {
return nil, err
}
resp, err := clus.agentStreams[idx].Recv()
if resp != nil {
clus.lg.Info(
"received response",
zap.String("operation", op.String()),
zap.String("from", clus.Members[idx].EtcdClientEndpoint),
zap.Bool("success", resp.Success),
zap.String("status", resp.Status),
zap.Error(err),
)
} else {
clus.lg.Info(
"received empty response",
zap.String("operation", op.String()),
zap.String("from", clus.Members[idx].EtcdClientEndpoint),
zap.Error(err),
)
}
if err != nil {
return nil, err
}
if !resp.Success {
return nil, errors.New(resp.Status)
}
m, secure := clus.Members[idx], false
for _, cu := range m.Etcd.AdvertiseClientURLs {
u, perr := url.Parse(cu)
if perr != nil {
return nil, perr
}
if u.Scheme == "https" { // TODO: handle unix
secure = true
}
}
// store TLS assets from agents/servers onto disk
if secure && (op == rpcpb.Operation_INITIAL_START_ETCD || op == rpcpb.Operation_RESTART_ETCD) {
dirClient := filepath.Join(
clus.Tester.DataDir,
clus.Members[idx].Etcd.Name,
"fixtures",
"client",
)
if err = fileutil.TouchDirAll(clus.lg, dirClient); err != nil {
return nil, err
}
clientCertData := []byte(resp.Member.ClientCertData)
if len(clientCertData) == 0 {
return nil, fmt.Errorf("got empty client cert from %q", m.EtcdClientEndpoint)
}
clientCertPath := filepath.Join(dirClient, "cert.pem")
if err = os.WriteFile(clientCertPath, clientCertData, 0644); err != nil { // overwrite if exists
return nil, err
}
resp.Member.ClientCertPath = clientCertPath
clus.lg.Info(
"saved client cert file",
zap.String("path", clientCertPath),
)
clientKeyData := []byte(resp.Member.ClientKeyData)
if len(clientKeyData) == 0 {
return nil, fmt.Errorf("got empty client key from %q", m.EtcdClientEndpoint)
}
clientKeyPath := filepath.Join(dirClient, "key.pem")
if err = os.WriteFile(clientKeyPath, clientKeyData, 0644); err != nil { // overwrite if exists
return nil, err
}
resp.Member.ClientKeyPath = clientKeyPath
clus.lg.Info(
"saved client key file",
zap.String("path", clientKeyPath),
)
clientTrustedCAData := []byte(resp.Member.ClientTrustedCAData)
if len(clientTrustedCAData) != 0 {
// TODO: disable this when auto TLS is deprecated
clientTrustedCAPath := filepath.Join(dirClient, "ca.pem")
if err = os.WriteFile(clientTrustedCAPath, clientTrustedCAData, 0644); err != nil { // overwrite if exists
return nil, err
}
resp.Member.ClientTrustedCAPath = clientTrustedCAPath
clus.lg.Info(
"saved client trusted CA file",
zap.String("path", clientTrustedCAPath),
)
}
// no need to store peer certs for tester clients
clus.Members[idx] = resp.Member
}
return resp, nil
}
// WaitHealth ensures all members are healthy
// by writing a test key to etcd cluster.
func (clus *Cluster) WaitHealth() error {
var err error
// wait 60s to check cluster health.
// TODO: set it to a reasonable value. It is set this high because
// a follower may take a long time to catch up with the leader after a reboot
// under reasonable workload (https://github.com/etcd-io/etcd/issues/2698)
for i := 0; i < 60; i++ {
for _, m := range clus.Members {
if err = m.WriteHealthKey(); err != nil {
clus.lg.Warn(
"health check FAIL",
zap.Int("retries", i),
zap.String("endpoint", m.EtcdClientEndpoint),
zap.Error(err),
)
break
}
clus.lg.Info(
"health check PASS",
zap.Int("retries", i),
zap.String("endpoint", m.EtcdClientEndpoint),
)
}
if err == nil {
clus.lg.Info("health check ALL PASS")
return nil
}
time.Sleep(time.Second)
}
return err
}
// GetLeader returns the index of leader and error if any.
func (clus *Cluster) GetLeader() (int, error) {
for i, m := range clus.Members {
isLeader, err := m.IsLeader()
if isLeader || err != nil {
return i, err
}
}
return 0, fmt.Errorf("no leader found")
}
// maxRev returns the maximum revision found on the cluster.
func (clus *Cluster) maxRev() (rev int64, err error) {
ctx, cancel := context.WithTimeout(context.TODO(), time.Second)
defer cancel()
revc, errc := make(chan int64, len(clus.Members)), make(chan error, len(clus.Members))
for i := range clus.Members {
go func(m *rpcpb.Member) {
mrev, merr := m.Rev(ctx)
revc <- mrev
errc <- merr
}(clus.Members[i])
}
for i := 0; i < len(clus.Members); i++ {
if merr := <-errc; merr != nil {
err = merr
}
if mrev := <-revc; mrev > rev {
rev = mrev
}
}
return rev, err
}
func (clus *Cluster) getRevisionHash() (map[string]int64, map[string]int64, error) {
revs := make(map[string]int64)
hashes := make(map[string]int64)
for _, m := range clus.Members {
rev, hash, err := m.RevHash()
if err != nil {
return nil, nil, err
}
revs[m.EtcdClientEndpoint] = rev
hashes[m.EtcdClientEndpoint] = hash
}
return revs, hashes, nil
}
func (clus *Cluster) compactKV(rev int64, timeout time.Duration) (err error) {
if rev <= 0 {
return nil
}
for i, m := range clus.Members {
clus.lg.Info(
"compact START",
zap.String("endpoint", m.EtcdClientEndpoint),
zap.Int64("compact-revision", rev),
zap.Duration("timeout", timeout),
)
now := time.Now()
cerr := m.Compact(rev, timeout)
succeed := true
if cerr != nil {
if strings.Contains(cerr.Error(), "required revision has been compacted") && i > 0 {
clus.lg.Info(
"compact error is ignored",
zap.String("endpoint", m.EtcdClientEndpoint),
zap.Int64("compact-revision", rev),
zap.String("expected-error-msg", cerr.Error()),
)
} else {
clus.lg.Warn(
"compact FAIL",
zap.String("endpoint", m.EtcdClientEndpoint),
zap.Int64("compact-revision", rev),
zap.Error(cerr),
)
err = cerr
succeed = false
}
}
if succeed {
clus.lg.Info(
"compact PASS",
zap.String("endpoint", m.EtcdClientEndpoint),
zap.Int64("compact-revision", rev),
zap.Duration("timeout", timeout),
zap.Duration("took", time.Since(now)),
)
}
}
return err
}
func (clus *Cluster) checkCompact(rev int64) error {
if rev == 0 {
return nil
}
for _, m := range clus.Members {
if err := m.CheckCompact(rev); err != nil {
return err
}
}
return nil
}
func (clus *Cluster) defrag() error {
for _, m := range clus.Members {
if err := m.Defrag(); err != nil {
clus.lg.Warn(
"defrag FAIL",
zap.String("endpoint", m.EtcdClientEndpoint),
zap.Error(err),
)
return err
}
clus.lg.Info(
"defrag PASS",
zap.String("endpoint", m.EtcdClientEndpoint),
)
}
clus.lg.Info(
"defrag ALL PASS",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
)
return nil
}
// GetCaseDelayDuration computes failure delay duration.
func (clus *Cluster) GetCaseDelayDuration() time.Duration {
return time.Duration(clus.Tester.CaseDelayMs) * time.Millisecond
}
// Report reports the number of modified keys.
func (clus *Cluster) Report() int64 {
return clus.stresser.ModifiedKeys()
}

View File

@ -1,376 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"errors"
"fmt"
"net/url"
"os"
"path/filepath"
"strings"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
yaml "gopkg.in/yaml.v2"
)
func read(lg *zap.Logger, fpath string) (*Cluster, error) {
bts, err := os.ReadFile(fpath)
if err != nil {
return nil, err
}
lg.Info("opened configuration file", zap.String("path", fpath))
clus := &Cluster{lg: lg}
if err = yaml.Unmarshal(bts, clus); err != nil {
return nil, err
}
if len(clus.Members) < 3 {
return nil, fmt.Errorf("len(clus.Members) expects at least 3, got %d", len(clus.Members))
}
failpointsEnabled := false
for _, c := range clus.Tester.Cases {
if c == rpcpb.Case_FAILPOINTS.String() {
failpointsEnabled = true
break
}
}
if len(clus.Tester.Cases) == 0 {
return nil, errors.New("cases not found")
}
if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv*5 {
return nil, fmt.Errorf("delay latency %d ms must be greater than 5x of delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv)
}
if clus.Tester.UpdatedDelayLatencyMs == 0 {
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
}
for _, v := range clus.Tester.Cases {
if _, ok := rpcpb.Case_value[v]; !ok {
return nil, fmt.Errorf("%q is not defined in 'rpcpb.Case_value'", v)
}
}
for _, s := range clus.Tester.Stressers {
if _, ok := rpcpb.StresserType_value[s.Type]; !ok {
return nil, fmt.Errorf("unknown 'StresserType' %+v", s)
}
}
for _, v := range clus.Tester.Checkers {
if _, ok := rpcpb.Checker_value[v]; !ok {
return nil, fmt.Errorf("Checker is unknown; got %q", v)
}
}
if clus.Tester.StressKeySuffixRangeTxn > 100 {
return nil, fmt.Errorf("StressKeySuffixRangeTxn maximum value is 100, got %v", clus.Tester.StressKeySuffixRangeTxn)
}
if clus.Tester.StressKeyTxnOps > 64 {
return nil, fmt.Errorf("StressKeyTxnOps maximum value is 64, got %v", clus.Tester.StressKeyTxnOps)
}
for i, mem := range clus.Members {
if mem.EtcdExec == "embed" && failpointsEnabled {
return nil, errors.New("EtcdExec 'embed' cannot be run with failpoints enabled")
}
if mem.BaseDir == "" {
return nil, fmt.Errorf("BaseDir cannot be empty (got %q)", mem.BaseDir)
}
if mem.Etcd.Name == "" {
return nil, fmt.Errorf("'--name' cannot be empty (got %+v)", mem)
}
if mem.Etcd.DataDir == "" {
return nil, fmt.Errorf("'--data-dir' cannot be empty (got %+v)", mem)
}
if mem.Etcd.SnapshotCount == 0 {
return nil, fmt.Errorf("'--snapshot-count' cannot be 0 (got %+v)", mem.Etcd.SnapshotCount)
}
if mem.Etcd.DataDir == "" {
return nil, fmt.Errorf("'--data-dir' cannot be empty (got %q)", mem.Etcd.DataDir)
}
if mem.Etcd.WALDir == "" {
clus.Members[i].Etcd.WALDir = filepath.Join(mem.Etcd.DataDir, "member", "wal")
}
switch mem.Etcd.InitialClusterState {
case "new":
case "existing":
default:
return nil, fmt.Errorf("'--initial-cluster-state' got %q", mem.Etcd.InitialClusterState)
}
if mem.Etcd.HeartbeatIntervalMs == 0 {
return nil, fmt.Errorf("'--heartbeat-interval' cannot be 0 (got %+v)", mem.Etcd)
}
if mem.Etcd.ElectionTimeoutMs == 0 {
return nil, fmt.Errorf("'--election-timeout' cannot be 0 (got %+v)", mem.Etcd)
}
if int64(clus.Tester.DelayLatencyMs) <= mem.Etcd.ElectionTimeoutMs {
return nil, fmt.Errorf("delay latency %d ms must be greater than election timeout %d ms", clus.Tester.DelayLatencyMs, mem.Etcd.ElectionTimeoutMs)
}
port := ""
listenClientPorts := make([]string, len(clus.Members))
for i, u := range mem.Etcd.ListenClientURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--listen-client-urls' has valid URL %q", u)
}
listenClientPorts[i], err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--listen-client-urls' has no port %q", u)
}
}
for i, u := range mem.Etcd.AdvertiseClientURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--advertise-client-urls' has valid URL %q", u)
}
port, err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--advertise-client-urls' has no port %q", u)
}
if mem.EtcdClientProxy && listenClientPorts[i] == port {
return nil, fmt.Errorf("clus.Members[%d] requires client port proxy, but advertise port %q conflicts with listener port %q", i, port, listenClientPorts[i])
}
}
listenPeerPorts := make([]string, len(clus.Members))
for i, u := range mem.Etcd.ListenPeerURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--listen-peer-urls' has valid URL %q", u)
}
listenPeerPorts[i], err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--listen-peer-urls' has no port %q", u)
}
}
for j, u := range mem.Etcd.AdvertisePeerURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--initial-advertise-peer-urls' has valid URL %q", u)
}
port, err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--initial-advertise-peer-urls' has no port %q", u)
}
if mem.EtcdPeerProxy && listenPeerPorts[j] == port {
return nil, fmt.Errorf("clus.Members[%d] requires peer port proxy, but advertise port %q conflicts with listener port %q", i, port, listenPeerPorts[j])
}
}
if !strings.HasPrefix(mem.Etcd.DataDir, mem.BaseDir) {
return nil, fmt.Errorf("Etcd.DataDir must be prefixed with BaseDir (got %q)", mem.Etcd.DataDir)
}
// TODO: support separate WALDir that can be handled via failure-archive
if !strings.HasPrefix(mem.Etcd.WALDir, mem.BaseDir) {
return nil, fmt.Errorf("Etcd.WALDir must be prefixed with BaseDir (got %q)", mem.Etcd.WALDir)
}
// TODO: only support generated certs with TLS generator
// deprecate auto TLS
if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerCertFile != "" {
return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
}
if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerKeyFile != "" {
return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerKeyFile)
}
if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerTrustedCAFile != "" {
return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerTrustedCAFile)
}
if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientCertFile != "" {
return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
}
if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientKeyFile != "" {
return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientKeyFile is %q", mem.Etcd.ClientKeyFile)
}
if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientTrustedCAFile != "" {
return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.ClientTrustedCAFile)
}
if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile == "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
}
if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerKeyFile == "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerCertFile)
}
// only support self-signed certs
if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerTrustedCAFile == "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerCertFile)
}
if !mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile != "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'false', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
}
if !mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerKeyFile != "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'false', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerCertFile)
}
if !mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerTrustedCAFile != "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'false', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerTrustedCAFile)
}
if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerAutoTLS {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth and Etcd.PeerAutoTLS cannot be both 'true'")
}
if (mem.Etcd.PeerCertFile == "") != (mem.Etcd.PeerKeyFile == "") {
return nil, fmt.Errorf("both Etcd.PeerCertFile %q and Etcd.PeerKeyFile %q must be either empty or non-empty", mem.Etcd.PeerCertFile, mem.Etcd.PeerKeyFile)
}
if mem.Etcd.ClientCertAuth && mem.Etcd.ClientAutoTLS {
return nil, fmt.Errorf("Etcd.ClientCertAuth and Etcd.ClientAutoTLS cannot be both 'true'")
}
if mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile == "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientCertFile is %q", mem.Etcd.PeerCertFile)
}
if mem.Etcd.ClientCertAuth && mem.Etcd.ClientKeyFile == "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientKeyFile is %q", mem.Etcd.PeerCertFile)
}
if mem.Etcd.ClientCertAuth && mem.Etcd.ClientTrustedCAFile == "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.ClientTrustedCAFile)
}
if !mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile != "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'false', but Etcd.ClientCertFile is %q", mem.Etcd.PeerCertFile)
}
if !mem.Etcd.ClientCertAuth && mem.Etcd.ClientKeyFile != "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'false', but Etcd.ClientKeyFile is %q", mem.Etcd.PeerCertFile)
}
if !mem.Etcd.ClientCertAuth && mem.Etcd.ClientTrustedCAFile != "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'false', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.PeerCertFile)
}
if (mem.Etcd.ClientCertFile == "") != (mem.Etcd.ClientKeyFile == "") {
return nil, fmt.Errorf("both Etcd.ClientCertFile %q and Etcd.ClientKeyFile %q must be either empty or non-empty", mem.Etcd.ClientCertFile, mem.Etcd.ClientKeyFile)
}
peerTLS := mem.Etcd.PeerAutoTLS ||
(mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile != "" && mem.Etcd.PeerKeyFile != "" && mem.Etcd.PeerTrustedCAFile != "")
if peerTLS {
for _, cu := range mem.Etcd.ListenPeerURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("peer TLS is enabled with wrong scheme %q", cu)
}
}
for _, cu := range mem.Etcd.AdvertisePeerURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("peer TLS is enabled with wrong scheme %q", cu)
}
}
clus.Members[i].PeerCertPath = mem.Etcd.PeerCertFile
if mem.Etcd.PeerCertFile != "" {
var data []byte
data, err = os.ReadFile(mem.Etcd.PeerCertFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.PeerCertFile, err)
}
clus.Members[i].PeerCertData = string(data)
}
clus.Members[i].PeerKeyPath = mem.Etcd.PeerKeyFile
if mem.Etcd.PeerKeyFile != "" {
var data []byte
data, err = os.ReadFile(mem.Etcd.PeerKeyFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.PeerKeyFile, err)
}
clus.Members[i].PeerKeyData = string(data)
}
clus.Members[i].PeerTrustedCAPath = mem.Etcd.PeerTrustedCAFile
if mem.Etcd.PeerTrustedCAFile != "" {
var data []byte
data, err = os.ReadFile(mem.Etcd.PeerTrustedCAFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.PeerTrustedCAFile, err)
}
clus.Members[i].PeerTrustedCAData = string(data)
}
}
clientTLS := mem.Etcd.ClientAutoTLS ||
(mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile != "" && mem.Etcd.ClientKeyFile != "" && mem.Etcd.ClientTrustedCAFile != "")
if clientTLS {
for _, cu := range mem.Etcd.ListenClientURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("client TLS is enabled with wrong scheme %q", cu)
}
}
for _, cu := range mem.Etcd.AdvertiseClientURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("client TLS is enabled with wrong scheme %q", cu)
}
}
clus.Members[i].ClientCertPath = mem.Etcd.ClientCertFile
if mem.Etcd.ClientCertFile != "" {
var data []byte
data, err = os.ReadFile(mem.Etcd.ClientCertFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.ClientCertFile, err)
}
clus.Members[i].ClientCertData = string(data)
}
clus.Members[i].ClientKeyPath = mem.Etcd.ClientKeyFile
if mem.Etcd.ClientKeyFile != "" {
var data []byte
data, err = os.ReadFile(mem.Etcd.ClientKeyFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.ClientKeyFile, err)
}
clus.Members[i].ClientKeyData = string(data)
}
clus.Members[i].ClientTrustedCAPath = mem.Etcd.ClientTrustedCAFile
if mem.Etcd.ClientTrustedCAFile != "" {
var data []byte
data, err = os.ReadFile(mem.Etcd.ClientTrustedCAFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.ClientTrustedCAFile, err)
}
clus.Members[i].ClientTrustedCAData = string(data)
}
}
if len(mem.Etcd.LogOutputs) == 0 {
return nil, fmt.Errorf("mem.Etcd.LogOutputs cannot be empty")
}
for _, v := range mem.Etcd.LogOutputs {
switch v {
case "stderr", "stdout", "/dev/null", "default":
default:
if !strings.HasPrefix(v, mem.BaseDir) {
return nil, fmt.Errorf("LogOutput %q must be prefixed with BaseDir %q", v, mem.BaseDir)
}
}
}
}
return clus, err
}

View File

@ -1,329 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"fmt"
"os"
"testing"
"time"
"go.etcd.io/etcd/client/pkg/v3/fileutil"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
)
// compactQPS is a rough number of compacted entries per second.
// Previous tests showed etcd can compact about 60,000 entries per second.
const compactQPS = 50000
// Run starts tester.
func (clus *Cluster) Run(t *testing.T) error {
defer printReport()
// updateCases must be executed after etcd is started, because the FAILPOINTS case
// needs to obtain all the failpoints from the etcd member.
clus.updateCases()
if err := fileutil.TouchDirAll(clus.lg, clus.Tester.DataDir); err != nil {
clus.lg.Panic(
"failed to create test data directory",
zap.String("dir", clus.Tester.DataDir),
zap.Error(err),
)
}
var (
preModifiedKey int64
err error
)
for round := 0; round < int(clus.Tester.RoundLimit) || clus.Tester.RoundLimit == -1; round++ {
t.Run(fmt.Sprintf("round-%d", round), func(t *testing.T) {
preModifiedKey, err = clus.doRoundAndCompact(t, round, preModifiedKey)
})
if err != nil {
clus.failed(t, err)
return err
}
if round > 0 && round%500 == 0 { // every 500 rounds
t.Logf("Defragmenting in round: %v", round)
if err := clus.defrag(); err != nil {
clus.failed(t, err)
return err
}
}
}
clus.lg.Info(
"functional-tester PASS",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
)
return nil
}
func (clus *Cluster) doRoundAndCompact(t *testing.T, round int, preModifiedKey int64) (postModifiedKey int64, err error) {
roundTotalCounter.Inc()
clus.rd = round
if err = clus.doRound(t); err != nil {
clus.failed(t, fmt.Errorf("doRound FAIL: %w", err))
return
}
// -1 so that logPrefix doesn't print out 'case'
clus.cs = -1
revToCompact := max(0, clus.currentRevision-10000)
currentModifiedKey := clus.stresser.ModifiedKeys()
modifiedKey := currentModifiedKey - preModifiedKey
timeout := 10 * time.Second
timeout += time.Duration(modifiedKey/compactQPS) * time.Second
clus.lg.Info(
"compact START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
zap.Duration("timeout", timeout),
)
if err = clus.compact(revToCompact, timeout); err != nil {
clus.failed(t, fmt.Errorf("compact FAIL: %w", err))
} else {
postModifiedKey = currentModifiedKey
}
return
}
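// Worked example: if a round modified 1,000,000 keys since the previous
// compaction, the timeout becomes 10s + 1,000,000/50,000 s = 30s at the
// assumed compactQPS of 50,000.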
func (clus *Cluster) doRound(t *testing.T) error {
if clus.Tester.CaseShuffle {
clus.shuffleCases()
}
roundNow := time.Now()
clus.lg.Info(
"round START",
zap.Int("round", clus.rd),
zap.Int("case-total", len(clus.cases)),
zap.Strings("cases", clus.listCases()),
)
for i, fa := range clus.cases {
clus.cs = i
t.Run(fmt.Sprintf("%v_%s", i, fa.TestCase()),
func(t *testing.T) {
clus.doTestCase(t, fa)
})
}
clus.lg.Info(
"round ALL PASS",
zap.Int("round", clus.rd),
zap.Strings("cases", clus.listCases()),
zap.Int("case-total", len(clus.cases)),
zap.Duration("took", time.Since(roundNow)),
)
return nil
}
func (clus *Cluster) doTestCase(t *testing.T, fa Case) {
caseTotal[fa.Desc()]++
caseTotalCounter.WithLabelValues(fa.Desc()).Inc()
caseNow := time.Now()
clus.lg.Info(
"case START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
zap.String("desc", fa.Desc()),
)
clus.lg.Info("wait health before injecting failures")
if err := clus.WaitHealth(); err != nil {
clus.failed(t, fmt.Errorf("wait full health error before starting test case: %w", err))
}
stressStarted := false
fcase := fa.TestCase()
if fcase != rpcpb.Case_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
clus.lg.Info(
"stress START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
zap.String("desc", fa.Desc()),
)
if err := clus.stresser.Stress(); err != nil {
clus.failed(t, fmt.Errorf("start stresser error: %w", err))
}
stressStarted = true
}
clus.lg.Info(
"inject START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
zap.String("desc", fa.Desc()),
)
if err := fa.Inject(clus); err != nil {
clus.failed(t, fmt.Errorf("injection error: %w", err))
}
// if run locally, the recovering server may conflict
// with stressing client ports
// TODO: use unix for local tests
clus.lg.Info(
"recover START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
zap.String("desc", fa.Desc()),
)
if err := fa.Recover(clus); err != nil {
clus.failed(t, fmt.Errorf("recovery error: %w", err))
}
if stressStarted {
clus.lg.Info(
"stress PAUSE",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
zap.String("desc", fa.Desc()),
)
ems := clus.stresser.Pause()
if fcase == rpcpb.Case_NO_FAIL_WITH_STRESS && len(ems) > 0 {
ess := make([]string, 0, len(ems))
cnt := 0
for k, v := range ems {
ess = append(ess, fmt.Sprintf("%s (count: %d)", k, v))
cnt += v
}
clus.lg.Warn(
"expected no errors",
zap.String("desc", fa.Desc()),
zap.Strings("errors", ess),
)
// with network delay, some ongoing requests may fail
// only return an error if more than 30% of QPS requests fail
if cnt > int(float64(clus.Tester.StressQPS)*0.3) {
clus.failed(t, fmt.Errorf("expected no error in %q, got %q", fcase.String(), ess))
}
}
}
clus.lg.Info(
"health check START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
zap.String("desc", fa.Desc()),
)
if err := clus.WaitHealth(); err != nil {
clus.failed(t, fmt.Errorf("wait full health error after test finished: %w", err))
}
var checkerFailExceptions []rpcpb.Checker
switch fcase {
case rpcpb.Case_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH:
// TODO: restore from snapshot
checkerFailExceptions = append(checkerFailExceptions, rpcpb.Checker_LEASE_EXPIRE)
}
clus.lg.Info(
"consistency check START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
zap.String("desc", fa.Desc()),
)
if err := clus.runCheckers(checkerFailExceptions...); err != nil {
clus.failed(t, fmt.Errorf("consistency check error: %w", err))
}
clus.lg.Info(
"consistency check PASS",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.cases)),
zap.String("desc", fa.Desc()),
zap.Duration("took", time.Since(caseNow)),
)
}
func (clus *Cluster) updateRevision() error {
revs, _, err := clus.getRevisionHash()
for _, rev := range revs {
clus.currentRevision = rev
break // only need one of the current revisions
}
clus.lg.Info(
"updated current revision",
zap.Int64("current-revision", clus.currentRevision),
)
return err
}
func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
if err = clus.compactKV(rev, timeout); err != nil {
clus.lg.Warn(
"compact FAIL",
zap.Int64("current-revision", clus.currentRevision),
zap.Int64("compact-revision", rev),
zap.Error(err),
)
return err
}
clus.lg.Info(
"compact DONE",
zap.Int64("current-revision", clus.currentRevision),
zap.Int64("compact-revision", rev),
)
if err = clus.checkCompact(rev); err != nil {
clus.lg.Warn(
"check compact FAIL",
zap.Int64("current-revision", clus.currentRevision),
zap.Int64("compact-revision", rev),
zap.Error(err),
)
return err
}
clus.lg.Info(
"check compact DONE",
zap.Int64("current-revision", clus.currentRevision),
zap.Int64("compact-revision", rev),
)
return nil
}
func (clus *Cluster) failed(t *testing.T, err error) {
clus.lg.Error(
"functional-tester FAIL",
zap.Int("round", clus.rd),
zap.String("case-name", t.Name()),
zap.Int("case-number", clus.cs),
zap.Int("case-total", len(clus.cases)),
zap.Error(err),
)
os.Exit(2)
}


@ -1,65 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"math/rand"
"time"
"go.uber.org/zap"
)
func (clus *Cluster) shuffleCases() {
rand.Seed(time.Now().UnixNano())
offset := rand.Intn(1000)
n := len(clus.cases)
cp := coprime(n)
css := make([]Case, n)
for i := 0; i < n; i++ {
css[i] = clus.cases[(cp*i+offset)%n]
}
clus.cases = css
clus.lg.Info("shuffled test failure cases", zap.Int("total", n))
}
/*
x and y with GCD 1 are coprime to each other
x1 = ( coprime of n * idx1 + offset ) % n
x2 = ( coprime of n * idx2 + offset ) % n
(x2 - x1) % n = coprime of n * (idx2 - idx1) % n
For consecutive indices (idx2 - idx1 = 1), the difference is the
coprime itself, which is non-zero modulo n, so consecutive x's are
guaranteed to be distinct.
*/
func coprime(n int) int {
coprime := 1
for i := n / 2; i < n; i++ {
if gcd(i, n) == 1 {
coprime = i
break
}
}
return coprime
}
func gcd(x, y int) int {
if y == 0 {
return x
}
return gcd(y, x%y)
}
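// Editorial sketch (hypothetical, not part of the original file): the
// mapping i -> (cp*i + offset) % n used by shuffleCases visits every
// index exactly once when gcd(cp, n) == 1, i.e. it is a permutation.
func examplePermutationCheck() bool {
n, offset := 7, 3
cp := coprime(n) // 3 here, since gcd(3, 7) == 1
seen := make(map[int]bool, n)
for i := 0; i < n; i++ {
seen[(cp*i+offset)%n] = true
}
return len(seen) == n // true: all n indices were visited
}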


@ -1,309 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"reflect"
"sort"
"testing"
"go.uber.org/zap/zaptest"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
)
func Test_read(t *testing.T) {
exp := &Cluster{
Members: []*rpcpb.Member{
{
EtcdExec: "./bin/etcd",
AgentAddr: "127.0.0.1:19027",
FailpointHTTPAddr: "http://127.0.0.1:7381",
BaseDir: "/tmp/etcd-functional-1",
EtcdClientProxy: false,
EtcdPeerProxy: true,
EtcdClientEndpoint: "127.0.0.1:1379",
Etcd: &rpcpb.Etcd{
Name: "s1",
DataDir: "/tmp/etcd-functional-1/etcd.data",
WALDir: "/tmp/etcd-functional-1/etcd.data/member/wal",
HeartbeatIntervalMs: 100,
ElectionTimeoutMs: 1000,
ListenClientURLs: []string{"https://127.0.0.1:1379"},
AdvertiseClientURLs: []string{"https://127.0.0.1:1379"},
ClientAutoTLS: true,
ClientCertAuth: false,
ClientCertFile: "",
ClientKeyFile: "",
ClientTrustedCAFile: "",
ListenPeerURLs: []string{"https://127.0.0.1:1380"},
AdvertisePeerURLs: []string{"https://127.0.0.1:1381"},
PeerAutoTLS: true,
PeerClientCertAuth: false,
PeerCertFile: "",
PeerKeyFile: "",
PeerTrustedCAFile: "",
InitialCluster: "s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381",
InitialClusterState: "new",
InitialClusterToken: "tkn",
SnapshotCount: 2000,
QuotaBackendBytes: 10740000000,
PreVote: true,
InitialCorruptCheck: true,
Logger: "zap",
LogOutputs: []string{"/tmp/etcd-functional-1/etcd.log"},
LogLevel: "info",
SocketReuseAddress: true,
SocketReusePort: true,
},
ClientCertData: "",
ClientCertPath: "",
ClientKeyData: "",
ClientKeyPath: "",
ClientTrustedCAData: "",
ClientTrustedCAPath: "",
PeerCertData: "",
PeerCertPath: "",
PeerKeyData: "",
PeerKeyPath: "",
PeerTrustedCAData: "",
PeerTrustedCAPath: "",
SnapshotPath: "/tmp/etcd-functional-1.snapshot.db",
},
{
EtcdExec: "./bin/etcd",
AgentAddr: "127.0.0.1:29027",
FailpointHTTPAddr: "http://127.0.0.1:7382",
BaseDir: "/tmp/etcd-functional-2",
EtcdClientProxy: false,
EtcdPeerProxy: true,
EtcdClientEndpoint: "127.0.0.1:2379",
Etcd: &rpcpb.Etcd{
Name: "s2",
DataDir: "/tmp/etcd-functional-2/etcd.data",
WALDir: "/tmp/etcd-functional-2/etcd.data/member/wal",
HeartbeatIntervalMs: 100,
ElectionTimeoutMs: 1000,
ListenClientURLs: []string{"https://127.0.0.1:2379"},
AdvertiseClientURLs: []string{"https://127.0.0.1:2379"},
ClientAutoTLS: true,
ClientCertAuth: false,
ClientCertFile: "",
ClientKeyFile: "",
ClientTrustedCAFile: "",
ListenPeerURLs: []string{"https://127.0.0.1:2380"},
AdvertisePeerURLs: []string{"https://127.0.0.1:2381"},
PeerAutoTLS: true,
PeerClientCertAuth: false,
PeerCertFile: "",
PeerKeyFile: "",
PeerTrustedCAFile: "",
InitialCluster: "s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381",
InitialClusterState: "new",
InitialClusterToken: "tkn",
SnapshotCount: 2000,
QuotaBackendBytes: 10740000000,
PreVote: true,
InitialCorruptCheck: true,
Logger: "zap",
LogOutputs: []string{"/tmp/etcd-functional-2/etcd.log"},
LogLevel: "info",
SocketReuseAddress: true,
SocketReusePort: true,
},
ClientCertData: "",
ClientCertPath: "",
ClientKeyData: "",
ClientKeyPath: "",
ClientTrustedCAData: "",
ClientTrustedCAPath: "",
PeerCertData: "",
PeerCertPath: "",
PeerKeyData: "",
PeerKeyPath: "",
PeerTrustedCAData: "",
PeerTrustedCAPath: "",
SnapshotPath: "/tmp/etcd-functional-2.snapshot.db",
},
{
EtcdExec: "./bin/etcd",
AgentAddr: "127.0.0.1:39027",
FailpointHTTPAddr: "http://127.0.0.1:7383",
BaseDir: "/tmp/etcd-functional-3",
EtcdClientProxy: false,
EtcdPeerProxy: true,
EtcdClientEndpoint: "127.0.0.1:3379",
Etcd: &rpcpb.Etcd{
Name: "s3",
DataDir: "/tmp/etcd-functional-3/etcd.data",
WALDir: "/tmp/etcd-functional-3/etcd.data/member/wal",
HeartbeatIntervalMs: 100,
ElectionTimeoutMs: 1000,
ListenClientURLs: []string{"https://127.0.0.1:3379"},
AdvertiseClientURLs: []string{"https://127.0.0.1:3379"},
ClientAutoTLS: true,
ClientCertAuth: false,
ClientCertFile: "",
ClientKeyFile: "",
ClientTrustedCAFile: "",
ListenPeerURLs: []string{"https://127.0.0.1:3380"},
AdvertisePeerURLs: []string{"https://127.0.0.1:3381"},
PeerAutoTLS: true,
PeerClientCertAuth: false,
PeerCertFile: "",
PeerKeyFile: "",
PeerTrustedCAFile: "",
InitialCluster: "s1=https://127.0.0.1:1381,s2=https://127.0.0.1:2381,s3=https://127.0.0.1:3381",
InitialClusterState: "new",
InitialClusterToken: "tkn",
SnapshotCount: 2000,
QuotaBackendBytes: 10740000000,
PreVote: true,
InitialCorruptCheck: true,
Logger: "zap",
LogOutputs: []string{"/tmp/etcd-functional-3/etcd.log"},
LogLevel: "info",
SocketReuseAddress: true,
SocketReusePort: true,
},
ClientCertData: "",
ClientCertPath: "",
ClientKeyData: "",
ClientKeyPath: "",
ClientTrustedCAData: "",
ClientTrustedCAPath: "",
PeerCertData: "",
PeerCertPath: "",
PeerKeyData: "",
PeerKeyPath: "",
PeerTrustedCAData: "",
PeerTrustedCAPath: "",
SnapshotPath: "/tmp/etcd-functional-3.snapshot.db",
},
},
Tester: &rpcpb.Tester{
DataDir: "/tmp/etcd-tester-data",
Network: "tcp",
Addr: "127.0.0.1:9028",
DelayLatencyMs: 5000,
DelayLatencyMsRv: 500,
UpdatedDelayLatencyMs: 5000,
RoundLimit: 1,
ExitOnCaseFail: true,
EnablePprof: true,
CaseDelayMs: 7000,
CaseShuffle: true,
Cases: []string{
"SIGTERM_ONE_FOLLOWER",
"SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
"SIGTERM_LEADER",
"SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT",
"SIGTERM_QUORUM",
"SIGTERM_ALL",
"SIGQUIT_AND_REMOVE_ONE_FOLLOWER",
"SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
// "SIGQUIT_AND_REMOVE_LEADER",
// "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT",
// "SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_LEADER_SNAPSHOT_FROM_SCRATCH",
// "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER",
// "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
"BLACKHOLE_PEER_PORT_TX_RX_LEADER",
"BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT",
"BLACKHOLE_PEER_PORT_TX_RX_QUORUM",
"BLACKHOLE_PEER_PORT_TX_RX_ALL",
// "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER",
// "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER",
// "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
// "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
"DELAY_PEER_PORT_TX_RX_LEADER",
"RANDOM_DELAY_PEER_PORT_TX_RX_LEADER",
"DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT",
"RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT",
"DELAY_PEER_PORT_TX_RX_QUORUM",
"RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM",
"DELAY_PEER_PORT_TX_RX_ALL",
"RANDOM_DELAY_PEER_PORT_TX_RX_ALL",
"NO_FAIL_WITH_STRESS",
"NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS",
},
FailpointCommands: []string{`panic("etcd-tester")`},
RunnerExecPath: "./bin/etcd-runner",
ExternalExecPath: "",
Stressers: []*rpcpb.Stresser{
{Type: "KV_WRITE_SMALL", Weight: 0.35},
{Type: "KV_WRITE_LARGE", Weight: 0.002},
{Type: "KV_READ_ONE_KEY", Weight: 0.07},
{Type: "KV_READ_RANGE", Weight: 0.07},
{Type: "KV_DELETE_ONE_KEY", Weight: 0.07},
{Type: "KV_DELETE_RANGE", Weight: 0.07},
{Type: "KV_TXN_WRITE_DELETE", Weight: 0.35},
{Type: "LEASE", Weight: 0.0},
},
Checkers: []string{"KV_HASH", "LEASE_EXPIRE"},
StressKeySize: 100,
StressKeySizeLarge: 32769,
StressKeySuffixRange: 250000,
StressKeySuffixRangeTxn: 100,
StressKeyTxnOps: 10,
StressClients: 100,
StressQPS: 2000,
},
}
logger := zaptest.NewLogger(t)
defer logger.Sync()
cfg, err := read(logger, "../functional.yaml")
if err != nil {
t.Fatal(err)
}
cfg.lg = nil
if !reflect.DeepEqual(exp, cfg) {
t.Fatalf(`exp != cfg:
expected %+v
got %+v`, exp, cfg)
}
cfg.lg = logger
cfg.updateCases()
fs1 := cfg.listCases()
cfg.shuffleCases()
fs2 := cfg.listCases()
if reflect.DeepEqual(fs1, fs2) {
t.Fatalf("expected shuffled failure cases, got %q", fs2)
}
cfg.shuffleCases()
fs3 := cfg.listCases()
if reflect.DeepEqual(fs2, fs3) {
t.Fatalf("expected reshuffled failure cases from %q, got %q", fs2, fs3)
}
// shuffling visits every case exactly once,
// so the failure cases must be equal when sorted
sort.Strings(fs1)
sort.Strings(fs2)
sort.Strings(fs3)
if !reflect.DeepEqual(fs1, fs2) {
t.Fatalf("expected %q, got %q", fs1, fs2)
}
if !reflect.DeepEqual(fs2, fs3) {
t.Fatalf("expected %q, got %q", fs2, fs3)
}
}


@ -1,16 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package tester implements functional-tester tester server.
package tester


@ -1,83 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"fmt"
"sort"
"github.com/prometheus/client_golang/prometheus"
)
var (
caseTotal = make(map[string]int)
caseTotalCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "funcational_tester",
Name: "case_total",
Help: "Total number of finished test cases",
},
[]string{"desc"},
)
caseFailedTotalCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "funcational_tester",
Name: "case_failed_total",
Help: "Total number of failed test cases",
},
[]string{"desc"},
)
roundTotalCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "funcational_tester",
Name: "round_total",
Help: "Total number of finished test rounds.",
})
roundFailedTotalCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "funcational_tester",
Name: "round_failed_total",
Help: "Total number of failed test rounds.",
})
)
func init() {
prometheus.MustRegister(caseTotalCounter)
prometheus.MustRegister(caseFailedTotalCounter)
prometheus.MustRegister(roundTotalCounter)
prometheus.MustRegister(roundFailedTotalCounter)
}
func printReport() {
rows := make([]string, 0, len(caseTotal))
for k, v := range caseTotal {
rows = append(rows, fmt.Sprintf("%s: %d", k, v))
}
sort.Strings(rows)
println()
for _, row := range rows {
fmt.Println(row)
}
println()
}


@ -1,180 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"fmt"
"time"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
)
// Stresser defines stressing client operations.
type Stresser interface {
// Stress starts to stress the etcd cluster
Stress() error
// Pause stops the stresser from sending requests to etcd. Resume by calling Stress.
Pause() map[string]int
// Close releases all of the Stresser's resources.
Close() map[string]int
// ModifiedKeys reports the number of keys created and deleted by the stresser
ModifiedKeys() int64
}
// newStresser creates stressers for a member from the configured stresser types.
func newStresser(clus *Cluster, m *rpcpb.Member) (stressers []Stresser) {
// TODO: Overly intensive stressing clients can panic an etcd member
// with an 'out of memory' error. Put rate limits on the server side.
ks := &keyStresser{
lg: clus.lg,
m: m,
keySize: int(clus.Tester.StressKeySize),
keyLargeSize: int(clus.Tester.StressKeySizeLarge),
keySuffixRange: int(clus.Tester.StressKeySuffixRange),
keyTxnSuffixRange: int(clus.Tester.StressKeySuffixRangeTxn),
keyTxnOps: int(clus.Tester.StressKeyTxnOps),
clientsN: int(clus.Tester.StressClients),
rateLimiter: clus.rateLimiter,
}
ksExist := false
for _, s := range clus.Tester.Stressers {
clus.lg.Info(
"creating stresser",
zap.String("type", s.Type),
zap.Float64("weight", s.Weight),
zap.String("endpoint", m.EtcdClientEndpoint),
)
switch s.Type {
case "KV_WRITE_SMALL":
ksExist = true
ks.weightKVWriteSmall = s.Weight
case "KV_WRITE_LARGE":
ksExist = true
ks.weightKVWriteLarge = s.Weight
case "KV_READ_ONE_KEY":
ksExist = true
ks.weightKVReadOneKey = s.Weight
case "KV_READ_RANGE":
ksExist = true
ks.weightKVReadRange = s.Weight
case "KV_DELETE_ONE_KEY":
ksExist = true
ks.weightKVDeleteOneKey = s.Weight
case "KV_DELETE_RANGE":
ksExist = true
ks.weightKVDeleteRange = s.Weight
case "KV_TXN_WRITE_DELETE":
ksExist = true
ks.weightKVTxnWriteDelete = s.Weight
case "LEASE":
stressers = append(stressers, &leaseStresser{
stype: rpcpb.StresserType_LEASE,
lg: clus.lg,
m: m,
numLeases: 10, // TODO: configurable
keysPerLease: 10, // TODO: configurable
rateLimiter: clus.rateLimiter,
})
case "ELECTION_RUNNER":
reqRate := 100
args := []string{
"election",
fmt.Sprintf("%v", time.Now().UnixNano()), // election name as current nano time
"--dial-timeout=10s",
"--endpoints", m.EtcdClientEndpoint,
"--total-client-connections=10",
"--rounds=0", // runs forever
"--req-rate", fmt.Sprintf("%v", reqRate),
}
stressers = append(stressers, newRunnerStresser(
rpcpb.StresserType_ELECTION_RUNNER,
m.EtcdClientEndpoint,
clus.lg,
clus.Tester.RunnerExecPath,
args,
clus.rateLimiter,
reqRate,
))
case "WATCH_RUNNER":
reqRate := 100
args := []string{
"watcher",
"--prefix", fmt.Sprintf("%v", time.Now().UnixNano()), // prefix all keys with nano time
"--total-keys=1",
"--total-prefixes=1",
"--watch-per-prefix=1",
"--endpoints", m.EtcdClientEndpoint,
"--rounds=0", // runs forever
"--req-rate", fmt.Sprintf("%v", reqRate),
}
stressers = append(stressers, newRunnerStresser(
rpcpb.StresserType_WATCH_RUNNER,
m.EtcdClientEndpoint,
clus.lg,
clus.Tester.RunnerExecPath,
args,
clus.rateLimiter,
reqRate,
))
case "LOCK_RACER_RUNNER":
reqRate := 100
args := []string{
"lock-racer",
fmt.Sprintf("%v", time.Now().UnixNano()), // locker name as current nano time
"--endpoints", m.EtcdClientEndpoint,
"--total-client-connections=10",
"--rounds=0", // runs forever
"--req-rate", fmt.Sprintf("%v", reqRate),
}
stressers = append(stressers, newRunnerStresser(
rpcpb.StresserType_LOCK_RACER_RUNNER,
m.EtcdClientEndpoint,
clus.lg,
clus.Tester.RunnerExecPath,
args,
clus.rateLimiter,
reqRate,
))
case "LEASE_RUNNER":
args := []string{
"lease-renewer",
"--ttl=30",
"--endpoints", m.EtcdClientEndpoint,
}
stressers = append(stressers, newRunnerStresser(
rpcpb.StresserType_LEASE_RUNNER,
m.EtcdClientEndpoint,
clus.lg,
clus.Tester.RunnerExecPath,
args,
clus.rateLimiter,
0,
))
}
}
if ksExist {
return append(stressers, ks)
}
return stressers
}


@ -1,82 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import "sync"
// compositeStresser implements a Stresser that runs a slice of
// stressing clients concurrently.
type compositeStresser struct {
stressers []Stresser
}
func (cs *compositeStresser) Stress() error {
for i, s := range cs.stressers {
if err := s.Stress(); err != nil {
for j := 0; j < i; j++ {
cs.stressers[j].Close()
}
return err
}
}
return nil
}
func (cs *compositeStresser) Pause() (ems map[string]int) {
var emu sync.Mutex
ems = make(map[string]int)
var wg sync.WaitGroup
wg.Add(len(cs.stressers))
for i := range cs.stressers {
go func(s Stresser) {
defer wg.Done()
errs := s.Pause()
for k, v := range errs {
emu.Lock()
ems[k] += v
emu.Unlock()
}
}(cs.stressers[i])
}
wg.Wait()
return ems
}
func (cs *compositeStresser) Close() (ems map[string]int) {
var emu sync.Mutex
ems = make(map[string]int)
var wg sync.WaitGroup
wg.Add(len(cs.stressers))
for i := range cs.stressers {
go func(s Stresser) {
defer wg.Done()
errs := s.Close()
for k, v := range errs {
emu.Lock()
ems[k] += v
emu.Unlock()
}
}(cs.stressers[i])
}
wg.Wait()
return ems
}
func (cs *compositeStresser) ModifiedKeys() (modifiedKey int64) {
for _, stress := range cs.stressers {
modifiedKey += stress.ModifiedKeys()
}
return modifiedKey
}


@ -1,361 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"context"
"fmt"
"math/rand"
"reflect"
"sync"
"sync/atomic"
"time"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/server/v3/etcdserver/errors"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.etcd.io/raft/v3"
"go.uber.org/zap"
"golang.org/x/time/rate"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)
type keyStresser struct {
lg *zap.Logger
m *rpcpb.Member
weightKVWriteSmall float64
weightKVWriteLarge float64
weightKVReadOneKey float64
weightKVReadRange float64
weightKVDeleteOneKey float64
weightKVDeleteRange float64
weightKVTxnWriteDelete float64
keySize int
keyLargeSize int
keySuffixRange int
keyTxnSuffixRange int
keyTxnOps int
rateLimiter *rate.Limiter
wg sync.WaitGroup
clientsN int
ctx context.Context
cancel func()
cli *clientv3.Client
emu sync.RWMutex
ems map[string]int
paused bool
// atomicModifiedKeys records the number of keys created and deleted by the stresser.
atomicModifiedKeys int64
stressTable *stressTable
}
func (s *keyStresser) Stress() error {
var err error
s.cli, err = s.m.CreateEtcdClient(grpc.WithBackoffMaxDelay(1 * time.Second))
if err != nil {
return fmt.Errorf("%v (%q)", err, s.m.EtcdClientEndpoint)
}
s.ctx, s.cancel = context.WithCancel(context.Background())
s.wg.Add(s.clientsN)
s.stressTable = createStressTable([]stressEntry{
{weight: s.weightKVWriteSmall, f: newStressPut(s.cli, s.keySuffixRange, s.keySize)},
{weight: s.weightKVWriteLarge, f: newStressPut(s.cli, s.keySuffixRange, s.keyLargeSize)},
{weight: s.weightKVReadOneKey, f: newStressRange(s.cli, s.keySuffixRange)},
{weight: s.weightKVReadRange, f: newStressRangeInterval(s.cli, s.keySuffixRange)},
{weight: s.weightKVDeleteOneKey, f: newStressDelete(s.cli, s.keySuffixRange)},
{weight: s.weightKVDeleteRange, f: newStressDeleteInterval(s.cli, s.keySuffixRange)},
{weight: s.weightKVTxnWriteDelete, f: newStressTxn(s.cli, s.keyTxnSuffixRange, s.keyTxnOps)},
})
s.emu.Lock()
s.paused = false
s.ems = make(map[string]int, 100)
s.emu.Unlock()
for i := 0; i < s.clientsN; i++ {
go s.run()
}
s.lg.Info(
"stress START",
zap.String("stress-type", "KV"),
zap.String("endpoint", s.m.EtcdClientEndpoint),
)
return nil
}
func (s *keyStresser) run() {
defer s.wg.Done()
for {
if err := s.rateLimiter.Wait(s.ctx); err == context.Canceled {
return
}
// TODO: a 10-second timeout is enough to cover leader failure
// and immediate leader election. Find out in what other cases this
// could time out.
sctx, scancel := context.WithTimeout(s.ctx, 10*time.Second)
modifiedKeys, err := s.stressTable.choose()(sctx)
scancel()
if err == nil {
atomic.AddInt64(&s.atomicModifiedKeys, modifiedKeys)
continue
}
if !s.isRetryableError(err) {
return
}
// only record errors before pausing stressers
s.emu.Lock()
if !s.paused {
s.ems[err.Error()]++
}
s.emu.Unlock()
}
}
func (s *keyStresser) isRetryableError(err error) bool {
switch rpctypes.ErrorDesc(err) {
// retryable
case context.DeadlineExceeded.Error():
// This retries when a request is triggered at the same time as
// a leader failure. When we terminate the leader, requests to
// that leader cannot be processed and time out. Requests to
// followers cannot be forwarded to the old leader either, so they
// time out as well. We want to keep stressing until the cluster
// elects a new leader and starts processing requests again.
return true
case errors.ErrTimeoutDueToLeaderFail.Error(), errors.ErrTimeout.Error():
// This retries when a request is triggered at the same time as a
// leader failure and follower nodes receive timeout errors from
// losing their leader. Followers should retry connecting to the
// new leader.
return true
case errors.ErrStopped.Error():
// one of the etcd nodes stopped from failure injection
return true
case rpctypes.ErrNotCapable.Error():
// capability check has not been done (in the beginning)
return true
case rpctypes.ErrTooManyRequests.Error():
// hitting the recovering member.
return true
case raft.ErrProposalDropped.Error():
// removed member, or leadership has changed (old leader got raftpb.MsgProp)
return true
// not retryable.
case context.Canceled.Error():
// from the stresser's cancel (via Close/Pause):
return false
}
if status.Convert(err).Code() == codes.Unavailable {
// gRPC connection errors are translated to status.Unavailable
return true
}
s.lg.Warn(
"stress run exiting",
zap.String("stress-type", "KV"),
zap.String("endpoint", s.m.EtcdClientEndpoint),
zap.String("error-type", reflect.TypeOf(err).String()),
zap.String("error-desc", rpctypes.ErrorDesc(err)),
zap.Error(err),
)
return false
}
func (s *keyStresser) Pause() map[string]int {
return s.Close()
}
func (s *keyStresser) Close() map[string]int {
s.cancel()
s.cli.Close()
s.wg.Wait()
s.emu.Lock()
s.paused = true
ess := s.ems
s.ems = make(map[string]int, 100)
s.emu.Unlock()
s.lg.Info(
"stress STOP",
zap.String("stress-type", "KV"),
zap.String("endpoint", s.m.EtcdClientEndpoint),
)
return ess
}
func (s *keyStresser) ModifiedKeys() int64 {
return atomic.LoadInt64(&s.atomicModifiedKeys)
}
type stressFunc func(ctx context.Context) (modifiedKeys int64, err error)
type stressEntry struct {
weight float64
f stressFunc
}
type stressTable struct {
entries []stressEntry
sumWeights float64
}
func createStressTable(entries []stressEntry) *stressTable {
st := stressTable{entries: entries}
for _, entry := range st.entries {
st.sumWeights += entry.weight
}
return &st
}
func (st *stressTable) choose() stressFunc {
v := rand.Float64() * st.sumWeights
var sum float64
var idx int
for i := range st.entries {
sum += st.entries[i].weight
if sum >= v {
idx = i
break
}
}
return st.entries[idx].f
}
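// Editorial sketch (hypothetical, not part of the original file):
// choose implements weighted random selection. With weights 0.9 and
// 0.1, the first stressFunc is returned ~90% of the time, since v is
// uniform in [0, sumWeights) and the loop takes the first entry whose
// cumulative weight reaches v.
func exampleWeightedChoice() stressFunc {
noop := func(ctx context.Context) (int64, error) { return 0, nil }
st := createStressTable([]stressEntry{
{weight: 0.9, f: noop}, // picked on ~90% of calls
{weight: 0.1, f: noop}, // picked on ~10% of calls
})
return st.choose()
}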
func newStressPut(cli *clientv3.Client, keySuffixRange, keySize int) stressFunc {
return func(ctx context.Context) (int64, error) {
_, err := cli.Put(
ctx,
fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)),
string(randBytes(keySize)),
)
return 1, err
}
}
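// Note (editorial): the put, read, and delete stress functions all
// draw from the same "foo%016x" key space, so they intentionally
// contend on overlapping keys; only the txn stresser uses its own
// "/k%03d" keys.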
func newStressTxn(cli *clientv3.Client, keyTxnSuffixRange, txnOps int) stressFunc {
keys := make([]string, keyTxnSuffixRange)
for i := range keys {
keys[i] = fmt.Sprintf("/k%03d", i)
}
return writeTxn(cli, keys, txnOps)
}
func writeTxn(cli *clientv3.Client, keys []string, txnOps int) stressFunc {
return func(ctx context.Context) (int64, error) {
ks := make(map[string]struct{}, txnOps)
for len(ks) != txnOps {
ks[keys[rand.Intn(len(keys))]] = struct{}{}
}
selected := make([]string, 0, txnOps)
for k := range ks {
selected = append(selected, k)
}
com, delOp, putOp := getTxnOps(selected[0], "bar00")
thenOps := []clientv3.Op{delOp}
elseOps := []clientv3.Op{putOp}
for i := 1; i < txnOps; i++ { // nested txns
k, v := selected[i], fmt.Sprintf("bar%02d", i)
com, delOp, putOp = getTxnOps(k, v)
txnOp := clientv3.OpTxn(
[]clientv3.Cmp{com},
[]clientv3.Op{delOp},
[]clientv3.Op{putOp},
)
thenOps = append(thenOps, txnOp)
elseOps = append(elseOps, txnOp)
}
_, err := cli.Txn(ctx).
If(com).
Then(thenOps...).
Else(elseOps...).
Commit()
return int64(txnOps), err
}
}
func getTxnOps(k, v string) (
cmp clientv3.Cmp,
dop clientv3.Op,
pop clientv3.Op) {
// if key exists (version > 0)
cmp = clientv3.Compare(clientv3.Version(k), ">", 0)
dop = clientv3.OpDelete(k)
pop = clientv3.OpPut(k, v)
return cmp, dop, pop
}
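// Editorial sketch (hypothetical helper, not part of the original
// file): each nested txn built from getTxnOps deletes the key when it
// exists and creates it otherwise, so repeated runs flip key state.
func exampleTxnOp() clientv3.Op {
cmp, del, put := getTxnOps("/k001", "bar01")
// If(version(/k001) > 0) Then(delete /k001) Else(put /k001 "bar01")
return clientv3.OpTxn([]clientv3.Cmp{cmp}, []clientv3.Op{del}, []clientv3.Op{put})
}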
func newStressRange(cli *clientv3.Client, keySuffixRange int) stressFunc {
return func(ctx context.Context) (int64, error) {
_, err := cli.Get(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
return 0, err
}
}
func newStressRangeInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
return func(ctx context.Context) (int64, error) {
start := rand.Intn(keySuffixRange)
end := start + 500
_, err := cli.Get(
ctx,
fmt.Sprintf("foo%016x", start),
clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
)
return 0, err
}
}
func newStressDelete(cli *clientv3.Client, keySuffixRange int) stressFunc {
return func(ctx context.Context) (int64, error) {
_, err := cli.Delete(ctx, fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)))
return 1, err
}
}
func newStressDeleteInterval(cli *clientv3.Client, keySuffixRange int) stressFunc {
return func(ctx context.Context) (int64, error) {
start := rand.Intn(keySuffixRange)
end := start + 500
resp, err := cli.Delete(ctx,
fmt.Sprintf("foo%016x", start),
clientv3.WithRange(fmt.Sprintf("foo%016x", end)),
)
if err == nil {
return resp.Deleted, nil
}
return 0, err
}
}


@ -1,512 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"context"
"fmt"
"math/rand"
"sync"
"sync/atomic"
"time"
"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
"golang.org/x/time/rate"
"google.golang.org/grpc"
)
const (
// time to live for lease
defaultTTL = 120
defaultTTLShort = 2
)
type leaseStresser struct {
stype rpcpb.StresserType
lg *zap.Logger
m *rpcpb.Member
cli *clientv3.Client
ctx context.Context
cancel func()
rateLimiter *rate.Limiter
// atomicModifiedKey records the number of keys created and deleted during a test case
atomicModifiedKey int64
numLeases int
keysPerLease int
aliveLeases *atomicLeases
alivedLeasesWithShortTTL *atomicLeases
revokedLeases *atomicLeases
// The tester doesn't keep the shortLivedLeases alive,
// so they expire after the TTL.
shortLivedLeases *atomicLeases
runWg sync.WaitGroup
aliveWg sync.WaitGroup
}
type atomicLeases struct {
// rwLock is used to protect read/write access of leases map
// which are accessed and modified by different goroutines.
rwLock sync.RWMutex
leases map[int64]time.Time
}
func (al *atomicLeases) add(leaseID int64, t time.Time) {
al.rwLock.Lock()
al.leases[leaseID] = t
al.rwLock.Unlock()
}
func (al *atomicLeases) update(leaseID int64, t time.Time) {
al.rwLock.Lock()
_, ok := al.leases[leaseID]
if ok {
al.leases[leaseID] = t
}
al.rwLock.Unlock()
}
func (al *atomicLeases) read(leaseID int64) (rv time.Time, ok bool) {
al.rwLock.RLock()
rv, ok = al.leases[leaseID]
al.rwLock.RUnlock()
return rv, ok
}
func (al *atomicLeases) remove(leaseID int64) {
al.rwLock.Lock()
delete(al.leases, leaseID)
al.rwLock.Unlock()
}
func (al *atomicLeases) getLeasesMap() map[int64]time.Time {
leasesCopy := make(map[int64]time.Time)
al.rwLock.RLock()
for k, v := range al.leases {
leasesCopy[k] = v
}
al.rwLock.RUnlock()
return leasesCopy
}
func (ls *leaseStresser) setupOnce() error {
if ls.aliveLeases != nil {
return nil
}
if ls.numLeases == 0 {
panic("expect numLeases to be set")
}
if ls.keysPerLease == 0 {
panic("expect keysPerLease to be set")
}
ls.aliveLeases = &atomicLeases{leases: make(map[int64]time.Time)}
return nil
}
func (ls *leaseStresser) Stress() error {
ls.lg.Info(
"stress START",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
if err := ls.setupOnce(); err != nil {
return err
}
ctx, cancel := context.WithCancel(context.Background())
ls.ctx = ctx
ls.cancel = cancel
cli, err := ls.m.CreateEtcdClient(grpc.WithBackoffMaxDelay(1 * time.Second))
if err != nil {
return fmt.Errorf("%v (%s)", err, ls.m.EtcdClientEndpoint)
}
ls.cli = cli
ls.revokedLeases = &atomicLeases{leases: make(map[int64]time.Time)}
ls.shortLivedLeases = &atomicLeases{leases: make(map[int64]time.Time)}
ls.alivedLeasesWithShortTTL = &atomicLeases{leases: make(map[int64]time.Time)}
ls.runWg.Add(1)
go ls.run()
return nil
}
func (ls *leaseStresser) run() {
defer ls.runWg.Done()
ls.restartKeepAlives()
for {
// the number of keys created and deleted is roughly 2x the number of created keys per iteration.
// the rateLimiter therefore consumes 2x ls.numLeases*ls.keysPerLease tokens, where each token represents a create/delete operation on a key.
err := ls.rateLimiter.WaitN(ls.ctx, 2*ls.numLeases*ls.keysPerLease)
if err == context.Canceled {
return
}
ls.lg.Debug(
"stress creating leases",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
ls.createLeases()
ls.lg.Debug(
"stress created leases",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
ls.lg.Debug(
"stress dropped leases",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
ls.randomlyDropLeases()
ls.lg.Debug(
"stress dropped leases",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
}
}
func (ls *leaseStresser) restartKeepAlives() {
f := func(leases *atomicLeases) {
for leaseID := range leases.getLeasesMap() {
ls.aliveWg.Add(1)
go func(id int64) {
ls.keepLeaseAlive(id)
}(leaseID)
}
}
f(ls.aliveLeases)
f(ls.alivedLeasesWithShortTTL)
}
func (ls *leaseStresser) createLeases() {
ls.createAliveLeasesWithShortTTL()
ls.createAliveLeases()
ls.createShortLivedLeases()
}
func (ls *leaseStresser) createAliveLeases() {
neededLeases := ls.numLeases - len(ls.aliveLeases.getLeasesMap())
var wg sync.WaitGroup
for i := 0; i < neededLeases; i++ {
wg.Add(1)
go func() {
defer wg.Done()
leaseID, err := ls.createLeaseWithKeys(defaultTTL)
if err != nil {
ls.lg.Debug(
"createLeaseWithKeys failed",
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.Error(err),
)
return
}
ls.aliveLeases.add(leaseID, time.Now())
// keep track of all the keep lease alive goroutines
ls.aliveWg.Add(1)
go ls.keepLeaseAlive(leaseID)
}()
}
wg.Wait()
}
func (ls *leaseStresser) createAliveLeasesWithShortTTL() {
neededLeases := 2
var wg sync.WaitGroup
for i := 0; i < neededLeases; i++ {
wg.Add(1)
go func() {
defer wg.Done()
leaseID, err := ls.createLeaseWithKeys(defaultTTLShort)
if err != nil {
ls.lg.Debug(
"createLeaseWithKeys failed",
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.Error(err),
)
return
}
ls.lg.Debug("createAliveLeasesWithShortTTL", zap.Int64("lease-id", leaseID))
ls.alivedLeasesWithShortTTL.add(leaseID, time.Now())
// keep track of all the keep lease alive goroutines
ls.aliveWg.Add(1)
go ls.keepLeaseAlive(leaseID)
}()
}
wg.Wait()
}
func (ls *leaseStresser) createShortLivedLeases() {
// one round of createLeases() might not create all the short-lived leases we want due to failures.
// thus, we create the remaining short-lived leases in future rounds.
neededLeases := ls.numLeases - len(ls.shortLivedLeases.getLeasesMap())
var wg sync.WaitGroup
for i := 0; i < neededLeases; i++ {
wg.Add(1)
go func() {
defer wg.Done()
leaseID, err := ls.createLeaseWithKeys(defaultTTLShort)
if err != nil {
return
}
ls.shortLivedLeases.add(leaseID, time.Now())
}()
}
wg.Wait()
}
func (ls *leaseStresser) createLeaseWithKeys(ttl int64) (int64, error) {
leaseID, err := ls.createLease(ttl)
if err != nil {
ls.lg.Debug(
"createLease failed",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.Error(err),
)
return -1, err
}
ls.lg.Debug(
"createLease created lease",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
if err := ls.attachKeysWithLease(leaseID); err != nil {
return -1, err
}
return leaseID, nil
}
func (ls *leaseStresser) randomlyDropLeases() {
var wg sync.WaitGroup
for l := range ls.aliveLeases.getLeasesMap() {
wg.Add(1)
go func(leaseID int64) {
defer wg.Done()
dropped, err := ls.randomlyDropLease(leaseID)
// if randomlyDropLease encountered an error, such as the context being canceled, remove the lease from aliveLeases
// because we can't tell whether the lease was dropped or not.
if err != nil {
ls.lg.Debug(
"randomlyDropLease failed",
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(err),
)
ls.aliveLeases.remove(leaseID)
return
}
if !dropped {
return
}
ls.lg.Debug(
"randomlyDropLease dropped a lease",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
ls.revokedLeases.add(leaseID, time.Now())
ls.aliveLeases.remove(leaseID)
}(l)
}
wg.Wait()
}
func (ls *leaseStresser) createLease(ttl int64) (int64, error) {
resp, err := ls.cli.Grant(ls.ctx, ttl)
if err != nil {
return -1, err
}
return int64(resp.ID), nil
}
func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
defer ls.aliveWg.Done()
ctx, cancel := context.WithCancel(ls.ctx)
stream, err := ls.cli.KeepAlive(ctx, clientv3.LeaseID(leaseID))
if err != nil {
ls.lg.Error(
"keepLeaseAlive lease creates stream error",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(err),
)
}
defer cancel()
for {
select {
case <-time.After(500 * time.Millisecond):
case <-ls.ctx.Done():
ls.lg.Debug(
"keepLeaseAlive context canceled",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(ls.ctx.Err()),
)
// It is possible that a lease expires during the invariant-checking
// phase but not during the keepLeaseAlive() phase: an alive lease
// may be just about to expire when keepLeaseAlive() exits, and then
// expire while invariants are checked. To circumvent this, we check
// each lease before the keepalive loop exits to see if it has been
// renewed within the last TTL/2. If it has, invariant checking has
// at least TTL/2 before the lease expires, which is long enough for
// the checking to finish. If it has not, we remove the lease from
// the alive map so that it doesn't expire during invariant checking.
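// For example, with defaultTTL = 120, a lease renewed within the last
// 60 seconds stays in aliveLeases; otherwise it is removed here.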
renewTime, ok := ls.aliveLeases.read(leaseID)
if ok && renewTime.Add(defaultTTL/2*time.Second).Before(time.Now()) {
ls.aliveLeases.remove(leaseID)
ls.lg.Debug(
"keepLeaseAlive lease has not been renewed, dropped it",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
}
return
}
ls.lg.Debug(
"keepLeaseAlive waiting on lease stream",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
leaseRenewTime := time.Now()
respRC := <-stream
if respRC == nil {
ls.lg.Debug(
"keepLeaseAlive received nil lease keepalive response",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
continue
}
// the lease expires once its TTL reaches 0;
// don't send keepalives if the lease has expired
if respRC.TTL <= 0 {
ls.lg.Debug(
"keepLeaseAlive stream received lease keepalive response TTL <= 0",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Int64("ttl", respRC.TTL),
)
ls.aliveLeases.remove(leaseID)
return
}
// renew lease timestamp only if lease is present
ls.lg.Debug(
"keepLeaseAlive renewed a lease",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
ls.aliveLeases.update(leaseID, leaseRenewTime)
}
}
// attachKeysWithLease attaches keys to the lease.
// each key is the concatenation of leaseID + '_' + '<order of key creation>',
// e.g. 5186835655248304152_0 for the first created key and 5186835655248304152_1 for the second
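// Editorial note: the txn below is retried until it commits, the
// lease is reported as not found, or the stresser's context is
// canceled.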
func (ls *leaseStresser) attachKeysWithLease(leaseID int64) error {
var txnPuts []clientv3.Op
for j := 0; j < ls.keysPerLease; j++ {
txnput := clientv3.OpPut(
fmt.Sprintf("%d%s%d", leaseID, "_", j),
"bar",
clientv3.WithLease(clientv3.LeaseID(leaseID)),
)
txnPuts = append(txnPuts, txnput)
}
// keep retrying until the lease is not found or the ctx is canceled
for ls.ctx.Err() == nil {
_, err := ls.cli.Txn(ls.ctx).Then(txnPuts...).Commit()
if err == nil {
// since all created keys will be deleted too, the number of operations on keys will be roughly 2x the number of created keys
atomic.AddInt64(&ls.atomicModifiedKey, 2*int64(ls.keysPerLease))
return nil
}
if rpctypes.Error(err) == rpctypes.ErrLeaseNotFound {
return err
}
}
return ls.ctx.Err()
}
// randomlyDropLease drops the lease only when rand.Intn(2) returns 0,
// giving a 50/50 chance of dropping a lease.
func (ls *leaseStresser) randomlyDropLease(leaseID int64) (bool, error) {
if rand.Intn(2) != 0 {
return false, nil
}
// keep retrying until the lease is dropped or the ctx is canceled
for ls.ctx.Err() == nil {
_, err := ls.cli.Revoke(ls.ctx, clientv3.LeaseID(leaseID))
if err == nil || rpctypes.Error(err) == rpctypes.ErrLeaseNotFound {
return true, nil
}
}
ls.lg.Debug(
"randomlyDropLease error",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(ls.ctx.Err()),
)
return false, ls.ctx.Err()
}
func (ls *leaseStresser) Pause() map[string]int {
return ls.Close()
}
func (ls *leaseStresser) Close() map[string]int {
ls.cancel()
ls.runWg.Wait()
ls.aliveWg.Wait()
ls.cli.Close()
ls.lg.Info(
"stress STOP",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
return nil
}
func (ls *leaseStresser) ModifiedKeys() int64 {
return atomic.LoadInt64(&ls.atomicModifiedKey)
}


@ -1,121 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"fmt"
"io"
"os/exec"
"syscall"
"go.etcd.io/etcd/tests/v3/functional/rpcpb"
"go.uber.org/zap"
"golang.org/x/time/rate"
)
type runnerStresser struct {
stype rpcpb.StresserType
etcdClientEndpoint string
lg *zap.Logger
cmd *exec.Cmd
cmdStr string
args []string
rl *rate.Limiter
reqRate int
errc chan error
donec chan struct{}
}
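// Editorial note: runnerStresser drives an external etcd-runner
// process: Stress resumes it with SIGCONT, Pause suspends it with
// SIGSTOP, and Close interrupts it with SIGINT. newRunnerStresser
// reserves reqRate from the shared rate limiter for as long as the
// runner is active.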
func newRunnerStresser(
stype rpcpb.StresserType,
ep string,
lg *zap.Logger,
cmdStr string,
args []string,
rl *rate.Limiter,
reqRate int,
) *runnerStresser {
rl.SetLimit(rl.Limit() - rate.Limit(reqRate))
return &runnerStresser{
stype: stype,
etcdClientEndpoint: ep,
lg: lg,
cmdStr: cmdStr,
args: args,
rl: rl,
reqRate: reqRate,
errc: make(chan error, 1),
donec: make(chan struct{}),
}
}
func (rs *runnerStresser) setupOnce() (err error) {
if rs.cmd != nil {
return nil
}
rs.cmd = exec.Command(rs.cmdStr, rs.args...)
stderr, err := rs.cmd.StderrPipe()
if err != nil {
return err
}
go func() {
defer close(rs.donec)
out, err := io.ReadAll(stderr)
if err != nil {
rs.errc <- err
} else {
rs.errc <- fmt.Errorf("(%v %v) stderr %v", rs.cmdStr, rs.args, string(out))
}
}()
return rs.cmd.Start()
}
func (rs *runnerStresser) Stress() (err error) {
rs.lg.Info(
"stress START",
zap.String("stress-type", rs.stype.String()),
)
if err = rs.setupOnce(); err != nil {
return err
}
return syscall.Kill(rs.cmd.Process.Pid, syscall.SIGCONT)
}
func (rs *runnerStresser) Pause() map[string]int {
rs.lg.Info(
"stress STOP",
zap.String("stress-type", rs.stype.String()),
)
syscall.Kill(rs.cmd.Process.Pid, syscall.SIGSTOP)
return nil
}
func (rs *runnerStresser) Close() map[string]int {
syscall.Kill(rs.cmd.Process.Pid, syscall.SIGINT)
rs.cmd.Wait()
<-rs.donec
rs.rl.SetLimit(rs.rl.Limit() + rate.Limit(rs.reqRate))
return nil
}
func (rs *runnerStresser) ModifiedKeys() int64 {
return 1
}


@ -1,79 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"errors"
"math/rand"
"net"
"net/url"
"strings"
)
func isValidURL(u string) bool {
_, err := url.Parse(u)
return err == nil
}
func getPort(addr string) (port string, err error) {
urlAddr, err := url.Parse(addr)
if err != nil {
return "", err
}
_, port, err = net.SplitHostPort(urlAddr.Host)
if err != nil {
return "", err
}
return port, nil
}
func getSameValue(vals map[string]int64) bool {
var rv int64
for _, v := range vals {
if rv == 0 {
rv = v
}
if rv != v {
return false
}
}
return true
}
func max(n1, n2 int64) int64 {
if n1 > n2 {
return n1
}
return n2
}
func errsToError(errs []error) error {
if len(errs) == 0 {
return nil
}
stringArr := make([]string, len(errs))
for i, err := range errs {
stringArr[i] = err.Error()
}
return errors.New(strings.Join(stringArr, ", "))
}
func randBytes(size int) []byte {
data := make([]byte, size)
for i := 0; i < size; i++ {
data[i] = byte(int('a') + rand.Intn(26))
}
return data
}
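// For illustration (editorial): randBytes(4) might return
// []byte("kqzw"), i.e. four random lowercase ASCII letters.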


@ -16,17 +16,12 @@ replace (
require (
github.com/anishathalye/porcupine v0.1.4
github.com/coreos/go-semver v0.3.1
github.com/dustin/go-humanize v1.0.1
github.com/gogo/protobuf v1.3.2
github.com/golang/protobuf v1.5.2
github.com/google/go-cmp v0.5.9
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
github.com/grpc-ecosystem/grpc-gateway v1.16.0
github.com/prometheus/client_golang v1.14.0
github.com/soheilhy/cmux v0.1.5
github.com/spf13/cobra v1.6.1
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.8.1
go.etcd.io/etcd/api/v3 v3.6.0-alpha.0
go.etcd.io/etcd/client/pkg/v3 v3.6.0-alpha.0
@ -47,7 +42,6 @@ require (
golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f
golang.org/x/time v0.0.0-20220609170525-579cf78fd858
google.golang.org/grpc v1.51.0
gopkg.in/yaml.v2 v2.4.0
)
require (
@ -61,11 +55,14 @@ require (
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
github.com/creack/pty v1.1.18 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/fatih/color v1.13.0 // indirect
github.com/go-logr/logr v1.2.3 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang-jwt/jwt/v4 v4.4.3 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/google/btree v1.1.2 // indirect
github.com/gorilla/websocket v1.4.2 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0 // indirect
@ -82,6 +79,8 @@ require (
github.com/prometheus/procfs v0.8.0 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/sirupsen/logrus v1.8.1 // indirect
github.com/spf13/cobra v1.6.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802 // indirect
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 // indirect
go.etcd.io/bbolt v1.3.7 // indirect
@ -97,6 +96,7 @@ require (
google.golang.org/genproto v0.0.0-20211118181313-81c1377c94b1 // indirect
google.golang.org/protobuf v1.28.1 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
sigs.k8s.io/json v0.0.0-20211020170558-c049b76a60c6 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect