tests/robustness: Document analysing watch issue
Signed-off-by: Marek Siarkowicz <siarkowicz@google.com>storage-doc
parent
f6bb874298
commit
1e41d95ab2
|
@ -53,9 +53,10 @@ Report includes multiple types of files:
|
||||||
|
|
||||||
### Example analysis of linearization issue
|
### Example analysis of linearization issue
|
||||||
|
|
||||||
Let's analyse issue [#14370].
|
Let's reproduce and analyse robustness test report for issue [#14370].
|
||||||
To reproduce the issue by yourself run `make test-robustness-issue14370`.
|
To reproduce the issue by yourself run `make test-robustness-issue14370`.
|
||||||
After a couple of tries robustness tests should report `Model is not linearizable` and save report locally.
|
After a couple of tries robustness tests should fail with a log `Model is not linearizable` and save report locally.
|
||||||
|
|
||||||
Linearization issues are easiest to analyse via history visualization.
|
Linearization issues are easiest to analyse via history visualization.
|
||||||
Open `/tmp/TestRobustness_Issue14370/history.html` file in your browser.
|
Open `/tmp/TestRobustness_Issue14370/history.html` file in your browser.
|
||||||
Jump to the error in linearization by clicking `[ jump to first error ]` on the top of the page.
|
Jump to the error in linearization by clicking `[ jump to first error ]` on the top of the page.
|
||||||
|
@ -69,3 +70,30 @@ Etcd guarantee that revision is non-decreasing, so this shows a bug in etcd as t
|
||||||
This is consistent with the root cause of [#14370] as it was issue with process crash causing last write to be lost.
|
This is consistent with the root cause of [#14370] as it was issue with process crash causing last write to be lost.
|
||||||
|
|
||||||
[#14370]: https://github.com/etcd-io/etcd/issues/14370
|
[#14370]: https://github.com/etcd-io/etcd/issues/14370
|
||||||
|
|
||||||
|
### Example analysis of watch issue
|
||||||
|
|
||||||
|
Let's reproduce and analyse robustness test report for issue [#15271].
|
||||||
|
To reproduce the issue by yourself run `make test-robustness-issue15271`.
|
||||||
|
After a couple of tries robustness tests should fail with a log `Broke watch guarantee` and save report locally.
|
||||||
|
|
||||||
|
Watch issues are easiest to analyse by reading the recorded watch history.
|
||||||
|
Watch history is recorded for each member separately, in a different subdirectory under `/tmp/TestRobustness_Issue15271/`.
|
||||||
|
Open `responses.json` for member mentioned in log `Broke watch guarantee`.
|
||||||
|
For example for member `TestRobustnessIssue15271-test-1` open `/tmp/TestRobustness_Issue15271/TestRobustnessIssue15271-test-1/responses.json`.
|
||||||
|
|
||||||
|
Each line consists of a JSON blob corresponding to a single watch response observed by the client.
|
||||||
|
Look for lines with `mod_revision` equal to revision mentioned in the first log with `Broke watch guarantee`
|
||||||
|
You should see two lines where the `mod_revision` decreases like ones below:
|
||||||
|
```
|
||||||
|
{"Header":{"cluster_id":12951239930360520062,"member_id":16914881897345358027,"revision":2574,"raft_term":2},"Events":[{"kv":{"key":"Ng==","create_revision":2303,"mod_revision":2574,"version":46,"value":"Mjg5OA=="}}],"CompactRevision":0,"Canceled":false,"Created":false}
|
||||||
|
{"Header":{"cluster_id":12951239930360520062,"member_id":16914881897345358027,"revision":7708,"raft_term":2},"Events":[{"kv":{"key":"NQ==","create_revision":5,"mod_revision":91,"version":10,"value":"MTAy"}}, ... }
|
||||||
|
```
|
||||||
|
|
||||||
|
Up to the first line the `mod_revision` of events within responses only increased up to a value of `2574`.
|
||||||
|
However, the following line includes an event with `mod_revision` equal `91`.
|
||||||
|
If you follow the `mod_revision` throughout the file you should notice that the watch replayed revisions a second time.
|
||||||
|
This is incorrect and breaks `Ordered` and `Unique` [watch guarantees].
|
||||||
|
This is consistent with the root cause of [#15271], where a member reconnecting to the cluster will incorrectly resend revisions.
|
||||||
|
|
||||||
|
[#15271]: https://github.com/etcd-io/etcd/issues/15271
|
||||||
|
|
|
@ -167,6 +167,18 @@ func TestRobustness(t *testing.T) {
|
||||||
e2e.WithClusterSize(1),
|
e2e.WithClusterSize(1),
|
||||||
),
|
),
|
||||||
})
|
})
|
||||||
|
if v.Compare(version.V3_5) >= 0 {
|
||||||
|
scenarios = append(scenarios, scenario{
|
||||||
|
name: "Issue15271",
|
||||||
|
failpoint: BlackholeUntilSnapshot,
|
||||||
|
traffic: &HighTraffic,
|
||||||
|
config: *e2e.NewConfig(
|
||||||
|
e2e.WithSnapshotCount(100),
|
||||||
|
e2e.WithPeerProxy(true),
|
||||||
|
e2e.WithIsPeerTLS(true),
|
||||||
|
),
|
||||||
|
})
|
||||||
|
}
|
||||||
snapshotOptions := []e2e.EPClusterOption{
|
snapshotOptions := []e2e.EPClusterOption{
|
||||||
e2e.WithGoFailEnabled(true),
|
e2e.WithGoFailEnabled(true),
|
||||||
e2e.WithSnapshotCount(100),
|
e2e.WithSnapshotCount(100),
|
||||||
|
@ -217,7 +229,7 @@ func testRobustness(ctx context.Context, t *testing.T, lg *zap.Logger, config e2
|
||||||
forcestopCluster(r.clus)
|
forcestopCluster(r.clus)
|
||||||
|
|
||||||
watchProgressNotifyEnabled := r.clus.Cfg.WatchProcessNotifyInterval != 0
|
watchProgressNotifyEnabled := r.clus.Cfg.WatchProcessNotifyInterval != 0
|
||||||
validateWatchResponses(t, r.responses, traffic.requestProgress || watchProgressNotifyEnabled)
|
validateWatchResponses(t, r.clus, r.responses, traffic.requestProgress || watchProgressNotifyEnabled)
|
||||||
|
|
||||||
r.events = watchEvents(r.responses)
|
r.events = watchEvents(r.responses)
|
||||||
validateEventsMatch(t, r.events)
|
validateEventsMatch(t, r.events)
|
||||||
|
|
|
@ -15,6 +15,11 @@ test-robustness-issue14685: /tmp/etcd-v3.5.5-failpoints/bin
|
||||||
GO_TEST_FLAGS='-v --run=TestRobustness/Issue14685 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.5-failpoints/bin' make test-robustness && \
|
GO_TEST_FLAGS='-v --run=TestRobustness/Issue14685 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.5-failpoints/bin' make test-robustness && \
|
||||||
echo "Failed to reproduce" || echo "Successful reproduction"
|
echo "Failed to reproduce" || echo "Successful reproduction"
|
||||||
|
|
||||||
|
.PHONY: test-robustness-issue15271
|
||||||
|
test-robustness-issue15271: /tmp/etcd-v3.5.7-failpoints/bin
|
||||||
|
GO_TEST_FLAGS='-v --run=TestRobustness/Issue15271 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.7-failpoints/bin' make test-robustness && \
|
||||||
|
echo "Failed to reproduce" || echo "Successful reproduction"
|
||||||
|
|
||||||
# Failpoints
|
# Failpoints
|
||||||
|
|
||||||
GOFAIL_VERSION = $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/gofail)
|
GOFAIL_VERSION = $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/gofail)
|
||||||
|
|
|
@ -132,25 +132,25 @@ func watchResponsesMaxRevision(responses []watchResponse) int64 {
|
||||||
return maxRevision
|
return maxRevision
|
||||||
}
|
}
|
||||||
|
|
||||||
func validateWatchResponses(t *testing.T, responses [][]watchResponse, expectProgressNotify bool) {
|
func validateWatchResponses(t *testing.T, clus *e2e.EtcdProcessCluster, responses [][]watchResponse, expectProgressNotify bool) {
|
||||||
for _, memberResponses := range responses {
|
for i, member := range clus.Procs {
|
||||||
validateMemberWatchResponses(t, memberResponses, expectProgressNotify)
|
validateMemberWatchResponses(t, member.Config().Name, responses[i], expectProgressNotify)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func validateMemberWatchResponses(t *testing.T, responses []watchResponse, expectProgressNotify bool) {
|
func validateMemberWatchResponses(t *testing.T, memberId string, responses []watchResponse, expectProgressNotify bool) {
|
||||||
// Validate watch is correctly configured to ensure proper testing
|
// Validate watch is correctly configured to ensure proper testing
|
||||||
validateGotAtLeastOneProgressNotify(t, responses, expectProgressNotify)
|
validateGotAtLeastOneProgressNotify(t, memberId, responses, expectProgressNotify)
|
||||||
|
|
||||||
// Validate etcd watch properties defined in https://etcd.io/docs/v3.6/learning/api/#watch-streams
|
// Validate etcd watch properties defined in https://etcd.io/docs/v3.6/learning/api/#watch-streams
|
||||||
validateOrderedAndReliable(t, responses)
|
validateOrderedAndReliable(t, memberId, responses)
|
||||||
validateUnique(t, responses)
|
validateUnique(t, memberId, responses)
|
||||||
validateAtomic(t, responses)
|
validateAtomic(t, memberId, responses)
|
||||||
// Validate kubernetes usage of watch
|
// Validate kubernetes usage of watch
|
||||||
validateRenewable(t, responses)
|
validateRenewable(t, memberId, responses)
|
||||||
}
|
}
|
||||||
|
|
||||||
func validateGotAtLeastOneProgressNotify(t *testing.T, responses []watchResponse, expectProgressNotify bool) {
|
func validateGotAtLeastOneProgressNotify(t *testing.T, memberId string, responses []watchResponse, expectProgressNotify bool) {
|
||||||
var gotProgressNotify = false
|
var gotProgressNotify = false
|
||||||
var lastHeadRevision int64 = 1
|
var lastHeadRevision int64 = 1
|
||||||
for _, resp := range responses {
|
for _, resp := range responses {
|
||||||
|
@ -161,16 +161,16 @@ func validateGotAtLeastOneProgressNotify(t *testing.T, responses []watchResponse
|
||||||
lastHeadRevision = resp.Header.Revision
|
lastHeadRevision = resp.Header.Revision
|
||||||
}
|
}
|
||||||
if gotProgressNotify != expectProgressNotify {
|
if gotProgressNotify != expectProgressNotify {
|
||||||
t.Errorf("Expected at least one progress notify: %v, got: %v", expectProgressNotify, gotProgressNotify)
|
t.Errorf("Progress notify does not match, expect: %v, got: %v, member: %q", expectProgressNotify, gotProgressNotify, memberId)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func validateRenewable(t *testing.T, responses []watchResponse) {
|
func validateRenewable(t *testing.T, memberId string, responses []watchResponse) {
|
||||||
var lastProgressNotifyRevision int64 = 0
|
var lastProgressNotifyRevision int64 = 0
|
||||||
for _, resp := range responses {
|
for _, resp := range responses {
|
||||||
for _, event := range resp.Events {
|
for _, event := range resp.Events {
|
||||||
if event.Kv.ModRevision <= lastProgressNotifyRevision {
|
if event.Kv.ModRevision <= lastProgressNotifyRevision {
|
||||||
t.Errorf("BROKE: Renewable - watch can renewed using revision in last progress notification; Progress notification guarantees that previous events have been already delivered, eventRevision: %d, progressNotifyRevision: %d", event.Kv.ModRevision, lastProgressNotifyRevision)
|
t.Errorf("Broke watch guarantee: Renewable - watch can renewed using revision in last progress notification; Progress notification guarantees that previous events have been already delivered, eventRevision: %d, progressNotifyRevision: %d, member: %q", event.Kv.ModRevision, lastProgressNotifyRevision, memberId)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if resp.IsProgressNotify() {
|
if resp.IsProgressNotify() {
|
||||||
|
@ -179,19 +179,23 @@ func validateRenewable(t *testing.T, responses []watchResponse) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func validateOrderedAndReliable(t *testing.T, responses []watchResponse) {
|
func validateOrderedAndReliable(t *testing.T, memberId string, responses []watchResponse) {
|
||||||
var lastEventRevision int64 = 1
|
var lastEventRevision int64 = 1
|
||||||
for _, resp := range responses {
|
for _, resp := range responses {
|
||||||
for _, event := range resp.Events {
|
for _, event := range resp.Events {
|
||||||
if event.Kv.ModRevision != lastEventRevision && event.Kv.ModRevision != lastEventRevision+1 {
|
if event.Kv.ModRevision != lastEventRevision && event.Kv.ModRevision != lastEventRevision+1 {
|
||||||
t.Errorf("BROKE: Reliable - a sequence of events will never drop any subsequence of events; if there are events ordered in time as a < b < c, then if the watch receives events a and c, it is guaranteed to receive b, lastRevision: %d, currentRevision: %d", lastEventRevision, event.Kv.ModRevision)
|
if event.Kv.ModRevision < lastEventRevision {
|
||||||
|
t.Errorf("Broke watch guarantee: Ordered - events are ordered by revision; an event will never appear on a watch if it precedes an event in time that has already been posted, lastRevision: %d, currentRevision: %d, member: %q", lastEventRevision, event.Kv.ModRevision, memberId)
|
||||||
|
} else {
|
||||||
|
t.Errorf("Broke watch guarantee: Reliable - a sequence of events will never drop any subsequence of events; if there are events ordered in time as a < b < c, then if the watch receives events a and c, it is guaranteed to receive b, lastRevision: %d, currentRevision: %d, member: %q", lastEventRevision, event.Kv.ModRevision, memberId)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
lastEventRevision = event.Kv.ModRevision
|
lastEventRevision = event.Kv.ModRevision
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func validateUnique(t *testing.T, responses []watchResponse) {
|
func validateUnique(t *testing.T, memberId string, responses []watchResponse) {
|
||||||
type revisionKey struct {
|
type revisionKey struct {
|
||||||
revision int64
|
revision int64
|
||||||
key string
|
key string
|
||||||
|
@ -201,19 +205,19 @@ func validateUnique(t *testing.T, responses []watchResponse) {
|
||||||
for _, event := range resp.Events {
|
for _, event := range resp.Events {
|
||||||
rk := revisionKey{key: string(event.Kv.Key), revision: event.Kv.ModRevision}
|
rk := revisionKey{key: string(event.Kv.Key), revision: event.Kv.ModRevision}
|
||||||
if _, found := uniqueOperations[rk]; found {
|
if _, found := uniqueOperations[rk]; found {
|
||||||
t.Errorf("BROKE: Unique - an event will never appear on a watch twice, key: %q, revision: %d", rk.key, rk.revision)
|
t.Errorf("Broke watch guarantee: Unique - an event will never appear on a watch twice, key: %q, revision: %d, member: %q", rk.key, rk.revision, memberId)
|
||||||
}
|
}
|
||||||
uniqueOperations[rk] = struct{}{}
|
uniqueOperations[rk] = struct{}{}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func validateAtomic(t *testing.T, responses []watchResponse) {
|
func validateAtomic(t *testing.T, memberId string, responses []watchResponse) {
|
||||||
var lastEventRevision int64 = 1
|
var lastEventRevision int64 = 1
|
||||||
for _, resp := range responses {
|
for _, resp := range responses {
|
||||||
if len(resp.Events) > 0 {
|
if len(resp.Events) > 0 {
|
||||||
if resp.Events[0].Kv.ModRevision == lastEventRevision {
|
if resp.Events[0].Kv.ModRevision == lastEventRevision {
|
||||||
t.Errorf("BROKE: Atomic - a list of events is guaranteed to encompass complete revisions; updates in the same revision over multiple keys will not be split over several lists of events, previousListEventRevision: %d, currentListEventRevision: %d", lastEventRevision, resp.Events[0].Kv.ModRevision)
|
t.Errorf("Broke watch guarantee: Atomic - a list of events is guaranteed to encompass complete revisions; updates in the same revision over multiple keys will not be split over several lists of events, previousListEventRevision: %d, currentListEventRevision: %d, member: %q", lastEventRevision, resp.Events[0].Kv.ModRevision, memberId)
|
||||||
}
|
}
|
||||||
lastEventRevision = resp.Events[len(resp.Events)-1].Kv.ModRevision
|
lastEventRevision = resp.Events[len(resp.Events)-1].Kv.ModRevision
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue