etcd-tester: fix, clean up multiple things (#5462)

* etcd-tester: more logging, fix typo

* etcd-tester: fix prevCompactRev scope

Fix https://github.com/coreos/etcd/issues/5440.

* etcd-tester: move utils to bottom, clean up logs

And remove stresser operation inside defrag

* etcd-tester: separate update revision call

* etcd-tester: fix cleanup when case is -1
release-3.0
Gyu-Ho Lee 2016-05-26 11:37:49 -07:00
parent 6acb3d67fb
commit 3ed5d28e2e
3 changed files with 104 additions and 96 deletions

View File

@ -175,6 +175,7 @@ func (c *cluster) WaitHealth() error {
if err == nil { if err == nil {
return nil return nil
} }
plog.Warningf("#%d setHealthKey error (%v)", i, err)
time.Sleep(time.Second) time.Sleep(time.Second)
} }
return err return err
@ -271,7 +272,7 @@ func setHealthKey(us []string) error {
cancel() cancel()
conn.Close() conn.Close()
if err != nil { if err != nil {
return err return fmt.Errorf("%v (%s)", err, u)
} }
} }
return nil return nil

View File

@ -26,7 +26,7 @@ const (
randomVariation = 50 randomVariation = 50
// Wait more when it recovers from slow network, because network layer // Wait more when it recovers from slow network, because network layer
// needs extra time to propogate traffic control (tc command) change. // needs extra time to propagate traffic control (tc command) change.
// Otherwise, we get different hash values from the previous revision. // Otherwise, we get different hash values from the previous revision.
// For more detail, please see https://github.com/coreos/etcd/issues/5121. // For more detail, please see https://github.com/coreos/etcd/issues/5121.
waitRecover = 5 * time.Second waitRecover = 5 * time.Second

View File

@ -40,16 +40,17 @@ func (tt *tester) runLoop() {
for _, f := range tt.failures { for _, f := range tt.failures {
tt.status.Failures = append(tt.status.Failures, f.Desc()) tt.status.Failures = append(tt.status.Failures, f.Desc())
} }
round := 0
var (
round int
prevCompactRev int64
)
for { for {
tt.status.setRound(round) tt.status.setRound(round)
tt.status.setCase(-1) // -1 so that logPrefix doesn't print out 'case' tt.status.setCase(-1) // -1 so that logPrefix doesn't print out 'case'
roundTotalCounter.Inc() roundTotalCounter.Inc()
var ( var failed bool
prevCompactRev int64
failed bool
)
for j, f := range tt.failures { for j, f := range tt.failures {
caseTotalCounter.WithLabelValues(f.Desc()).Inc() caseTotalCounter.WithLabelValues(f.Desc()).Inc()
tt.status.setCase(j) tt.status.setCase(j)
@ -63,9 +64,7 @@ func (tt *tester) runLoop() {
break break
} }
plog.Printf("%s starting failure %s", tt.logPrefix(), f.Desc()) plog.Printf("%s injecting failure %q", tt.logPrefix(), f.Desc())
plog.Printf("%s injecting failure...", tt.logPrefix())
if err := f.Inject(tt.cluster, round); err != nil { if err := f.Inject(tt.cluster, round); err != nil {
plog.Printf("%s injection error: %v", tt.logPrefix(), err) plog.Printf("%s injection error: %v", tt.logPrefix(), err)
if err := tt.cleanup(); err != nil { if err := tt.cleanup(); err != nil {
@ -76,7 +75,7 @@ func (tt *tester) runLoop() {
} }
plog.Printf("%s injected failure", tt.logPrefix()) plog.Printf("%s injected failure", tt.logPrefix())
plog.Printf("%s recovering failure...", tt.logPrefix()) plog.Printf("%s recovering failure %q", tt.logPrefix(), f.Desc())
if err := f.Recover(tt.cluster, round); err != nil { if err := f.Recover(tt.cluster, round); err != nil {
plog.Printf("%s recovery error: %v", tt.logPrefix(), err) plog.Printf("%s recovery error: %v", tt.logPrefix(), err)
if err := tt.cleanup(); err != nil { if err := tt.cleanup(); err != nil {
@ -92,10 +91,18 @@ func (tt *tester) runLoop() {
continue continue
} }
if !tt.consistencyCheck {
if err := tt.updateRevision(); err != nil {
plog.Warningf("%s functional-tester returning with tt.updateRevision error (%v)", tt.logPrefix(), err)
return
}
continue
}
var err error var err error
failed, err = tt.updateCurrentRevisionHash(tt.consistencyCheck) failed, err = tt.checkConsistency()
if err != nil { if err != nil {
plog.Warningf("%s functional-tester returning with error (%v)", tt.logPrefix(), err) plog.Warningf("%s functional-tester returning with tt.checkConsistency error (%v)", tt.logPrefix(), err)
return return
} }
if failed { if failed {
@ -139,44 +146,59 @@ func (tt *tester) runLoop() {
} }
} }
func (tt *tester) logPrefix() string { func (tt *tester) updateRevision() error {
var ( revs, _, err := tt.cluster.getRevisionHash()
rd = tt.status.getRound() for _, rev := range revs {
cs = tt.status.getCase() tt.currentRevision = rev
prefix = fmt.Sprintf("[round#%d case#%d]", rd, cs) break // just need get one of the current revisions
)
if cs == -1 {
prefix = fmt.Sprintf("[round#%d]", rd)
} }
return prefix
}
func (tt *tester) cleanup() error {
roundFailedTotalCounter.Inc()
caseFailedTotalCounter.WithLabelValues(tt.failures[tt.status.Case].Desc()).Inc()
plog.Printf("%s cleaning up...", tt.logPrefix())
if err := tt.cluster.Cleanup(); err != nil {
plog.Printf("%s cleanup error: %v", tt.logPrefix(), err)
return err return err
} }
return tt.cluster.Bootstrap()
func (tt *tester) checkConsistency() (failed bool, err error) {
tt.cancelStressers()
defer tt.startStressers()
plog.Printf("%s updating current revisions...", tt.logPrefix())
var (
revs map[string]int64
hashes map[string]int64
rerr error
ok bool
)
for i := 0; i < 7; i++ {
time.Sleep(time.Second)
revs, hashes, rerr = tt.cluster.getRevisionHash()
if rerr != nil {
plog.Printf("%s #%d failed to get current revisions (%v)", tt.logPrefix(), i, rerr)
continue
}
if tt.currentRevision, ok = getSameValue(revs); ok {
break
} }
func (tt *tester) cancelStressers() { plog.Printf("%s #%d inconsistent current revisions %+v", tt.logPrefix(), i, revs)
plog.Printf("%s canceling the stressers...", tt.logPrefix())
for _, s := range tt.cluster.Stressers {
s.Cancel()
}
plog.Printf("%s canceled stressers", tt.logPrefix())
} }
plog.Printf("%s updated current revisions with %d", tt.logPrefix(), tt.currentRevision)
func (tt *tester) startStressers() { if !ok || rerr != nil {
plog.Printf("%s starting the stressers...", tt.logPrefix()) plog.Printf("%s checking current revisions failed [revisions: %v]", tt.logPrefix(), revs)
for _, s := range tt.cluster.Stressers { failed = true
go s.Stress() err = tt.cleanup()
return
} }
plog.Printf("%s started stressers", tt.logPrefix()) plog.Printf("%s all members are consistent with current revisions [revisions: %v]", tt.logPrefix(), revs)
plog.Printf("%s checking current storage hashes...", tt.logPrefix())
if _, ok = getSameValue(hashes); !ok {
plog.Printf("%s checking current storage hashes failed [hashes: %v]", tt.logPrefix(), hashes)
failed = true
err = tt.cleanup()
return
}
plog.Printf("%s all members are consistent with storage hashes", tt.logPrefix())
return
} }
func (tt *tester) compact(rev int64, timeout time.Duration) error { func (tt *tester) compact(rev int64, timeout time.Duration) error {
@ -204,9 +226,6 @@ func (tt *tester) compact(rev int64, timeout time.Duration) error {
} }
func (tt *tester) defrag() error { func (tt *tester) defrag() error {
tt.cancelStressers()
defer tt.startStressers()
plog.Printf("%s defragmenting...", tt.logPrefix()) plog.Printf("%s defragmenting...", tt.logPrefix())
if err := tt.cluster.defrag(); err != nil { if err := tt.cluster.defrag(); err != nil {
plog.Printf("%s defrag error (%v)", tt.logPrefix(), err) plog.Printf("%s defrag error (%v)", tt.logPrefix(), err)
@ -220,58 +239,46 @@ func (tt *tester) defrag() error {
return nil return nil
} }
func (tt *tester) updateCurrentRevisionHash(check bool) (failed bool, err error) { func (tt *tester) logPrefix() string {
if check {
tt.cancelStressers()
defer tt.startStressers()
}
plog.Printf("%s updating current revisions...", tt.logPrefix())
var ( var (
revs map[string]int64 rd = tt.status.getRound()
hashes map[string]int64 cs = tt.status.getCase()
rerr error prefix = fmt.Sprintf("[round#%d case#%d]", rd, cs)
ok bool
) )
for i := 0; i < 7; i++ { if cs == -1 {
revs, hashes, rerr = tt.cluster.getRevisionHash() prefix = fmt.Sprintf("[round#%d]", rd)
if rerr != nil {
plog.Printf("%s #%d failed to get current revisions (%v)", tt.logPrefix(), i, rerr)
continue
} }
if tt.currentRevision, ok = getSameValue(revs); ok { return prefix
break
} }
plog.Printf("%s #%d inconsistent current revisions %+v", tt.logPrefix(), i, revs) func (tt *tester) cleanup() error {
if !check { roundFailedTotalCounter.Inc()
break // if consistency check is false, just try once desc := "compact/defrag"
if tt.status.Case != -1 {
desc = tt.failures[tt.status.Case].Desc()
}
caseFailedTotalCounter.WithLabelValues(desc).Inc()
plog.Printf("%s cleaning up...", tt.logPrefix())
if err := tt.cluster.Cleanup(); err != nil {
plog.Printf("%s cleanup error: %v", tt.logPrefix(), err)
return err
}
return tt.cluster.Bootstrap()
} }
time.Sleep(time.Second) func (tt *tester) cancelStressers() {
plog.Printf("%s canceling the stressers...", tt.logPrefix())
for _, s := range tt.cluster.Stressers {
s.Cancel()
} }
plog.Printf("%s updated current revisions with %d", tt.logPrefix(), tt.currentRevision) plog.Printf("%s canceled stressers", tt.logPrefix())
if !check {
failed = false
return
} }
if !ok || rerr != nil { func (tt *tester) startStressers() {
plog.Printf("%s checking current revisions failed [revisions: %v]", tt.logPrefix(), revs) plog.Printf("%s starting the stressers...", tt.logPrefix())
failed = true for _, s := range tt.cluster.Stressers {
err = tt.cleanup() go s.Stress()
return
} }
plog.Printf("%s all members are consistent with current revisions [revisions: %v]", tt.logPrefix(), revs) plog.Printf("%s started stressers", tt.logPrefix())
plog.Printf("%s checking current storage hashes...", tt.logPrefix())
if _, ok = getSameValue(hashes); !ok {
plog.Printf("%s checking current storage hashes failed [hashes: %v]", tt.logPrefix(), hashes)
failed = true
err = tt.cleanup()
return
}
plog.Printf("%s all members are consistent with storage hashes", tt.logPrefix())
return
} }