Merge pull request #2453 from yichengq/334

tools/etcd-tester: add kill one member tests
release-2.1
Yicheng Qin 2015-03-10 13:17:57 -07:00
commit 9a9d00b482
7 changed files with 184 additions and 149 deletions

View File

@ -32,6 +32,7 @@ type cluster struct {
Size int
Agents []client.Agent
Stressers []Stresser
Names []string
ClientURLs []string
}
@ -98,8 +99,19 @@ func (c *cluster) Bootstrap() error {
}
}
stressers := make([]Stresser, len(clientURLs))
for i, u := range clientURLs {
s := &stresser{
Endpoint: u,
N: 200,
}
go s.Stress()
stressers[i] = s
}
c.Size = size
c.Agents = agents
c.Stressers = stressers
c.Names = names
c.ClientURLs = clientURLs
return nil
@ -117,19 +129,35 @@ func (c *cluster) WaitHealth() error {
return err
}
// Report adds up the success and failure counts returned by the Report method
// of every stresser in c.Stressers and returns the two totals.
func (c *cluster) Report() (success, failure int) {
	for _, st := range c.Stressers {
		ok, bad := st.Report()
		success, failure = success+ok, failure+bad
	}
	return success, failure
}
// Cleanup calls Cleanup on every agent and then cancels every stresser.
//
// A failing agent does not stop the sweep: the remaining agents are still
// cleaned up and the stressers are still cancelled. The error of the last
// agent that failed is returned, or nil when every agent succeeded.
func (c *cluster) Cleanup() error {
	var lasterr error
	for _, a := range c.Agents {
		if err := a.Cleanup(); err != nil {
			// Remember the failure but keep going so that no agent is
			// skipped and the stressers below are always cancelled.
			lasterr = err
		}
	}
	for _, s := range c.Stressers {
		s.Cancel()
	}
	return lasterr
}
// Terminate calls Terminate on every agent in c.Agents and afterwards Cancel
// on every stresser in c.Stressers. It reports no errors.
func (c *cluster) Terminate() {
	for _, agent := range c.Agents {
		agent.Terminate()
	}
	for _, st := range c.Stressers {
		st.Cancel()
	}
}
// setHealthKey sets health key on all given urls.

View File

@ -14,6 +14,14 @@
package main
import (
"fmt"
"math/rand"
"time"
)
// snapshotCount is the number of additional successful requests that
// failureKillOneForLongTime.Inject waits for before it assumes the cluster has
// created a new snapshot.
// NOTE(review): presumably this must match the snapshot count the etcd members
// are started with — confirm against the agent/cluster configuration.
const snapshotCount = 10000
type failure interface {
// Inject injects the failure into the testing cluster at the given
// round. When calling the function, the cluster should be healthy.
@ -28,3 +36,139 @@ type failure interface {
// description is a string type that carries a human-readable label; it is
// meant to be embedded so the embedding type gains the Desc method.
type description string

// Desc returns the description as a plain string.
func (d description) Desc() string {
	text := string(d)
	return text
}
// failureKillAll is the failure case that stops every member of the cluster.
// The embedded description supplies its Desc method.
type failureKillAll struct {
	description
}

// newFailureKillAll returns a failureKillAll whose description is
// "kill all members".
func newFailureKillAll() *failureKillAll {
	f := new(failureKillAll)
	f.description = "kill all members"
	return f
}
// Inject stops the agents of c one after another. It returns the first error
// reported by Stop, in which case the remaining agents are not stopped.
// The round argument is not used.
func (f *failureKillAll) Inject(c *cluster, round int) error {
	for _, agent := range c.Agents {
		err := agent.Stop()
		if err != nil {
			return err
		}
	}
	return nil
}
// Recover restarts the agents of c one after another and then returns the
// result of c.WaitHealth. The first Restart error aborts the loop and is
// returned as is. The round argument is not used.
func (f *failureKillAll) Recover(c *cluster, round int) error {
	for _, agent := range c.Agents {
		_, err := agent.Restart()
		if err != nil {
			return err
		}
	}
	return c.WaitHealth()
}
// failureKillMajority is the failure case that stops a majority of the
// cluster members. The embedded description supplies its Desc method.
type failureKillMajority struct {
	description
}

// newFailureKillMajority returns a failureKillMajority whose description is
// "kill majority of the cluster".
func newFailureKillMajority() *failureKillMajority {
	var f failureKillMajority
	f.description = "kill majority of the cluster"
	return &f
}
// Inject stops every agent whose index is selected by
// getToKillMap(c.Size, round). The first Stop error is returned immediately.
func (f *failureKillMajority) Inject(c *cluster, round int) error {
	victims := getToKillMap(c.Size, round)
	for idx := range victims {
		err := c.Agents[idx].Stop()
		if err != nil {
			return err
		}
	}
	return nil
}
// Recover restarts every agent whose index is selected by
// getToKillMap(c.Size, round), i.e. the same set Inject computes for that
// round, and then returns the result of c.WaitHealth. The first Restart error
// is returned immediately.
func (f *failureKillMajority) Recover(c *cluster, round int) error {
	victims := getToKillMap(c.Size, round)
	for idx := range victims {
		_, err := c.Agents[idx].Restart()
		if err != nil {
			return err
		}
	}
	return c.WaitHealth()
}
// getToKillMap picks size/2 + 1 distinct indexes from [0, size) and returns
// them as the keys of a map whose values are all true.
//
// The indexes are drawn from a random source seeded with seed, so the same
// (size, seed) pair always produces the same set. A non-positive size yields
// an empty map.
func getToKillMap(size int, seed int) map[int]bool {
	m := make(map[int]bool)
	// rand.Intn panics when its argument is <= 0, and there is nothing to
	// choose from in that case anyway.
	if size <= 0 {
		return m
	}
	r := rand.New(rand.NewSource(int64(seed)))
	majority := size/2 + 1
	// Duplicate draws overwrite an existing key, so keep drawing until
	// enough distinct indexes have been collected.
	for len(m) < majority {
		m[r.Intn(size)] = true
	}
	return m
}
// failureKillOne is the failure case that stops a single cluster member.
// The embedded description supplies its Desc method.
type failureKillOne struct {
	description
}

// newFailureKillOne returns a failureKillOne whose description is
// "kill one random member".
func newFailureKillOne() *failureKillOne {
	f := new(failureKillOne)
	f.description = "kill one random member"
	return f
}
// Inject stops the agent at index round % c.Size and returns the error, if
// any, reported by Stop.
func (f *failureKillOne) Inject(c *cluster, round int) error {
	return c.Agents[round%c.Size].Stop()
}
// Recover restarts the agent at index round % c.Size, the same index Inject
// uses for that round, and then returns the result of c.WaitHealth. A Restart
// error is returned without waiting.
func (f *failureKillOne) Recover(c *cluster, round int) error {
	_, err := c.Agents[round%c.Size].Restart()
	if err != nil {
		return err
	}
	return c.WaitHealth()
}
// failureKillOneForLongTime kills one member for a long time and restarts it
// only once enough requests have been committed that the member is expected
// to need a snapshot to catch up.
type failureKillOneForLongTime struct {
	description
}

// newFailureKillOneForLongTime returns a failureKillOneForLongTime carrying
// its fixed description text.
func newFailureKillOneForLongTime() *failureKillOneForLongTime {
	f := new(failureKillOneForLongTime)
	f.description = "kill one member for long time and expect it to recover from incoming snapshot"
	return f
}
// Inject stops the agent at index round % c.Size and, for clusters of at
// least three members, waits until the stressers have reported more than
// snapshotCount additional successful requests since the stop.
//
// The success counter is sampled once per second, after each sleep, for at
// most snapshotCount/1000*3 seconds. If the threshold is not crossed in that
// time an error describing the observed progress is returned. Errors from
// Stop are returned as is.
func (f *failureKillOneForLongTime) Inject(c *cluster, round int) error {
	i := round % c.Size
	if err := c.Agents[i].Stop(); err != nil {
		return err
	}
	// NOTE(review): presumably smaller clusters cannot make progress with a
	// member down, so there is nothing to wait for — confirm.
	if c.Size < 3 {
		return nil
	}

	start, _ := c.Report()
	var end int
	// Normal healthy cluster could accept 1000req/s at least.
	// Give it 3-times time to create a new snapshot.
	retry := snapshotCount / 1000 * 3
	for j := 0; j < retry; j++ {
		// Sleep first and sample afterwards, so that the final second of
		// the budget is counted and the error below reports a fresh value.
		time.Sleep(time.Second)
		end, _ = c.Report()
		// If the number of proposals committed is bigger than snapshot count,
		// a new snapshot should have been created.
		if end-start > snapshotCount {
			return nil
		}
	}
	return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
}
// Recover restarts the agent at index round % c.Size, the same index Inject
// uses for that round, and then returns the result of c.WaitHealth. A Restart
// error is returned without waiting.
func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error {
	_, err := c.Agents[round%c.Size].Restart()
	if err != nil {
		return err
	}
	return c.WaitHealth()
}

View File

@ -1,43 +0,0 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
// failureKillAll is the failure case that stops every member of the cluster.
// The embedded description supplies its Desc method.
type failureKillAll struct {
	description
}

// newFailureKillAll returns a failureKillAll whose description is
// "kill all members".
func newFailureKillAll() *failureKillAll {
	f := new(failureKillAll)
	f.description = "kill all members"
	return f
}
// Inject stops the agents of c one after another. It returns the first error
// reported by Stop, in which case the remaining agents are not stopped.
// The round argument is not used.
func (f *failureKillAll) Inject(c *cluster, round int) error {
	for _, agent := range c.Agents {
		err := agent.Stop()
		if err != nil {
			return err
		}
	}
	return nil
}
// Recover restarts the agents of c one after another and then returns the
// result of c.WaitHealth. The first Restart error aborts the loop and is
// returned as is. The round argument is not used.
func (f *failureKillAll) Recover(c *cluster, round int) error {
	for _, agent := range c.Agents {
		_, err := agent.Restart()
		if err != nil {
			return err
		}
	}
	return c.WaitHealth()
}

View File

@ -1,57 +0,0 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import "math/rand"
// failureKillMajority is the failure case that stops a majority of the
// cluster members. The embedded description supplies its Desc method.
type failureKillMajority struct {
	description
}

// newFailureKillMajority returns a failureKillMajority whose description is
// "kill majority of the cluster".
func newFailureKillMajority() *failureKillMajority {
	var f failureKillMajority
	f.description = "kill majority of the cluster"
	return &f
}
// Inject stops every agent whose index is selected by
// getToKillMap(c.Size, round). The first Stop error is returned immediately.
func (f *failureKillMajority) Inject(c *cluster, round int) error {
	victims := getToKillMap(c.Size, round)
	for idx := range victims {
		err := c.Agents[idx].Stop()
		if err != nil {
			return err
		}
	}
	return nil
}
// Recover restarts every agent whose index is selected by
// getToKillMap(c.Size, round), i.e. the same set Inject computes for that
// round, and then returns the result of c.WaitHealth. The first Restart error
// is returned immediately.
func (f *failureKillMajority) Recover(c *cluster, round int) error {
	victims := getToKillMap(c.Size, round)
	for idx := range victims {
		_, err := c.Agents[idx].Restart()
		if err != nil {
			return err
		}
	}
	return c.WaitHealth()
}
// getToKillMap picks size/2 + 1 distinct indexes from [0, size) and returns
// them as the keys of a map whose values are all true.
//
// The indexes are drawn from a random source seeded with seed, so the same
// (size, seed) pair always produces the same set. A non-positive size yields
// an empty map.
func getToKillMap(size int, seed int) map[int]bool {
	m := make(map[int]bool)
	// rand.Intn panics when its argument is <= 0, and there is nothing to
	// choose from in that case anyway.
	if size <= 0 {
		return m
	}
	r := rand.New(rand.NewSource(int64(seed)))
	majority := size/2 + 1
	// Duplicate draws overwrite an existing key, so keep drawing until
	// enough distinct indexes have been collected.
	for len(m) < majority {
		m[r.Intn(size)] = true
	}
	return m
}

View File

@ -1,29 +0,0 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
// failureBase is the failure case that does nothing; both its Inject and its
// Recover return nil. The embedded description supplies its Desc method.
type failureBase struct {
	description
}

// newFailureBase returns a failureBase whose description is "do nothing".
func newFailureBase() *failureBase {
	f := new(failureBase)
	f.description = "do nothing"
	return f
}
// Inject does nothing and always returns nil; both arguments are ignored.
func (f *failureBase) Inject(c *cluster, round int) error {
	return nil
}
// Recover does nothing and always returns nil; both arguments are ignored.
func (f *failureBase) Recover(c *cluster, round int) error {
	return nil
}

View File

@ -33,24 +33,15 @@ func main() {
}
defer c.Terminate()
stressers := make([]Stresser, len(c.ClientURLs))
for i, u := range c.ClientURLs {
s := &stresser{
Endpoint: u,
N: 200,
}
go s.Stress()
stressers[i] = s
}
t := &tester{
failures: []failure{newFailureBase(), newFailureKillAll(), newFailureKillMajority()},
cluster: c,
limit: *limit,
failures: []failure{
newFailureKillAll(),
newFailureKillMajority(),
newFailureKillOne(),
newFailureKillOneForLongTime(),
},
cluster: c,
limit: *limit,
}
t.runLoop()
for _, s := range stressers {
s.Cancel()
}
}

View File

@ -65,8 +65,9 @@ func (s *stresser) Stress() error {
s.mu.Lock()
if err != nil {
s.failure++
} else {
s.success++
}
s.success++
s.mu.Unlock()
}
}()