syntax = "proto3";
package rpcpb;

import "github.com/gogo/protobuf/gogoproto/gogo.proto";

option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
option (gogoproto.goproto_getters_all) = false;

service Transport {
  rpc Transport(stream Request) returns (stream Response) {}
}
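
// A hedged client-side sketch in Go: the identifiers below assume the
// standard gRPC bindings generated from this service (NewTransportClient,
// Transport_TransportClient with Send/Recv); agentAddr is illustrative.
//
//	conn, err := grpc.Dial(agentAddr, grpc.WithInsecure())
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer conn.Close()
//	stream, err := rpcpb.NewTransportClient(conn).Transport(context.Background())
//	if err != nil {
//		log.Fatal(err)
//	}
//	if err = stream.Send(&rpcpb.Request{Operation: rpcpb.Operation_InitialStartEtcd}); err != nil {
//		log.Fatal(err)
//	}
//	resp, err := stream.Recv() // Response reports Success and Status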

enum Operation {
  NotStarted = 0;

  // InitialStartEtcd is only called to start etcd for the very first time.
  InitialStartEtcd = 1;
  // RestartEtcd is sent to restart a killed etcd process.
  RestartEtcd = 2;
  // KillEtcd pauses the etcd process while keeping its data directories
  // and previous etcd configuration.
  KillEtcd = 3;
  // FailArchive is sent when a consistency check fails, so that
  // etcd data directories can be archived.
  FailArchive = 4;
  // DestroyEtcdAgent destroys the etcd process, etcd data, and agent server.
  DestroyEtcdAgent = 5;

  // BlackholePeerPortTxRx drops all outgoing/incoming packets
  // on the target member's peer port.
  BlackholePeerPortTxRx = 100;
  // UnblackholePeerPortTxRx removes outgoing/incoming packet dropping.
  UnblackholePeerPortTxRx = 101;
  // DelayPeerPortTxRx delays all outgoing/incoming packets
  // on the target member's peer port.
  DelayPeerPortTxRx = 102;
  // UndelayPeerPortTxRx removes all outgoing/incoming delays.
  UndelayPeerPortTxRx = 103;
}

message Etcd {
  string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
  string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];
|
2018-04-05 20:37:16 +03:00
|
|
|
|
|
|
|
// HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
|
|
|
|
// Default value is 100, which is 100ms.
|
|
|
|
int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
|
|
|
|
// ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
|
|
|
|
// Default value is 1000, which is 1s.
|
|
|
|
int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];

  repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
  repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
  bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
  bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
  string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
  string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
  string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];

  repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
  repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
  bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
  bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
  string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
  string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
  string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];

  string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
  string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
  string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];

  int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
  int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];

  bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
  bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
}
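
// With the yaml moretags above, an Etcd message marshals to the familiar
// flag-style etcd YAML configuration. A hedged illustration (all values
// are made up):
//
//	name: s1
//	data-dir: /tmp/etcd-agent-data/s1
//	heartbeat-interval: 100
//	election-timeout: 1000
//	listen-client-urls: ["https://127.0.0.1:1379"]
//	initial-cluster-state: "new"
//	snapshot-count: 10000
//	pre-vote: true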

message Member {
  // EtcdExecPath is the path to the etcd binary on the agent server.
  string EtcdExecPath = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec-path\""];

  // TODO: support embedded etcd

  // AgentAddr is the agent HTTP server address.
  string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
  // FailpointHTTPAddr is the agent's failpoints HTTP server address.
  string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];

  // BaseDir is the base directory where all logs and etcd data are stored.
  string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
  // EtcdLogPath is the log file path for the current etcd server logs.
  string EtcdLogPath = 102 [(gogoproto.moretags) = "yaml:\"etcd-log-path\""];

  // EtcdClientProxy is true when client traffic needs to be proxied.
  // If true, the listen client URL port must be different from the
  // advertise client URL port.
  bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
  // EtcdPeerProxy is true when peer traffic needs to be proxied.
  // If true, the listen peer URL port must be different from the
  // advertise peer URL port.
  bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];

  // EtcdClientEndpoint is the etcd client endpoint.
  string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
  // Etcd defines etcd binary configuration flags.
  Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];

  // ClientCertData contains cert file contents from this member's etcd server.
  string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
  string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
  // ClientKeyData contains key file contents from this member's etcd server.
  string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
  string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
  // ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
  string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
  string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];

  // PeerCertData contains cert file contents from this member's etcd server.
  string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
  string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
  // PeerKeyData contains key file contents from this member's etcd server.
  string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
  string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
  // PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
  string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
  string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];
}
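
// A hedged sketch of how a tester might fill the *Data fields from the
// *File paths before shipping a Member over the wire (the member variable
// m and the error handling below are illustrative):
//
//	b, err := ioutil.ReadFile(m.Etcd.ClientCertFile)
//	if err != nil {
//		log.Fatal(err)
//	}
//	m.ClientCertData = string(b)
//	m.ClientCertPath = m.Etcd.ClientCertFile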

// FailureCase defines various system faults in distributed systems,
// in order to verify correct behavior of etcd servers and clients.
enum FailureCase {
  // KILL_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
  // but does not delete its data directories on disk for the next restart.
  // It waits "failure-delay-ms" before recovering this failure.
  // The expected behavior is that the follower comes back online
  // and rejoins the cluster, and then each member continues to process
  // client requests (e.g. a "Put" request, which requires Raft consensus).
  KILL_ONE_FOLLOWER = 0;

  // KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
  // follower but does not delete its data directories on disk for the
  // next restart. It then waits until the most up-to-date node (leader)
  // has applied the snapshot count of entries since the stop operation.
  // The expected behavior is that the follower comes back online and
  // rejoins the cluster, and then the active leader sends a snapshot
  // to the follower to force it to follow the leader's log.
  // As always, after recovery, each member must be able to process
  // client requests.
  KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;

  // KILL_LEADER stops the active leader node but does not delete its
  // data directories on disk for the next restart. Then it waits
  // "failure-delay-ms" before recovering this failure, in order to
  // trigger election timeouts.
  // The expected behavior is that a new leader gets elected, and the
  // old leader comes back online and rejoins the cluster as a follower.
  // As always, after recovery, each member must be able to process
  // client requests.
  KILL_LEADER = 2;

  // KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
  // but does not delete its data directories on disk for the next restart.
  // It then waits until the most up-to-date node ("new" leader) has
  // applied the snapshot count of entries since the stop operation.
  // The expected behavior is that the cluster elects a new leader, and the
  // old leader comes back online, rejoins the cluster as a follower,
  // and receives the snapshot from the new leader to overwrite its
  // store. As always, after recovery, each member must be able to
  // process client requests.
  KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;

  // KILL_QUORUM stops a majority of nodes to make the whole cluster
  // inoperable, but does not delete data directories on the stopped nodes
  // for the next restart. It then waits "failure-delay-ms" before
  // recovering this failure.
  // The expected behavior is that nodes come back online, thus the cluster
  // becomes operative again. As always, after recovery, each member
  // must be able to process client requests.
  KILL_QUORUM = 4;

  // KILL_ALL stops the whole cluster but does not delete data directories
  // on disk for the next restart. It then waits "failure-delay-ms" before
  // recovering this failure.
  // The expected behavior is that nodes come back online, thus the cluster
  // becomes operative again. As always, after recovery, each member
  // must be able to process client requests.
  KILL_ALL = 5;

  // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
  // packets from/to the peer port on a randomly chosen follower
  // (non-leader), and waits "failure-delay-ms" until recovery.
  // The expected behavior is that once the drop operation is undone,
  // each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;

  // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
  // all outgoing/incoming packets from/to the peer port on a randomly
  // chosen follower (non-leader), and waits until the most up-to-date
  // node (leader) has applied the snapshot count of entries since the
  // blackhole operation.
  // The expected behavior is that once the packet drop operation is undone,
  // the slow follower tries to catch up, possibly receiving the snapshot
  // from the active leader. As always, after recovery, each member must
  // be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;

  // BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
  // from/to the peer port on the active leader (isolating it), and waits
  // "failure-delay-ms" until recovery, in order to trigger an election timeout.
  // The expected behavior is that after the election timeout, a new leader
  // gets elected, and once the drop operation is undone, the old leader
  // comes back and rejoins the cluster as a follower. As always, after
  // recovery, each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;

  // BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
  // outgoing/incoming packets from/to the peer port on the active leader,
  // and waits until the most up-to-date node (leader) has applied the
  // snapshot count of entries since the blackhole operation.
  // The expected behavior is that the cluster elects a new leader, and once
  // the drop operation is undone, the old leader comes back and rejoins
  // the cluster as a follower. The slow follower tries to catch up, likely
  // receiving the snapshot from the new active leader. As always, after
  // recovery, each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;

  // BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
  // from/to the peer ports on a majority of cluster nodes, so the cluster
  // loses its leader and becomes inoperable. It then waits
  // "failure-delay-ms" until recovery.
  // The expected behavior is that once the packet drop operation is undone,
  // nodes come back online and the cluster becomes operative again. As always,
  // after recovery, each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;

  // BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
  // from/to the peer ports on all nodes, making the cluster totally
  // inoperable. It waits "failure-delay-ms" until recovery.
  // The expected behavior is that once the packet drop operation is undone,
  // nodes come back online and the cluster becomes operative again. As always,
  // after recovery, each member must be able to process client requests.
  BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;

  // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
  // from/to the peer port on a randomly chosen follower (non-leader).
  // It waits "failure-delay-ms" until recovery.
  // The expected behavior is that once the packet delay operation is undone,
  // the follower comes back and tries to catch up with the latest changes
  // from the cluster. And as always, after recovery, each member must be
  // able to process client requests.
  DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
  // packets from/to the peer port on a randomly chosen follower
  // (non-leader) with a randomized time duration (thus isolating it).
  // It waits "failure-delay-ms" until recovery.
  // The expected behavior is that once the packet delay operation is undone,
  // each member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;

  // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on a randomly chosen
  // follower (non-leader), and waits until the most up-to-date node (leader)
  // has applied the snapshot count of entries since the delay operation.
  // The expected behavior is that the delayed follower gets isolated
  // and falls behind the current active leader; once the delay operation is
  // undone, the slow follower comes back and catches up, possibly receiving
  // a snapshot from the active leader. As always, after recovery, each
  // member must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on a randomly chosen
  // follower (non-leader) with a randomized time duration, and waits until
  // the most up-to-date node (leader) has applied the snapshot count of
  // entries since the delay operation.
  // The expected behavior is that the delayed follower gets isolated
  // and falls behind the current active leader; once the delay operation is
  // undone, the slow follower comes back and catches up, possibly receiving
  // a snapshot from the active leader. As always, after recovery, each
  // member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;

  // DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets from/to
  // the peer port on the active leader, and waits "failure-delay-ms"
  // until recovery.
  // The expected behavior is that the cluster may elect a new leader, and
  // once the packet delay operation is undone, the (old) leader comes back
  // and tries to catch up with the latest changes from the cluster. As always,
  // after recovery, each member must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_LEADER = 204;

  // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
  // from/to the peer port on the active leader with a randomized time
  // duration, and waits "failure-delay-ms" until recovery.
  // The expected behavior is that the cluster may elect a new leader, and
  // once the packet delay operation is undone, the (old) leader comes back
  // and tries to catch up with the latest changes from the cluster. As always,
  // after recovery, each member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;

  // DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on the active leader,
  // and waits until the most up-to-date node (current or new leader) has
  // applied the snapshot count of entries since the delay operation.
  // The expected behavior is that the cluster may elect a new leader, and
  // the old leader gets isolated and falls behind the current active leader;
  // once the delay operation is undone, the slow follower comes back
  // and catches up, likely receiving a snapshot from the active leader.
  // As always, after recovery, each member must be able to process client
  // requests.
  DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;

  // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
  // outgoing/incoming packets from/to the peer port on the active leader
  // with a randomized time duration, and waits until the most up-to-date
  // node (current or new leader) has applied the snapshot count of entries
  // since the delay operation.
  // The expected behavior is that the cluster may elect a new leader, and
  // the old leader gets isolated and falls behind the current active leader;
  // once the delay operation is undone, the slow follower comes back
  // and catches up, likely receiving a snapshot from the active leader.
  // As always, after recovery, each member must be able to process client
  // requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;

  // DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets from/to
  // the peer ports on a majority of cluster nodes, and waits
  // "failure-delay-ms" until recovery, likely triggering election timeouts.
  // The expected behavior is that the cluster may elect a new leader while
  // a quorum of nodes struggles with slow networks; once the delay operation
  // is undone, nodes come back and the cluster becomes operative again. As
  // always, after recovery, each member must be able to process client requests.
  DELAY_PEER_PORT_TX_RX_QUORUM = 208;

  // RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
  // from/to the peer ports on a majority of cluster nodes, with randomized
  // time durations, and waits "failure-delay-ms" until recovery,
  // likely triggering election timeouts.
  // The expected behavior is that the cluster may elect a new leader while
  // a quorum of nodes struggles with slow networks; once the delay operation
  // is undone, nodes come back and the cluster becomes operative again. As
  // always, after recovery, each member must be able to process client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;

  // DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to the
  // peer ports on all nodes, and waits "failure-delay-ms" until recovery,
  // likely triggering election timeouts.
  // The expected behavior is that the cluster may become totally inoperable,
  // struggling with slow networks across the whole cluster. Once the delay
  // operation is undone, nodes come back and the cluster becomes operative
  // again. As always, after recovery, each member must be able to process
  // client requests.
  DELAY_PEER_PORT_TX_RX_ALL = 210;

  // RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
  // from/to the peer ports on all nodes, with randomized time durations,
  // and waits "failure-delay-ms" until recovery, likely triggering
  // election timeouts.
  // The expected behavior is that the cluster may become totally inoperable,
  // struggling with slow networks across the whole cluster. Once the delay
  // operation is undone, nodes come back and the cluster becomes operative
  // again. As always, after recovery, each member must be able to process
  // client requests.
  RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;

  // NO_FAIL_WITH_STRESS runs a no-op failure injection that does nothing
  // against the cluster for the "failure-delay-ms" duration, while
  // stressers keep sending requests.
  NO_FAIL_WITH_STRESS = 300;

  // NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS runs a no-op failure injection
  // that does nothing against the cluster for the "failure-delay-ms"
  // duration, while all stressers are stopped.
  NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;

  // FAILPOINTS injects failpoints into the etcd server runtime, triggering
  // panics in critical code paths.
  FAILPOINTS = 400;

  // EXTERNAL runs external failure injection scripts.
  EXTERNAL = 500;
}
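
// A hedged Go sketch of resolving Tester.FailureCases (repeated string)
// into enum values via the FailureCase_value map that protoc generates
// for Go; the schedule helper is hypothetical:
//
//	for _, name := range tester.FailureCases {
//		v, ok := rpcpb.FailureCase_value[name]
//		if !ok {
//			log.Fatalf("unknown failure case %q", name)
//		}
//		schedule(rpcpb.FailureCase(v)) // schedule is a hypothetical helper
//	}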

enum StressType {
  KV = 0;
  LEASE = 1;
  ELECTION_RUNNER = 2;
  WATCH_RUNNER = 3;
  LOCK_RACER_RUNNER = 4;
  LEASE_RUNNER = 5;
}

message Tester {
  string DataDir = 1 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  string Network = 2 [(gogoproto.moretags) = "yaml:\"network\""];
  string Addr = 3 [(gogoproto.moretags) = "yaml:\"addr\""];

  // DelayLatencyMs is the delay latency in milliseconds
  // to inject into the simulated slow network.
  uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
  // DelayLatencyMsRv is the delay latency random variable in milliseconds.
  uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
  // UpdatedDelayLatencyMs is the updated delay latency in milliseconds
  // to inject into the simulated slow network. It is the final latency to
  // apply when the latency numbers are randomly generated from the given
  // delay latency field.
  uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];

  // RoundLimit is the limit of rounds to run the failure set (-1 to run without limits).
  int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
  // ExitOnFailure, if true, exits the tester on the first failure.
  bool ExitOnFailure = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
  // ConsistencyCheck is true to check consistency (revision, hash).
  bool ConsistencyCheck = 23 [(gogoproto.moretags) = "yaml:\"consistency-check\""];
  // EnablePprof is true to enable the profiler.
  bool EnablePprof = 24 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];

  // FailureDelayMs is the delay duration after a failure is injected.
  // Useful when triggering snapshot or no-op failure cases.
  uint32 FailureDelayMs = 31 [(gogoproto.moretags) = "yaml:\"failure-delay-ms\""];
  // FailureShuffle is true to randomize the failure injection order.
  bool FailureShuffle = 32 [(gogoproto.moretags) = "yaml:\"failure-shuffle\""];
  // FailureCases is the selected test cases to schedule.
  // If empty, run all failure cases.
  repeated string FailureCases = 33 [(gogoproto.moretags) = "yaml:\"failure-cases\""];
  // FailpointCommands is the list of "gofail" commands
  // (e.g. panic("etcd-tester"), 1*sleep(1000)).
  repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];

  // RunnerExecPath is the path of the etcd-runner binary.
  string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
  // ExternalExecPath is the path of a script for enabling/disabling an external fault injector.
  string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];

  // StressTypes is the list of stresser names:
  // keys, lease, nop, election-runner, watch-runner, lock-racer-runner, lease-runner.
  repeated string StressTypes = 101 [(gogoproto.moretags) = "yaml:\"stress-types\""];
  // StressKeySize is the size of each small key written into etcd.
  int32 StressKeySize = 102 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
  // StressKeySizeLarge is the size of each large key written into etcd.
  int32 StressKeySizeLarge = 103 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
  // StressKeySuffixRange is the count of key suffixes written into etcd.
  // Stress keys are created with "fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange))".
  int32 StressKeySuffixRange = 104 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
  // StressKeySuffixRangeTxn is the count of key suffixes written into etcd txns (max 100).
  // Stress keys are created with "fmt.Sprintf("/k%03d", i)".
  int32 StressKeySuffixRangeTxn = 105 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
  // StressKeyTxnOps is the number of operations per transaction (max 64).
  int32 StressKeyTxnOps = 106 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];

  // StressClients is the number of concurrent stressing clients
  // with "one" shared TCP connection.
  int32 StressClients = 201 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
  // StressQPS is the maximum number of stresser requests per second.
  int32 StressQPS = 202 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
}
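
// A hedged illustration of a Tester section in the YAML configuration
// implied by the moretags above (all values are made up):
//
//	addr: "127.0.0.1:9028"
//	round-limit: 1
//	exit-on-failure: true
//	consistency-check: true
//	failure-delay-ms: 7000
//	failure-shuffle: true
//	failure-cases: ["KILL_ONE_FOLLOWER", "KILL_LEADER"]
//	stress-types: ["keys", "lease"]
//	stress-qps: 1000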

message Request {
  Operation Operation = 1;
  // Member contains the same Member object from the tester configuration.
  Member Member = 2;
  // Tester contains the tester configuration.
  Tester Tester = 3;
}

message Response {
  bool Success = 1;
  string Status = 2;
  // Member contains the same Member object from the tester request.
  Member Member = 3;
}
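
// A hedged agent-side sketch: Transport_TransportServer matches standard
// gRPC codegen for this service; applyOperation is a hypothetical helper
// that starts, restarts, or kills etcd per Request.Operation.
//
//	func (s *agentServer) Transport(stream rpcpb.Transport_TransportServer) error {
//		for {
//			req, err := stream.Recv()
//			if err != nil {
//				return err
//			}
//			resp := applyOperation(req)
//			if err = stream.Send(resp); err != nil {
//				return err
//			}
//		}
//	}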