version: bump up to 3.1.13

semaphore: run release test with v3.1.12
Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
2018-03-29 10:28:55 -07:00 · 2018-03-29 09:23:15 -07:00 · 2018-03-28 12:40:07 -07:00 · 2018-03-28 12:39:59 -07:00 · 2018-03-28 12:39:59 -07:00 · 2018-03-28 10:17:30 -07:00
667 changed files with 53783 additions and 12076 deletions
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -5,4 +5,4 @@ A good bug report has some very specific qualities, so please read over our shor

 To ask a question, go ahead and ignore this.

-[report_bugs]: ../Documentation/reporting_bugs.md
+[report_bugs]: https://github.com/coreos/etcd/blob/master/Documentation/reporting_bugs.md
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -2,4 +2,4 @@

 Please read our [contribution workflow][contributing] before submitting a pull request.

-[contributing]: ../CONTRIBUTING.md#contribution-flow
+[contributing]: https://github.com/coreos/etcd/blob/master/CONTRIBUTING.md#contribution-flow
--- a/.semaphore.sh
+++ b/.semaphore.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+TEST_SUFFIX=$(date +%s | base64 | head -c 15)
+
+TEST_OPTS="PASSES='build unit release integration_e2e functional' MANUAL_VER=v3.1.12"
+if [ "$TEST_ARCH" == "386" ]; then
+	TEST_OPTS="GOARCH=386 PASSES='build unit integration_e2e'"
+fi
+
+docker run \
+	--rm \
+	--volume=`pwd`:/go/src/github.com/coreos/etcd \
+	gcr.io/etcd-development/etcd-test:go1.8.7 \
+	/bin/bash -c "${TEST_OPTS} ./test 2>&1 | tee test-${TEST_SUFFIX}.log"
+
+! egrep "(--- FAIL:|panic: test timed out|appears to have leaked)" -B50 -A10 test-${TEST_SUFFIX}.log
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,27 +1,43 @@
-dist: trusty
 language: go
 go_import_path: github.com/coreos/etcd
-sudo: false
+
+sudo: required
+
+services: docker

 go:
-  - 1.7.3
-  - tip
+- "1.8.7"
+- tip
+
+notifications:
+  on_success: never
+  on_failure: never

 env:
-  global:
-   - GO15VENDOREXPERIMENT=1
  matrix:
-   - TARGET=amd64
-   - TARGET=arm64
-   - TARGET=arm
-   - TARGET=386
-   - TARGET=ppc64le
+  - TARGET=amd64
+  - TARGET=amd64-go-tip
+  - TARGET=darwin-amd64
+  - TARGET=windows-amd64
+  - TARGET=arm64
+  - TARGET=arm
+  - TARGET=386
+  - TARGET=ppc64le

 matrix:
  fast_finish: true
  allow_failures:
-    - go: tip
+  - go: tip
+    env: TARGET=amd64-go-tip
  exclude:
+  - go: "1.8.7"
+    env: TARGET=amd64-go-tip
+  - go: tip
+    env: TARGET=amd64
+  - go: tip
+    env: TARGET=darwin-amd64
+  - go: tip
+    env: TARGET=windows-amd64
  - go: tip
    env: TARGET=arm
  - go: tip
@@ -31,33 +47,43 @@ matrix:
  - go: tip
    env: TARGET=ppc64le

-addons:
-  apt:
-    packages:
-    - libpcap-dev
-    - libaspell-dev
-    - libhunspell-dev
-
 before_install:
- - go get -v github.com/chzchzchz/goword
- - go get -v honnef.co/go/simple/cmd/gosimple
- - go get -v honnef.co/go/unused/cmd/unused
+- if [[ $TRAVIS_GO_VERSION == 1.* ]]; then docker pull gcr.io/etcd-development/etcd-test:go${TRAVIS_GO_VERSION}; fi

-# disable godep restore override
 install:
- - pushd cmd/etcd && go get -t -v ./... && popd
+- pushd cmd/etcd && go get -t -v ./... && popd

 script:
+ - echo "TRAVIS_GO_VERSION=${TRAVIS_GO_VERSION}"
 - >
    case "${TARGET}" in
      amd64)
+        docker run --rm \
+          --volume=`pwd`:/go/src/github.com/coreos/etcd gcr.io/etcd-development/etcd-test:go${TRAVIS_GO_VERSION} \
+          /bin/bash -c "GOARCH=amd64 ./test"
+        ;;
+      amd64-go-tip)
        GOARCH=amd64 ./test
        ;;
+      darwin-amd64)
+        docker run --rm \
+          --volume=`pwd`:/go/src/github.com/coreos/etcd gcr.io/etcd-development/etcd-test:go${TRAVIS_GO_VERSION} \
+          /bin/bash -c "GO_BUILD_FLAGS='-a -v' GOOS=darwin GOARCH=amd64 ./build"
+        ;;
+      windows-amd64)
+        docker run --rm \
+          --volume=`pwd`:/go/src/github.com/coreos/etcd gcr.io/etcd-development/etcd-test:go${TRAVIS_GO_VERSION} \
+          /bin/bash -c "GO_BUILD_FLAGS='-a -v' GOOS=windows GOARCH=amd64 ./build"
+        ;;
      386)
-        GOARCH=386 PASSES="build unit" ./test
+        docker run --rm \
+          --volume=`pwd`:/go/src/github.com/coreos/etcd gcr.io/etcd-development/etcd-test:go${TRAVIS_GO_VERSION} \
+          /bin/bash -c "GOARCH=386 PASSES='build unit' ./test"
        ;;
      *)
        # test building out of gopath
-        GO_BUILD_FLAGS="-a -v" GOPATH="" GOARCH="${TARGET}" ./build
+        docker run --rm \
+          --volume=`pwd`:/go/src/github.com/coreos/etcd gcr.io/etcd-development/etcd-test:go${TRAVIS_GO_VERSION} \
+          /bin/bash -c "GO_BUILD_FLAGS='-a -v' GOARCH='${TARGET}' ./build"
        ;;
    esac
--- a/6
+++ b/6
@@ -5,6 +5,12 @@ ADD etcdctl /usr/local/bin/
 RUN mkdir -p /var/etcd/
 RUN mkdir -p /var/lib/etcd/

+# Alpine Linux doesn't use pam, which means that there is no /etc/nsswitch.conf,
+# but Golang relies on /etc/nsswitch.conf to check the order of DNS resolving
+# (see https://github.com/golang/go/commit/9dee7771f561cf6aee081c0af6658cc81fac3918)
+# To fix this we just create /etc/nsswitch.conf and add the following line:
+RUN echo 'hosts: files mdns4_minimal [NOTFOUND=return] dns mdns4' >> /etc/nsswitch.conf
+
 EXPOSE 2379 2380

 # Define default command.
--- a/57
+++ b/57
@@ -0,0 +1,57 @@
+FROM ubuntu:16.10
+
+RUN rm /bin/sh && ln -s /bin/bash /bin/sh
+RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
+
+RUN apt-get -y update \
+  && apt-get -y install \
+  build-essential \
+  gcc \
+  apt-utils \
+  pkg-config \
+  software-properties-common \
+  apt-transport-https \
+  libssl-dev \
+  sudo \
+  bash \
+  curl \
+  wget \
+  tar \
+  git \
+  netcat \
+  libaspell-dev \
+  libhunspell-dev \
+  hunspell-en-us \
+  aspell-en \
+  shellcheck \
+  && apt-get -y update \
+  && apt-get -y upgrade \
+  && apt-get -y autoremove \
+  && apt-get -y autoclean
+
+ENV GOROOT /usr/local/go
+ENV GOPATH /go
+ENV PATH ${GOPATH}/bin:${GOROOT}/bin:${PATH}
+ENV GO_VERSION REPLACE_ME_GO_VERSION
+ENV GO_DOWNLOAD_URL https://storage.googleapis.com/golang
+RUN rm -rf ${GOROOT} \
+  && curl -s ${GO_DOWNLOAD_URL}/go${GO_VERSION}.linux-amd64.tar.gz | tar -v -C /usr/local/ -xz \
+  && mkdir -p ${GOPATH}/src ${GOPATH}/bin \
+  && go version
+
+RUN mkdir -p ${GOPATH}/src/github.com/coreos/etcd
+WORKDIR ${GOPATH}/src/github.com/coreos/etcd
+
+ADD ./scripts/install-marker.sh /tmp/install-marker.sh
+
+RUN go get -v -u -tags spell github.com/chzchzchz/goword \
+  && go get -v -u github.com/coreos/license-bill-of-materials \
+  && go get -v -u honnef.co/go/tools/cmd/gosimple \
+  && go get -v -u honnef.co/go/tools/cmd/unused \
+  && go get -v -u honnef.co/go/tools/cmd/staticcheck \
+  && go get -v -u github.com/wadey/gocovmerge \
+  && go get -v -u github.com/gordonklaus/ineffassign \
+  && /tmp/install-marker.sh amd64 \
+  && rm -f /tmp/install-marker.sh \
+  && curl -s https://codecov.io/bash >/codecov \
+  && chmod 700 /codecov
--- a/Documentation/README.md
+++ b/Documentation/README.md
@@ -0,0 +1 @@
+docs.md
--- a/Documentation/benchmarks/etcd-2-1-0-alpha-benchmarks.md
+++ b/Documentation/benchmarks/etcd-2-1-0-alpha-benchmarks.md
@@ -49,4 +49,4 @@ Bootstrap another machine and use the [hey HTTP benchmark tool][hey] to send req
 | 256               | 256               | all servers        | 3061      | 119.3 |

 [hey]: https://github.com/rakyll/hey
-[hack-benchmark]: /hack/benchmark/
+[hack-benchmark]: https://github.com/coreos/etcd/tree/master/hack/benchmark
--- a/Documentation/benchmarks/etcd-2-2-0-rc-benchmarks.md
+++ b/Documentation/benchmarks/etcd-2-2-0-rc-benchmarks.md
@@ -69,4 +69,4 @@ Bootstrap another machine and use the [hey HTTP benchmark tool][hey] to send req
 [hey]: https://github.com/rakyll/hey
 [c7146bd5]: https://github.com/coreos/etcd/commits/c7146bd5f2c73716091262edc638401bb8229144
 [etcd-2.1-benchmark]: etcd-2-1-0-alpha-benchmarks.md
-[hack-benchmark]: /hack/benchmark/
+[hack-benchmark]: ../../hack/benchmark/
--- a/Documentation/benchmarks/etcd-3-demo-benchmarks.md
+++ b/Documentation/benchmarks/etcd-3-demo-benchmarks.md
@@ -39,4 +39,4 @@ The performance is nearly the same as the one with empty server handler.
 The performance with empty server handler is not affected by one put. So the
 performance downgrade should be caused by storage package.

-[etcd-v3-benchmark]: /tools/benchmark/
+[etcd-v3-benchmark]: ../../tools/benchmark/
--- a/Documentation/dev-guide/api_grpc_gateway.md
+++ b/Documentation/dev-guide/api_grpc_gateway.md
@@ -8,6 +8,8 @@ etcd v3 uses [gRPC][grpc] for its messaging protocol. The etcd project includes

 The gateway accepts a [JSON mapping][json-mapping] for etcd's [protocol buffer][api-ref] message definitions. Note that `key` and `value` fields are defined as byte arrays and therefore must be base64 encoded in JSON.

+Use `curl` to put and get a key:
+
 ```bash
 <<COMMENT
 https://www.base64encode.org/
@@ -17,11 +19,24 @@ COMMENT

 curl -L http://localhost:2379/v3alpha/kv/put \
 	-X POST -d '{"key": "Zm9v", "value": "YmFy"}'
+# {"header":{"cluster_id":"12585971608760269493","member_id":"13847567121247652255","revision":"2","raft_term":"3"}}

 curl -L http://localhost:2379/v3alpha/kv/range \
 	-X POST -d '{"key": "Zm9v"}'
+# {"header":{"cluster_id":"12585971608760269493","member_id":"13847567121247652255","revision":"2","raft_term":"3"},"kvs":[{"key":"Zm9v","create_revision":"2","mod_revision":"2","version":"1","value":"YmFy"}],"count":"1"}
 ```

+Use `curl` to watch a key:
+
+```bash
+curl http://localhost:2379/v3alpha/watch \
+        -X POST -d '{"create_request": {"key":"Zm9v"} }' &
+# {"result":{"header":{"cluster_id":"12585971608760269493","member_id":"13847567121247652255","revision":"1","raft_term":"2"},"created":true}}
+
+curl -L http://localhost:2379/v3alpha/kv/put \
+	-X POST -d '{"key": "Zm9v", "value": "YmFy"}' >/dev/null 2>&1
+# {"result":{"header":{"cluster_id":"12585971608760269493","member_id":"13847567121247652255","revision":"2","raft_term":"2"},"events":[{"kv":{"key":"Zm9v","create_revision":"2","mod_revision":"2","version":"1","value":"YmFy"}}]}}
+```

 ## Swagger

--- a/Documentation/dev-guide/experimental_apis.md
+++ b/Documentation/dev-guide/experimental_apis.md
@@ -1,8 +1,11 @@
 # Experimental APIs and features

-For the most part, the etcd project is stable, but we are still moving fast! We believe in the release fast philosophy. We want to get early feedback on features still in development and stabilizing. Thus, there are, and will be more, experimental features and APIs. We plan to improve these features based on the early feedback from the community, or abandon them if there is little interest, in the next few releases. If you are running a production system, please do not rely on any experimental features or APIs.
+For the most part, the etcd project is stable, but we are still moving fast! We believe in the release fast philosophy. We want to get early feedback on features still in development and stabilizing. Thus, there are, and will be more, experimental features and APIs. We plan to improve these features based on the early feedback from the community, or abandon them if there is little interest, in the next few releases. Please do not rely on any experimental features or APIs in a production environment.

 ## The current experimental API/features are:

- v3 auth API: expect to be stable in 3.1 release
- etcd gateway: expect to be stable in 3.1 release
+- [gateway][gateway]: beta, to be stable in 3.2 release
+- [gRPC proxy][grpc-proxy]: alpha, to be stable in 3.2 release
+
+[gateway]: ../op-guide/gateway.md
+[grpc-proxy]: ../op-guide/grpc_proxy.md
--- a/Documentation/dev-guide/interacting_v3.md
+++ b/Documentation/dev-guide/interacting_v3.md
@@ -51,6 +51,7 @@ Suppose the etcd cluster has stored the following keys:
 ```bash
 foo = bar
 foo1 = bar1
+foo2 = bar2
 foo3 = bar3
 ```

@@ -77,22 +78,38 @@ $ etcdctl get foo --print-value-only
 bar
 ```

-Here is the command to range over the keys from `foo` to `foo9`:
+Here is the command to range over the keys from `foo` to `foo3`:

 ```bash
-$ etcdctl get foo foo9
+$ etcdctl get foo foo3
 foo
 bar
 foo1
 bar1
+foo2
+bar2
+```
+
+Note that `foo3` is excluded since the range is over the half-open interval `[foo, foo3)`.
+
+Here is the command to range over all keys prefixed with `foo`:
+
+```bash
+$ etcdctl get --prefix foo
+foo
+bar
+foo1
+bar1
+foo2
+bar2
 foo3
 bar3
 ```

-Here is the command to range over the keys from `foo` to `foo9` limiting the number of results to 2:
+Here is the command to range over all keys prefixed with `foo`, limiting the number of results to 2:

 ```bash
-$ etcdctl get foo foo9 --limit 2
+$ etcdctl get --prefix --limit=2 foo
 foo
 bar
 foo1
@@ -116,29 +133,29 @@ foo1 = bar1_new   # revision = 5
 Here are an example to access the past versions of keys:

 ```bash
-$ etcdctl get foo foo9 # access the most recent versions of keys
+$ etcdctl get --prefix foo # access the most recent versions of keys
 foo
 bar_new
 foo1
 bar1_new

-$ etcdctl get --rev=4 foo foo9 # access the versions of keys at revision 4
+$ etcdctl get --prefix --rev=4 foo # access the versions of keys at revision 4
 foo
 bar_new
 foo1
 bar1

-$ etcdctl get --rev=3 foo foo9 # access the versions of keys at revision 3
+$ etcdctl get --prefix --rev=3 foo # access the versions of keys at revision 3
 foo
 bar
 foo1
 bar1

-$ etcdctl get --rev=2 foo foo9 # access the versions of keys at revision 2
+$ etcdctl get --prefix --rev=2 foo # access the versions of keys at revision 2
 foo
 bar

-$ etcdctl get --rev=1 foo foo9 # access the versions of keys at revision 1
+$ etcdctl get --prefix --rev=1 foo # access the versions of keys at revision 1
 ```

 ## Read keys which are greater than or equal to the byte value of the specified key
@@ -454,4 +471,5 @@ lease 694d5765fc71500b granted with TTL(500s), remaining(132s), attached keys([z

 # if the lease has expired or does not exist it will give the below response:
 Error:  etcdserver: requested lease not found
-```
+```
+
--- a/Documentation/dev-guide/limit.md
+++ b/Documentation/dev-guide/limit.md
@@ -0,0 +1,10 @@
+# System limits
+
+## Request size limit
+
+etcd is designed to handle small key value pairs typical for metadata. Larger requests will work, but may increase the latency of other requests. For the time being, etcd guarantees to support RPC requests with up to 1MB of data. In the future, the size limit may be loosened or made configurable.
+
+## Storage size limit
+
+The default storage size limit is 2GB, configurable with `--quota-backend-bytes` flag; supports up to 8GB.
+
--- a/Documentation/dev-guide/local_cluster.md
+++ b/Documentation/dev-guide/local_cluster.md
@@ -45,7 +45,7 @@ To interact with the started cluster by using etcdctl:
 # use API version 3
 $ export ETCDCTL_API=3

-$ etcdctl --write-out=table --endpoints=localhost:12379 member list
+$ etcdctl --write-out=table --endpoints=localhost:2379 member list
 +------------------+---------+--------+------------------------+------------------------+
 |        ID        | STATUS  |  NAME  |       PEER ADDRS       |      CLIENT ADDRS      |
 +------------------+---------+--------+------------------------+------------------------+
--- a/Documentation/dev-internal/logging.md
+++ b/Documentation/dev-internal/logging.md
@@ -3,7 +3,7 @@
 etcd uses the [capnslog][capnslog] library for logging application output categorized into *levels*. A log message's level is determined according to these conventions:

 * Error: Data has been lost, a request has failed for a bad reason, or a required resource has been lost
-  * Examples: 
+  * Examples:
    * A failure to allocate disk space for WAL

 * Warning: (Hopefully) Temporary conditions that may cause errors, but may work fine. A replica disappearing (that may reconnect) is a warning.
@@ -26,4 +26,4 @@ etcd uses the [capnslog][capnslog] library for logging application output catego
    * Send a normal message to a remote peer
    * Write a log entry to disk

-[capnslog]: [https://github.com/coreos/pkg/tree/master/capnslog]
+[capnslog]: https://github.com/coreos/pkg/tree/master/capnslog
--- a/Documentation/dl_build.md
+++ b/Documentation/dl_build.md
@@ -10,29 +10,44 @@ The easiest way to get etcd is to use one of the pre-built release binaries whic

 ## Build the latest version

-For those wanting to try the very latest version, build etcd from the `master` branch.
-[Go](https://golang.org/) version 1.6+ (with HTTP2 support) is required to build the latest version of etcd.
-etcd vendors its dependency for official release binaries, while making vendoring optional to avoid import conflicts.
-[`build` script][build-script] would automatically include the vendored dependencies from [`cmd`][cmd-directory] directory.  
+For those wanting to try the very latest version, build etcd from the `master` branch. [Go](https://golang.org/) version 1.7+ is required to build the latest version of etcd. To ensure etcd is built against well-tested libraries, etcd vendors its dependencies for official release binaries. However, etcd's vendoring is also optional to avoid potential import conflicts when embedding the etcd server or using the etcd client.

-Here are the commands to build an etcd binary from the `master` branch:
+First, confirm go 1.7+ is installed:

-```
+```sh
 # go is required
 $ go version
-go version go1.6 darwin/amd64
+go version go1.7.3 darwin/amd64

-# GOPATH should be set correctly
-$ echo $GOPATH
-/Users/example/go
+```

-$ mkdir -p $GOPATH/src/github.com/coreos
-$ cd $GOPATH/src/github.com/coreos
+To build `etcd` from the `master` branch without a `GOPATH` using the official `build` script:
+
+```sh
 $ git clone https://github.com/coreos/etcd.git
 $ cd etcd
 $ ./build
 $ ./bin/etcd
-...
+```
+
+To build a vendored `etcd` from the `master` branch via `go get`:
+
+```sh
+# GOPATH should be set
+$ echo $GOPATH
+/Users/example/go
+$ go get github.com/coreos/etcd/cmd/etcd
+$ $GOPATH/bin/etcd
+```
+
+To build `etcd` from the `master` branch without vendoring (may not build due to upstream conflicts):
+
+```sh
+# GOPATH should be set
+$ echo $GOPATH
+/Users/example/go
+$ go get github.com/coreos/etcd
+$ $GOPATH/bin/etcd
 ```

 ## Test the installation
--- a/Documentation/docs.md
+++ b/Documentation/docs.md
@@ -17,6 +17,7 @@ The easiest way to get started using etcd as a distributed key-value store is to
 - [gRPC naming and discovery][grpc_naming]
 - [Embedding etcd][embed_etcd]
 - [Experimental features and APIs][experimental]
+ - [System limits][system-limit]

 ## Operating etcd clusters

@@ -26,6 +27,7 @@ Administrators who need to create reliable and scalable key-value stores for the
 - [Setting up etcd gateways][gateway]
 - [Setting up etcd gRPC proxy (pre-alpha)][grpc_proxy]
 - [Run etcd clusters inside containers][container]
+ - [Hardware recommendations][hardware]
 - [Configuration][conf]
 - [Security][security]
 - [Monitoring][monitoring]
@@ -40,7 +42,7 @@ Administrators who need to create reliable and scalable key-value stores for the

 To learn more about the concepts and internals behind etcd, read the following pages:

- - Why etcd (TODO)
+ - [Why etcd][why] (TODO)
 - [Understand data model][data_model]
 - [Understand APIs][understand_apis]
 - [Glossary][glossary]
@@ -50,13 +52,19 @@ To learn more about the concepts and internals behind etcd, read the following p

 - [Migrate applications from using API v2 to API v3][v2_migration]
 - [Updating v2.3 to v3.0][v3_upgrade]
+ - [Updating v3.0 to v3.1][v31_upgrade]

-## Troubleshooting
+## Frequently Asked Questions (FAQ)
+
+Answers to [common questions] about etcd.

 [api_ref]: dev-guide/api_reference_v3.md
 [api_grpc_gateway]: dev-guide/api_grpc_gateway.md
 [clustering]: op-guide/clustering.md
 [conf]: op-guide/configuration.md
+[system-limit]: dev-guide/limit.md
+[common questions]: faq.md
+[why]: learning/why.md
 [data_model]: learning/data_model.md
 [demo]: demo.md
 [download_build]: dl_build.md
@@ -66,6 +74,7 @@ To learn more about the concepts and internals behind etcd, read the following p
 [gateway]: op-guide/gateway.md
 [glossary]: learning/glossary.md
 [grpc_proxy]: op-guide/grpc_proxy.md
+[hardware]: op-guide/hardware.md
 [interacting]: dev-guide/interacting_v3.md
 [local_cluster]: dev-guide/local_cluster.md
 [performance]: op-guide/performance.md
@@ -80,3 +89,4 @@ To learn more about the concepts and internals behind etcd, read the following p
 [supported_platform]: op-guide/supported-platform.md
 [experimental]: dev-guide/experimental_apis.md
 [v3_upgrade]: upgrades/upgrade_3_0.md
+[v31_upgrade]: upgrades/upgrade_3_1.md
--- a/Documentation/faq.md
+++ b/Documentation/faq.md
@@ -0,0 +1,128 @@
+## Frequently Asked Questions (FAQ)
+
+### etcd, general
+
+#### Do clients have to send requests to the etcd leader?
+
+[Raft][raft] is leader-based; the leader handles all client requests which need cluster consensus. However, the client does not need to know which node is the leader. Any request that requires consensus sent to a follower is automatically forwarded to the leader. Requests that do not require consensus (e.g., serialized reads) can be processed by any cluster member.
+
+### Configuration
+
+#### What is the difference between advertise-urls and listen-urls?
+
+`listen-urls` specifies the local addresses etcd server binds to for accepting incoming connections. To listen on a port for all interfaces, specify `0.0.0.0` as the listen IP address.
+
+`advertise-urls` specifies the addresses etcd clients or other etcd members should use to contact the etcd server. The advertise addresses must be reachable from the remote machines. Do not advertise addresses like `localhost` or `0.0.0.0` for a production setup since these addresses are unreachable from remote machines.
+
+### Deployment
+
+#### System requirements
+
+Since etcd writes data to disk, SSD is highly recommended. To prevent performance degradation or unintentionally overloading the key-value store, etcd enforces a 2GB default storage size quota, configurable up to 8GB. To avoid swapping or running out of memory, the machine should have at least as much RAM to cover the quota. At CoreOS, an etcd cluster is usually deployed on dedicated CoreOS Container Linux machines with dual-core processors, 2GB of RAM, and 80GB of SSD *at the very least*. **Note that performance is intrinsically workload dependent; please test before production deployment**. See [hardware][hardware-setup] for more recommendations.
+
+The most stable production environment is the Linux operating system on the amd64 architecture; see [supported platform][supported-platform] for more.
+
+#### Why an odd number of cluster members?
+
+An etcd cluster needs a majority of nodes, a quorum, to agree on updates to the cluster state. For a cluster with n members, quorum is (n/2)+1. For any odd-sized cluster, adding one node will always increase the number of nodes necessary for quorum. Although adding a node to an odd-sized cluster appears better since there are more machines, the fault tolerance is worse since exactly the same number of nodes may fail without losing quorum but there are more nodes that can fail. If the cluster is in a state where it can't tolerate any more failures, adding a node before removing nodes is dangerous because if the new node fails to register with the cluster (e.g., the address is misconfigured), quorum will be permanently lost.
+
+#### What is maximum cluster size?
+
+Theoretically, there is no hard limit. However, an etcd cluster probably should have no more than seven nodes. [Google Chubby lock service][chubby], similar to etcd and widely deployed within Google for many years, suggests running five nodes. A 5-member etcd cluster can tolerate two member failures, which is enough in most cases. Although larger clusters provide better fault tolerance, the write performance suffers because data must be replicated across more machines.
+
+#### What is failure tolerance?
+
+An etcd cluster operates so long as a member quorum can be established. If quorum is lost through transient network failures (e.g., partitions), etcd automatically and safely resumes once the network recovers and restores quorum; Raft enforces cluster consistency. For power loss, etcd persists the Raft log to disk; etcd replays the log to the point of failure and resumes cluster participation. For permanent hardware failure, the node may be removed from the cluster through [runtime reconfiguration][runtime reconfiguration].
+
+It is recommended to have an odd number of members in a cluster. An odd-size cluster tolerates the same number of failures as an even-size cluster but with fewer nodes. The difference can be seen by comparing even and odd sized clusters:
+
+| Cluster Size | Majority | Failure Tolerance |
+|:-:|:-:|:-:|
+| 1 | 1 | 0 |
+| 2 | 2 | 0 |
+| 3 | 2 | 1 |
+| 4 | 3 | 1 |
+| 5 | 3 | 2 |
+| 6 | 4 | 2 |
+| 7 | 4 | 3 |
+| 8 | 5 | 3 |
+| 9 | 5 | 4 |
+
+Adding a member to bring the size of cluster up to an even number doesn't buy additional fault tolerance. Likewise, during a network partition, an odd number of members guarantees that there will always be a majority partition that can continue to operate and be the source of truth when the partition ends.
+
+#### Does etcd work in cross-region or cross data center deployments?
+
+Deploying etcd across regions improves etcd's fault tolerance since members are in separate failure domains. The cost is higher consensus request latency from crossing data center boundaries. Since etcd relies on a member quorum for consensus, the latency from crossing data centers will be somewhat pronounced because at least a majority of cluster members must respond to consensus requests. Additionally, cluster data must be replicated across all peers, so there will be bandwidth cost as well.
+
+With longer latencies, the default etcd configuration may cause frequent elections or heartbeat timeouts. See [tuning] for adjusting timeouts for high latency deployments.
+
+### Operation
+
+#### How to back up an etcd cluster?
+
+etcdctl provides a `snapshot` command to create backups. See [backup][backup] for more details.
+
+#### Should I add a member before removing an unhealthy member?
+
+When replacing an etcd node, it's important to remove the member first and then add its replacement.
+
+etcd employs distributed consensus based on a quorum model; (n/2)+1 members, a majority, must agree on a proposal before it can be committed to the cluster. These proposals include key-value updates and membership changes. This model totally avoids any possibility of split brain inconsistency. The downside is permanent quorum loss is catastrophic.
+
+How this applies to membership: If a 3-member cluster has 1 downed member, it can still make forward progress because the quorum is 2 and 2 members are still live. However, adding a new member to a 3-member cluster will increase the quorum to 3 because 3 votes are required for a majority of 4 members. Since the quorum increased, this extra member buys nothing in terms of fault tolerance; the cluster is still one node failure away from being unrecoverable.
+
+Additionally, that new member is risky because it may turn out to be misconfigured or incapable of joining the cluster. In that case, there's no way to recover quorum because the cluster has two members down and two members up, but needs three votes to change membership to undo the botched membership addition. etcd will by default reject member add attempts that could take down the cluster in this manner.
+
+On the other hand, if the downed member is removed from cluster membership first, the number of members becomes 2 and the quorum remains at 2. Following that removal by adding a new member will also keep the quorum steady at 2. So, even if the new node can't be brought up, it's still possible to remove the new member through quorum on the remaining live members.
+
+#### Why won't etcd accept my membership changes?
+
+etcd sets `strict-reconfig-check` in order to reject reconfiguration requests that would cause quorum loss. Abandoning quorum is really risky (especially when the cluster is already unhealthy). Although it may be tempting to disable quorum checking if there's quorum loss to add a new member, this could lead to full fledged cluster inconsistency. For many applications, this will make the problem even worse ("disk geometry corruption" being a candidate for most terrifying).
+
+### Performance
+
+#### How should I benchmark etcd?
+
+Try the [benchmark] tool. Current [benchmark results][benchmark-result] are available for comparison.
+
+#### What does the etcd warning "apply entries took too long" mean?
+
+After a majority of etcd members agree to commit a request, each etcd server applies the request to its data store and persists the result to disk. Even with a slow mechanical disk or a virtualized network disk, such as Amazon’s EBS or Google’s PD, applying a request should normally take fewer than 50 milliseconds. If the average apply duration exceeds 100 milliseconds, etcd will warn that entries are taking too long to apply.
+ 
+Usually this issue is caused by a slow disk. The disk could be experiencing contention among etcd and other applications, or the disk is simply too slow (e.g., a shared virtualized disk). To rule out a slow disk from causing this warning, monitor [backend_commit_duration_seconds][backend_commit_metrics] (p99 duration should be less than 25ms) to confirm the disk is reasonably fast. If the disk is too slow, assigning a dedicated disk to etcd or using a faster disk will typically solve the problem.
+
+The second most common cause is CPU starvation. If monitoring of the machine’s CPU usage shows heavy utilization, there may not be enough compute capacity for etcd. Moving etcd to a dedicated machine, increasing process resource isolation with cgroups, or renicing the etcd server process into a higher priority can usually solve the problem.
+
+Expensive user requests which access too many keys (e.g., fetching the entire keyspace) can also cause long apply latencies. Accessing fewer than several hundred keys per request, however, should always be performant.
+
+If none of the above suggestions clear the warnings, please [open an issue][new_issue] with detailed logging, monitoring, metrics and optionally workload information.
+
+#### What does the etcd warning "failed to send out heartbeat on time" mean?
+
+etcd uses a leader-based consensus protocol for consistent data replication and log execution. Cluster members elect a single leader, all other members become followers. The elected leader must periodically send heartbeats to its followers to maintain its leadership. Followers infer leader failure if no heartbeats are received within an election interval and trigger an election. If a leader doesn’t send its heartbeats in time but is still running, the election is spurious and likely caused by insufficient resources. To catch these soft failures, if the leader skips two heartbeat intervals, etcd will warn it failed to send a heartbeat on time.
+
+Usually this issue is caused by a slow disk. Before the leader sends heartbeats attached with metadata, it may need to persist the metadata to disk. The disk could be experiencing contention among etcd and other applications, or the disk is simply too slow (e.g., a shared virtualized disk). To rule out a slow disk from causing this warning, monitor [wal_fsync_duration_seconds][wal_fsync_duration_seconds] (p99 duration should be less than 10ms) to confirm the disk is reasonably fast. If the disk is too slow, assigning a dedicated disk to etcd or using a faster disk will typically solve the problem.
+
+The second most common cause is CPU starvation. If monitoring of the machine’s CPU usage shows heavy utilization, there may not be enough compute capacity for etcd. Moving etcd to a dedicated machine, increasing process resource isolation with cgroups, or renicing the etcd server process into a higher priority can usually solve the problem.
+
+A slow network can also cause this issue. If network metrics among the etcd machines show long latencies or a high drop rate, there may not be enough network capacity for etcd. Moving etcd members to a less congested network will typically solve the problem. However, if the etcd cluster is deployed across data centers, long latency between members is expected. For such deployments, tune the `heartbeat-interval` configuration to roughly match the round trip time between the machines, and the `election-timeout` configuration to be at least 5 * `heartbeat-interval`. See [tuning documentation][tuning] for detailed information.
+
+If none of the above suggestions clear the warnings, please [open an issue][new_issue] with detailed logging, monitoring, metrics and optionally workload information.
+
+#### What does the etcd warning "request ignored (cluster ID mismatch)" mean?
+
+Every new etcd cluster generates a new cluster ID based on the initial cluster configuration and a user-provided unique `initial-cluster-token` value. By having unique cluster IDs, etcd is protected from cross-cluster interaction which could corrupt the cluster.
+
+Usually this warning happens after tearing down an old cluster, then reusing some of the peer addresses for the new cluster. If any etcd process from the old cluster is still running it will try to contact the new cluster. The new cluster will recognize a cluster ID mismatch, then ignore the request and emit this warning. This warning is often cleared by ensuring peer addresses among distinct clusters are disjoint.
+
+[hardware-setup]: ./op-guide/hardware.md
+[supported-platform]: ./op-guide/supported-platform.md
+[wal_fsync_duration_seconds]: ./metrics.md#disk
+[tuning]: ./tuning.md
+[new_issue]: https://github.com/coreos/etcd/issues/new
+[backend_commit_metrics]: ./metrics.md#disk
+[raft]: https://raft.github.io/raft.pdf
+[backup]: https://github.com/coreos/etcd/blob/master/Documentation/op-guide/recovery.md#snapshotting-the-keyspace
+[chubby]: http://static.googleusercontent.com/media/research.google.com/en//archive/chubby-osdi06.pdf
+[runtime reconfiguration]: https://github.com/coreos/etcd/blob/master/Documentation/op-guide/runtime-configuration.md
+[benchmark]: https://github.com/coreos/etcd/tree/master/tools/benchmark
+[benchmark-result]: https://github.com/coreos/etcd/blob/master/Documentation/op-guide/performance.md
--- a/Documentation/learning/why.md
+++ b/Documentation/learning/why.md
@@ -0,0 +1,21 @@
+# Why etcd
+
+The name "etcd" originated from two ideas, the unix "/etc" folder and "d"istributed systems. The "/etc" folder is a place to store configuration data for a single system whereas etcd stores configuration information for large scale distributed systems. Hence, a "d"istributed "/etc" is "etcd".
+
+etcd stores metadata in a consistent and fault-tolerant way. Distributed systems use etcd as a consistent key-value store for configuration management, service discovery, and coordinating distributed work. Common distributed patterns using etcd include leader election, [distributed locks][etcd-concurrency], and monitoring machine liveness.
+
+## Use cases
+
+- Container Linux by CoreOS: Applications running on [Container Linux][container-linux] get automatic, zero-downtime Linux kernel updates. Container Linux uses [locksmith] to coordinate updates. locksmith implements a distributed semaphore over etcd to ensure only a subset of a cluster is rebooting at any given time.
+- [Kubernetes][kubernetes] stores configuration data into etcd for service discovery and cluster management; etcd's consistency is crucial for correctly scheduling and operating services. The Kubernetes API server persists cluster state into etcd. It uses etcd's watch API to monitor the cluster and roll out critical configuration changes.
+
+
+## Features and system comparisons
+
+TODO
+
+[etcd-concurrency]: https://godoc.org/github.com/coreos/etcd/clientv3/concurrency
+[container-linux]: https://coreos.com/why
+[locksmith]: https://github.com/coreos/locksmith
+[kubernetes]: http://kubernetes.io/docs/whatisk8s
+
--- a/Documentation/libraries-and-tools.md
+++ b/Documentation/libraries-and-tools.md
@@ -14,6 +14,7 @@
 - [etcdtool](https://github.com/mickep76/etcdtool) - Export/Import/Edit etcd directory as JSON/YAML/TOML and Validate directory using JSON schema
 - [etcd-rest](https://github.com/mickep76/etcd-rest) - Create generic REST API in Go using etcd as a backend with validation using JSON schema
 - [etcdsh](https://github.com/kamilhark/etcdsh) - A command line client with support of command history and tab completion. Supports v2
+- [etcdloadtest](https://github.com/sinsharat/etcdloadtest) - A command line load test client for etcd version 3.0 and above.

 **Go libraries**

@@ -34,6 +35,7 @@
 **Scala libraries**

 - [maciej/etcd-client](https://github.com/maciej/etcd-client) - Supports v2. Akka HTTP-based fully async client
+- [eiipii/etcdhttpclient](https://bitbucket.org/eiipii/etcdhttpclient) - Supports v2. Async HTTP client based on Netty and Scala Futures.

 **Python libraries**

@@ -122,7 +124,9 @@
 **Projects using etcd**

 - [binocarlos/yoda](https://github.com/binocarlos/yoda) - etcd + ZeroMQ
+- [blox/blox](https://github.com/blox/blox) - a collection of open source projects for container management and orchestration with AWS ECS
 - [calavera/active-proxy](https://github.com/calavera/active-proxy) - HTTP Proxy configured with etcd
+- [chain/chain](https://github.com/chain/chain) - software designed to operate and connect to highly scalable permissioned blockchain networks 
 - [derekchiang/etcdplus](https://github.com/derekchiang/etcdplus) - A set of distributed synchronization primitives built upon etcd
 - [go-discover](https://github.com/flynn/go-discover) - service discovery in Go
 - [gleicon/goreman](https://github.com/gleicon/goreman/tree/etcd) - Branch of the Go Foreman clone with etcd support
--- a/Documentation/op-guide/clustering.md
+++ b/Documentation/op-guide/clustering.md
@@ -83,7 +83,7 @@ A cluster using self-signed certificates both encrypts traffic and authenticates
 On each machine, etcd would be started with these flags:

 ```
-$ etcd --name infra0 --initial-advertise-peer-urls http://10.0.1.10:2380 \
+$ etcd --name infra0 --initial-advertise-peer-urls https://10.0.1.10:2380 \
  --listen-peer-urls https://10.0.1.10:2380 \
  --listen-client-urls https://10.0.1.10:2379,https://127.0.0.1:2379 \
  --advertise-client-urls https://10.0.1.10:2379 \
@@ -475,5 +475,5 @@ To setup an etcd cluster with proxies of v2 API, please read the the [clustering
 [proxy]: https://github.com/coreos/etcd/blob/release-2.3/Documentation/proxy.md
 [clustering_etcd2]: https://github.com/coreos/etcd/blob/release-2.3/Documentation/clustering.md
 [security-guide]: security.md
-[tls-setup]: /hack/tls-setup
+[tls-setup]: ../../hack/tls-setup
 [gateway]: gateway.md
--- a/Documentation/op-guide/configuration.md
+++ b/Documentation/op-guide/configuration.md
@@ -247,7 +247,7 @@ The security flags help to [build a secure etcd cluster][security].
 + env variable: ETCD_DEBUG

 ### --log-package-levels
-+ Set individual etcd subpackages to specific log levels. An example being `etcdserver=WARNING,security=DEBUG` 
+ Set individual etcd subpackages to specific log levels. An example being `etcdserver=WARNING,security=DEBUG`
 + default: none (INFO for all packages)
 + env variable: ETCD_LOG_PACKAGE_LEVELS

@@ -279,10 +279,14 @@ Follow the instructions when using these flags.
 + Enable runtime profiling data via HTTP server. Address is at client URL + "/debug/pprof/"
 + default: false

+### --metrics
+ Set level of detail for exported metrics, specify 'extensive' to include histogram metrics.
+ default: basic
+
 [build-cluster]: clustering.md#static
 [reconfig]: runtime-configuration.md
 [discovery]: clustering.md#discovery
-[iana-ports]: https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml?search=etcd
+[iana-ports]: http://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.txt
 [proxy]: ../v2/proxy.md
 [restore]: ../v2/admin_guide.md#restoring-a-backup
 [security]: security.md
--- a/Documentation/op-guide/container.md
+++ b/Documentation/op-guide/container.md
@@ -57,7 +57,7 @@ sudo rkt run --net=default:IP=${NODE3} coreos.com/etcd:v3.0.6 -- -name=node3 -ad
 Verify the cluster is healthy and can be reached.

 ```
-ETCDCTL_API=3 etcdctl --endpoints=http://172.16.28.21:2379,http://172.16.28.22:2379,http://172.16.28.23:2379 endpoint-health
+ETCDCTL_API=3 etcdctl --endpoints=http://172.16.28.21:2379,http://172.16.28.22:2379,http://172.16.28.23:2379 endpoint health
 ```

 ### DNS
--- a/Documentation/op-guide/grpc_proxy.md
+++ b/Documentation/op-guide/grpc_proxy.md
@@ -1,6 +1,6 @@
 # gRPC proxy

-*This is a pre-alpha feature, we are looking for early feedback.*
+*This is an alpha feature, we are looking for early feedback.*

 The gRPC proxy is a stateless etcd reverse proxy operating at the gRPC layer (L7). The proxy is designed to reduce the total processing load on the core etcd cluster. For horizontal scalability, it coalesces watch and lease API requests. To protect the cluster against abusive clients, it caches key range requests.

@@ -36,9 +36,9 @@ watch key A ^     ^ watch key A    |

 To effectively coalesce multiple client watchers into a single watcher, the gRPC proxy coalesces new `c-watchers` into an existing `s-watcher` when possible. This coalesced `s-watcher` may be out of sync with the etcd server due to network delays or buffered undelivered events. When the watch revision is unspecified, the gRPC proxy will not guarantee the `c-watcher` will start watching from the most recent store revision. For example, if a client watches from an etcd server with revision 1000, that watcher will begin at revision 1000. If a client watches from the gRPC proxy, may begin watching from revision 990.

-Similar limitations apply to cancellation. When the watcher is cancelled, the etcd server’s revision may be greater than the cancellation response revision. 
+Similar limitations apply to cancellation. When the watcher is cancelled, the etcd server’s revision may be greater than the cancellation response revision.

-These two limitations should not cause problems for most use cases. In the future, there may be additional options to force the watcher to bypass the gRPC proxy for more accurate revision responses. 
+These two limitations should not cause problems for most use cases. In the future, there may be additional options to force the watcher to bypass the gRPC proxy for more accurate revision responses.

 ## Scalable lease API

@@ -75,3 +75,4 @@ $ ETCDCTL_API=3 ./etcdctl --endpoints=127.0.0.1:2379 get foo
 foo
 bar
 ```
+
--- a/Documentation/op-guide/hardware.md
+++ b/Documentation/op-guide/hardware.md
@@ -0,0 +1,93 @@
+# Hardware recommendations
+
+etcd usually runs well with limited resources for development or testing purposes; it’s common to develop with etcd on a laptop or a cheap cloud machine. However, when running etcd clusters in production, some hardware guidelines are useful for proper administration. These suggestions are not hard rules; they serve as a good starting point for a robust production deployment. As always, deployments should be tested with simulated workloads before running in production.
+
+## CPUs
+
+Few etcd deployments require a lot of CPU capacity. Typical clusters need two to four cores to run smoothly.
+Heavily loaded etcd deployments, serving thousands of clients or tens of thousands of requests per second, tend to be CPU bound since etcd can serve requests from memory. Such heavy deployments usually need eight to sixteen dedicated cores.
+
+
+## Memory
+
+etcd has a relatively small memory footprint but its performance still depends on having enough memory. An etcd server will aggressively cache key-value data and spends most of the rest of its memory tracking watchers. Typically 8GB is enough. For heavy deployments with thousands of watchers and millions of keys, allocate 16GB to 64GB memory accordingly.
+
+
+## Disks
+
+Fast disks are the most critical factor for etcd deployment performance and stability. 
+
+A slow disk will increase etcd request latency and potentially hurt cluster stability. Since etcd’s consensus protocol depends on persistently storing metadata to a log, a majority of etcd cluster members must write every request down to disk. Additionally, etcd will also incrementally checkpoint its state to disk so it can truncate this log. If these writes take too long, heartbeats may time out and trigger an election, undermining the stability of the cluster.
+
+etcd is very sensitive to disk write latency. Typically 50 sequential IOPS (e.g., a 7200 RPM disk) is required. For heavily loaded clusters, 500 sequential IOPS (e.g., a typical local SSD or a high performance virtualized block device) is recommended. Note that most cloud providers publish concurrent IOPS rather than sequential IOPS; the published concurrent IOPS can be 10x greater than the sequential IOPS. To measure actual sequential IOPS, we suggest using a disk benchmarking tool such as [diskbench][diskbench] or [fio][fio].
+
+etcd requires only modest disk bandwidth but more disk bandwidth buys faster recovery times when a failed member has to catch up with the cluster. Typically 10MB/s will recover 100MB data within 15 seconds. For large clusters, 100MB/s or higher is suggested for recovering 1GB data within 15 seconds.
+
+When possible, back etcd’s storage with an SSD. An SSD usually provides lower write latencies with less variance than a spinning disk, thus improving the stability and reliability of etcd. If using spinning disk, get the fastest disks possible (15,000 RPM). Using RAID 0 is also an effective way to increase disk speed, for both spinning disks and SSD. With at least three cluster members, mirroring and/or parity variants of RAID are unnecessary; etcd's consistent replication already gets high availability.
+
+
+## Network
+
+Multi-member etcd deployments benefit from a fast and reliable network. In order for etcd to be both consistent and partition tolerant, an unreliable network with partitioning outages will lead to poor availability. Low latency ensures etcd members can communicate fast. High bandwidth can reduce the time to recover a failed etcd member. 1GbE is sufficient for common etcd deployments. For large etcd clusters, a 10GbE network will reduce mean time to recovery.
+
+Deploy etcd members within a single data center when possible to avoid latency overheads and lessen the possibility of partitioning events. If a failure domain in another data center is required, choose a data center closer to the existing one. Please also read the [tuning][tuning] documentation for more information on cross data center deployment.
+
+
+## Example hardware configurations
+
+Here are a few example hardware setups on AWS and GCE environments. As mentioned before, but must be stressed regardless, administrators should test an etcd deployment with a simulated workload before putting it into production.
+
+Note that these configurations assume these machines are totally dedicated to etcd. Running other applications along with etcd on these machines may cause resource contention and lead to cluster instability.
+
+### Small cluster
+
+A small cluster serves fewer than 100 clients, fewer than 200 requests per second, and stores no more than 100MB of data.
+
+Example application workload: A 50-node Kubernetes cluster
+
+| Provider | Type | vCPUs | Memory (GB) | Max concurrent IOPS | Disk bandwidth (MB/s) |
+|----------|------|-------|--------|------|----------------|
+| AWS | m4.large | 2 | 8 | 3600 | 56.25 |
+| GCE | n1-standard-2 + 50GB PD SSD | 2 | 7.5 | 1500 | 25 |
+
+
+### Medium cluster
+
+A medium cluster serves fewer than 500 clients, fewer than 1,000 requests per second, and stores no more than 500MB of data.
+
+Example application workload: A 250-node Kubernetes cluster
+
+| Provider | Type | vCPUs | Memory (GB) | Max concurrent IOPS | Disk bandwidth (MB/s) |
+|----------|------|-------|--------|------|----------------|
+| AWS | m4.xlarge | 4 | 16 | 6000 | 93.75 |
+| GCE | n1-standard-4 + 150GB PD SSD | 4 | 15 | 4500 | 75 |
+
+
+### Large cluster
+
+A large cluster serves fewer than 1,500 clients, fewer than 10,000 requests per second, and stores no more than 1GB of data.
+
+Example application workload: A 1,000-node Kubernetes cluster
+
+| Provider | Type | vCPUs | Memory (GB) | Max concurrent IOPS | Disk bandwidth (MB/s) |
+|----------|------|-------|--------|------|----------------|
+| AWS | m4.2xlarge | 8 | 32 | 8000 | 125 |
+| GCE | n1-standard-8 + 250GB PD SSD | 8 | 30 | 7500 | 125 |
+
+
+### xLarge cluster
+
+An xLarge cluster serves more than 1,500 clients, more than 10,000 requests per second, and stores more than 1GB of data.
+
+Example application workload: A 3,000-node Kubernetes cluster
+
+| Provider | Type | vCPUs | Memory (GB) | Max concurrent IOPS | Disk bandwidth (MB/s) |
+|----------|------|-------|--------|------|----------------|
+| AWS | m4.4xlarge | 16 | 64 | 16,000 | 250 |
+| GCE | n1-standard-16 + 500GB PD SSD | 16 | 60 | 15,000 | 250 |
+
+
+[diskbench]: https://github.com/ongardie/diskbenchmark
+[fio]: https://github.com/axboe/fio
+[tuning]: ../tuning.md
+
--- a/Documentation/op-guide/monitoring.md
+++ b/Documentation/op-guide/monitoring.md
@@ -67,7 +67,9 @@ Url:    http://localhost:9090
 Access: proxy
 ```

-Then import the default [etcd dashboard template][template] and customize; see the [demo][demo].
+Then import the default [etcd dashboard template][template] and customize. For instance, if the Prometheus data source name is `my-etcd`, the `datasource` field values in JSON also need to be `my-etcd`.
+
+See the [demo][demo].

 Sample dashboard:

--- a/Documentation/op-guide/recovery.md
+++ b/Documentation/op-guide/recovery.md
@@ -11,7 +11,7 @@ To recover from disastrous failure, etcd v3 provides snapshot and restore facili
 Recovering a cluster first needs a snapshot of the keyspace from an etcd member. A snapshot may either be taken from a live member with the `etcdctl snapshot save` command or by copying the `member/snap/db` file from an etcd data directory. For example, the following command snapshots the keyspace served by `$ENDPOINT` to the file `snapshot.db`:

 ```sh
-$ etcdctl --endpoints $ENDPOINT snapshot save snapshot.db
+$ ETCDCTL_API=3 etcdctl --endpoints $ENDPOINT snapshot save snapshot.db
 ```

 ### Restoring a cluster
@@ -23,19 +23,19 @@ Snapshot integrity may be optionally verified at restore time. If the snapshot i
 A restore initializes a new member of a new cluster, with a fresh cluster configuration using `etcd`'s cluster configuration flags, but preserves the contents of the etcd keyspace. Continuing from the previous example, the following creates new etcd data directories (`m1.etcd`, `m2.etcd`, `m3.etcd`) for a three member cluster:

 ```sh
-$ etcdctl snapshot restore snapshot.db \
+$ ETCDCTL_API=3 etcdctl snapshot restore snapshot.db \
  --name m1 \
-  --initial-cluster m1=http:/host1:2380,m2=http://host2:2380,m3=http://host3:2380 \
+  --initial-cluster m1=http://host1:2380,m2=http://host2:2380,m3=http://host3:2380 \
  --initial-cluster-token etcd-cluster-1 \
  --initial-advertise-peer-urls http://host1:2380
-$ etcdctl snapshot restore snapshot.db \
+$ ETCDCTL_API=3 etcdctl snapshot restore snapshot.db \
  --name m2 \
-  --initial-cluster m1=http:/host1:2380,m2=http://host2:2380,m3=http://host3:2380 \
+  --initial-cluster m1=http://host1:2380,m2=http://host2:2380,m3=http://host3:2380 \
  --initial-cluster-token etcd-cluster-1 \
  --initial-advertise-peer-urls http://host2:2380
-$ etcdctl snapshot restore snapshot.db \
+$ ETCDCTL_API=3 etcdctl snapshot restore snapshot.db \
  --name m3 \
-  --initial-cluster m1=http:/host1:2380,m2=http://host2:2380,m3=http://host3:2380 \
+  --initial-cluster m1=http://host1:2380,m2=http://host2:2380,m3=http://host3:2380 \
  --initial-cluster-token etcd-cluster-1 \
  --initial-advertise-peer-urls http://host3:2380
 ```
--- a/Documentation/op-guide/security.md
+++ b/Documentation/op-guide/security.md
@@ -219,6 +219,6 @@ Make sure to sign the certificates with a Subject Name the member's public IP ad
 The certificate needs to be signed for the member's FQDN in its Subject Name, use Subject Alternative Names (short IP SANs) to add the IP address. The `etcd-ca` tool provides `--domain=` option for its `new-cert` command, and openssl can make [it][alt-name] too.

 [cfssl]: https://github.com/cloudflare/cfssl
-[tls-setup]: /hack/tls-setup
+[tls-setup]: ../../hack/tls-setup
 [tls-guide]: https://github.com/coreos/docs/blob/master/os/generate-self-signed-certificates.md
 [alt-name]: http://wiki.cacert.org/FAQ/subjectAltName
--- a/Documentation/production-users.md
+++ b/Documentation/production-users.md
@@ -50,7 +50,7 @@ Radius Intelligence uses Kubernetes running CoreOS to containerize and scale int

 ## Vonage

- *Application*: system configuration for microservices, scheduling, locks (future - service discovery) 
+- *Application*: system configuration for microservices, scheduling, locks (future - service discovery)
 - *Launched*: August 2015
 - *Cluster Size*: 2 clusters of 5 members in 2 DCs, n local proxies 1-to-1 with microservice, (ssl and SRV look up)
 - *Order of Data Size*: kilobytes
@@ -60,3 +60,148 @@ Radius Intelligence uses Kubernetes running CoreOS to containerize and scale int

 [teamcity]: https://www.jetbrains.com/teamcity/
 [raoofm]:https://github.com/raoofm
+
+## Qiniu Cloud
+
+- *Application*: system configuration for microservices, distributed locks
+- *Launched*: Jan. 2016
+- *Cluster Size*: 3 members each with several clusters
+- *Order of Data Size*: kilobytes
+- *Operator*: Pandora, chenchao@qiniu.com
+- *Environment*: Baremetal
+- *Backups*: None, all data can be recreated if necessary
+
+## QingCloud
+
+- *Application*: [QingCloud][qingcloud] appcenter cluster for service discovery as [metad][metad] backend.
+- *Launched*: December 2016
+- *Cluster Size*: 1 cluster of 3 members per user.
+- *Order of Data Size*: kilobytes
+- *Operator*: [yunify][yunify]
+- *Environment*: QingCloud IaaS
+- *Backups*: None, all data can be recreated if necessary.
+
+[metad]:https://github.com/yunify/metad
+[yunify]:https://github.com/yunify
+[qingcloud]:https://qingcloud.com/
+
+
+## Yandex
+
+- *Application*: system configuration for services, service discovery
+- *Launched*: March 2016
+- *Cluster Size*: 3 clusters of 5 members
+- *Order of Data Size*: several gigabytes
+- *Operator*: Yandex; [nekto0n][nekto0n]
+- *Environment*: Bare Metal
+- *Backups*: None
+
+[nekto0n]:https://github.com/nekto0n
+
+## Tencent Games
+
+- *Application*: Meta data and configuration data for service discovery, Kubernetes, etc.
+- *Launched*: Jan. 2015
+- *Cluster Size*: 3 members each with 10s of clusters
+- *Order of Data Size*: 10s of Megabytes
+- *Operator*: Tencent Game Operations Department
+- *Environment*: Baremetal
+- *Backups*: Periodic sync to backup server
+
+In Tencent games, we use Docker and Kubernetes to deploy and run our applications, and use etcd to save meta data for service discovery, Kubernetes, etc.
+
+## Hyper.sh
+
+- *Application*: Kubernetes, distributed locks, etc.
+- *Launched*: April 2016
+- *Cluster Size*: 1 cluster of 3 members
+- *Order of Data Size*: 10s of MB
+- *Operator*: Hyper.sh
+- *Environment*: Baremetal
+- *Backups*: None, all data can be recreated if necessary.
+
+In [hyper.sh][hyper.sh], the container service is backed by [hypernetes][hypernetes], a multi-tenant kubernetes distro. Moreover, we use etcd to coordinate the multiple manage services and store global meta data.
+
+[hypernetes]:https://github.com/hyperhq/hypernetes
+[Hyper.sh]:https://www.hyper.sh
+
+## Meitu
+- *Application*: system configuration for services, service discovery, kubernetes in test environment
+- *Launched*: October 2015
+- *Cluster Size*: 1 cluster of 3 members
+- *Order of Data Size*: megabytes
+- *Operator*: Meitu, hxj@meitu.com, [shafreeck][shafreeck]
+- *Environment*: Bare Metal
+- *Backups*: None, all data can be recreated if necessary.
+
+[shafreeck]:https://github.com/shafreeck
+
+## Grab
+- *Application*: system configuration for services, service discovery
+- *Launched*: June 2016
+- *Cluster Size*: 1 cluster of 7 members
+- *Order of Data Size*: megabytes
+- *Operator*: Grab, [taxitan][taxitan], [reterVision][reterVision]
+- *Environment*: AWS
+- *Backups*: None, all data can be recreated if necessary.
+
+[taxitan]:https://github.com/taxitan
+[reterVision]:https://github.com/reterVision
+
+## DaoCloud.io
+
+- *Application*: container management
+- *Launched*: Sep. 2015
+- *Cluster Size*: 1000+ deployments, each deployment contains a 3 node cluster.
+- *Order of Data Size*: 100s of Megabytes
+- *Operator*: daocloud.io
+- *Environment*: Baremetal and virtual machines
+- *Backups*: None, all data can be recreated if necessary.
+
+In [DaoCloud][DaoCloud], we use Docker and Swarm to deploy and run our applications, and we use etcd to save metadata for service discovery.
+
+[DaoCloud]:https://www.daocloud.io
+
+## Branch.io
+
+- *Application*: Kubernetes
+- *Launched*: April 2016
+- *Cluster Size*: Multiple clusters, multiple sizes
+- *Order of Data Size*: 100s of Megabytes
+- *Operator*: branch.io
+- *Environment*: AWS, Kubernetes
+- *Backups*: EBS volume backups
+
+At [Branch][branch], we use kubernetes heavily as our core microservice platform for staging and production.
+
+[branch]: https://branch.io
+
+## Baidu Waimai
+
+- *Application*: SkyDNS, Kubernetes, UDC, CMDB and other distributed systems
+- *Launched*: April 2016
+- *Cluster Size*: 3 clusters of 5 members
+- *Order of Data Size*: several gigabytes
+- *Operator*: Baidu Waimai Operations Department
+- *Environment*: CentOS 6.5
+- *Backups*: backup scripts
+
+## Salesforce.com
+
+- *Application*: Kubernetes
+- *Launched*: Jan 2017
+- *Cluster Size*: Multiple clusters of 3 members
+- *Order of Data Size*: 100s of Megabytes
+- *Operator*: Salesforce.com (krmayankk@github)
+- *Environment*: BareMetal
+- *Backups*: None, all data can be recreated
+
+## Hosted Graphite
+
+- *Application*: Service discovery, locking, ephemeral application data
+- *Launched*: January 2017
+- *Cluster Size*: 2 clusters of 7 members
+- *Order of Data Size*: Megabytes
+- *Operator*: Hosted Graphite (sre@hostedgraphite.com)
+- *Environment*: Bare Metal
+- *Backups*: None, all data is considered ephemeral.
--- a/Documentation/reporting_bugs.md
+++ b/Documentation/reporting_bugs.md
@@ -1,6 +1,6 @@
 # Reporting bugs

-If any part of the etcd project has bugs or documentation mistakes, please let us know by [opening an issue][issue]. We treat bugs and mistakes very seriously and believe no issue is too small. Before creating a bug report, please check that an issue reporting the same problem does not already exist.
+If any part of the etcd project has bugs or documentation mistakes, please let us know by [opening an issue][etcd-issue]. We treat bugs and mistakes very seriously and believe no issue is too small. Before creating a bug report, please check that an issue reporting the same problem does not already exist.

 To make the bug report accurate and easy to understand, please try to create bug reports that are:

--- a/Documentation/upgrades/upgrade_3_0.md
+++ b/Documentation/upgrades/upgrade_3_0.md
@@ -6,27 +6,29 @@ In the general case, upgrading from etcd 2.3 to 3.0 can be a zero-downtime, roll

 Before [starting an upgrade](#upgrade-procedure), read through the rest of this guide to prepare.

-### Upgrade Checklists
+### Upgrade checklists

-#### Upgrade Requirements
+**NOTE:** When [migrating from v2 with no v3 data](https://github.com/coreos/etcd/issues/9480), etcd server v3.2+ panics when etcd restores from existing snapshots but there is no v3 `ETCD_DATA_DIR/member/snap/db` file. This happens when the server had migrated from v2 with no previous v3 data. This also prevents accidental v3 data loss (e.g. `db` file might have been moved). etcd requires that post v3 migration can only happen with v3 data. Do not upgrade to newer v3 versions until v3.0 server contains v3 data.

-To upgrade an existing etcd deployment to 3.0, the running cluster must be 2.3 or greater. If it's before 2.3, please upgrade to [2.3](https://github.com/coreos/etcd/releases/tag/v2.3.0) before upgrading to 3.0.
+#### Upgrade requirements

-Also, to ensure a smooth rolling upgrade, the running cluster must be healthy. You can check the health of the cluster by using the `etcdctl cluster-health` command.
+To upgrade an existing etcd deployment to 3.0, the running cluster must be 2.3 or greater. If it's before 2.3, please upgrade to [2.3](https://github.com/coreos/etcd/releases/tag/v2.3.8) before upgrading to 3.0.
+
+Also, to ensure a smooth rolling upgrade, the running cluster must be healthy. Check the health of the cluster by using the `etcdctl cluster-health` command before proceeding.

 #### Preparation

 Before upgrading etcd, always test the services relying on etcd in a staging environment before deploying the upgrade to the production environment.

-Before beginning,  [backup the etcd data directory](../v2/admin_guide.md#backing-up-the-datastore). Should something go wrong with the upgrade, it is possible to use this backup to [downgrade](#downgrade) back to existing etcd version.
+Before beginning, [backup the etcd data directory](../v2/admin_guide.md#backing-up-the-datastore). Should something go wrong with the upgrade, it is possible to use this backup to [downgrade](#downgrade) back to existing etcd version.

-#### Mixed Versions
+#### Mixed versions

 While upgrading, an etcd cluster supports mixed versions of etcd members, and operates with the protocol of the lowest common version. The cluster is only considered upgraded once all of its members are upgraded to version 3.0. Internally, etcd members negotiate with each other to determine the overall cluster version, which controls the reported version and the supported features.

 #### Limitations

-It might take up to 2 minutes for the newly upgraded member to catch up with the existing cluster when the total data size is larger than 50MB. Check the size of a recent  snapshot to estimate  the total data size. In other words, it is safest to wait for 2 minutes between upgrading each member.
+It might take up to 2 minutes for the newly upgraded member to catch up with the existing cluster when the total data size is larger than 50MB. Check the size of a recent snapshot to estimate the total data size. In other words, it is safest to wait for 2 minutes between upgrading each member.

 For a much larger total data size, 100MB or more , this one-time process might take even more time. Administrators of very large etcd clusters of this magnitude can feel free to contact the [etcd team][etcd-contact] before upgrading, and we’ll be happy to provide advice on the procedure.

@@ -36,13 +38,13 @@ If all members have been upgraded to v3.0, the cluster will be upgraded to v3.0,

 Please [backup the data directory](../v2/admin_guide.md#backing-up-the-datastore) of all etcd members to make downgrading the cluster possible even after it has been completely upgraded.

-### Upgrade Procedure
+### Upgrade procedure

-This example details the  upgrade of a three-member v2.3 ectd cluster running on a local machine.
+This example details the upgrade of a three-member v2.3 etcd cluster running on a local machine.

 #### 1. Check upgrade requirements.

-Is the the cluster healthy and running v.2.3.x?
+Is the cluster healthy and running v.2.3.x?

 ```
 $ etcdctl cluster-health
@@ -52,7 +54,7 @@ member 8211f1d0f64f3269 is healthy: got healthy result from http://localhost:123
 cluster is healthy

 $ curl http://localhost:2379/version
-{"etcdserver":"2.3.x","etcdcluster":"2.3.0"}
+{"etcdserver":"2.3.x","etcdcluster":"2.3.8"}
 ```

 #### 2. Stop the existing etcd process
@@ -64,7 +66,7 @@ When each etcd process is stopped, expected errors will be logged by other clust
 2016-06-27 15:21:48.624175 I | rafthttp: the connection with 8211f1d0f64f3269 became inactive
 ```

-It’s a good idea at this point to  [backup the etcd data directory](../v2/admin_guide.md#backing-up-the-datastore) to provide a downgrade path should any problems occur:
+It’s a good idea at this point to [backup the etcd data directory](../v2/admin_guide.md#backing-up-the-datastore) to provide a downgrade path should any problems occur:

 ```
 $ etcdctl backup \
@@ -102,7 +104,7 @@ Upgraded members will log warnings like the following until the entire cluster i

 #### 5. Finish

-When all members are upgraded, the cluster will report  upgrading to 3.0 successfully:
+When all members are upgraded, the cluster will report upgrading to 3.0 successfully:

 ```
 2016-06-27 15:22:19.873751 N | membership: updated the cluster version from 2.3 to 3.0
@@ -116,4 +118,14 @@ $ ETCDCTL_API=3 etcdctl endpoint health
 127.0.0.1:22379 is healthy: successfully committed proposal: took = 18.513301ms
 ```

+## Further considerations
+
+- etcdctl environment variables have been updated. If `ETCDCTL_API=2 etcdctl cluster-health` works properly but `ETCDCTL_API=3 etcdctl endpoint health` responds with `Error:  grpc: timed out when dialing`, be sure to use the [new variable names](https://github.com/coreos/etcd/tree/master/etcdctl#etcdctl).
+
+## Known Issues
+
+- etcd &lt; v3.1 does not work properly if built with Go &gt; v1.7. See [Issue 6951](https://github.com/coreos/etcd/issues/6951) for additional information.
+- If an error such as `transport: http2Client.notifyError got notified that the client transport was broken unexpected EOF.` shows up in the etcd server logs, be sure etcd is a pre-built release or built with (etcd v3.1+ &amp; go v1.7+) or (etcd &lt;v3.1 &amp; go v1.6.x).
+- Adding a v3 node to v2.3 cluster during upgrades is not supported and could trigger panics. See [Issue 7429](https://github.com/coreos/etcd/issues/7429) for additional information. Mixed versions of etcd members are only allowed during v3 migration. Finish upgrades before making any membership changes.
+
 [etcd-contact]: https://groups.google.com/forum/#!forum/etcd-dev
--- a/Documentation/upgrades/upgrade_3_1.md
+++ b/Documentation/upgrades/upgrade_3_1.md
@@ -0,0 +1,134 @@
+## Upgrade etcd from 3.0 to 3.1
+
+In the general case, upgrading from etcd 3.0 to 3.1 can be a zero-downtime, rolling upgrade:
+ - one by one, stop the etcd v3.0 processes and replace them with etcd v3.1 processes
+ - after running all v3.1 processes, new features in v3.1 are available to the cluster
+
+Before [starting an upgrade](#upgrade-procedure), read through the rest of this guide to prepare.
+
+### Upgrade checklists
+
+**NOTE:** When [migrating from v2 with no v3 data](https://github.com/coreos/etcd/issues/9480), etcd server v3.2+ panics when etcd restores from existing snapshots but there is no v3 `ETCD_DATA_DIR/member/snap/db` file. This happens when the server had migrated from v2 with no previous v3 data. This also prevents accidental v3 data loss (e.g. `db` file might have been moved). etcd requires that post v3 migration can only happen with v3 data. Do not upgrade to newer v3 versions until v3.0 server contains v3 data.
+
+#### Monitoring
+
+The following metrics from v3.0.x have been deprecated in favor of [go-grpc-prometheus](https://github.com/grpc-ecosystem/go-grpc-prometheus):
+
+- `etcd_grpc_requests_total`
+- `etcd_grpc_requests_failed_total`
+- `etcd_grpc_active_streams`
+- `etcd_grpc_unary_requests_duration_seconds`
+
+#### Upgrade requirements
+
+To upgrade an existing etcd deployment to 3.1, the running cluster must be 3.0 or greater. If it's before 3.0, please [upgrade to 3.0](upgrade_3_0.md) before upgrading to 3.1.
+
+Also, to ensure a smooth rolling upgrade, the running cluster must be healthy. Check the health of the cluster by using the `etcdctl endpoint health` command before proceeding.
+
+#### Preparation
+
+Before upgrading etcd, always test the services relying on etcd in a staging environment before deploying the upgrade to the production environment.
+
+Before beginning, [backup the etcd data](../op-guide/maintenance.md#snapshot-backup). Should something go wrong with the upgrade, it is possible to use this backup to [downgrade](#downgrade) back to existing etcd version. Please note that the `snapshot` command only backs up the v3 data. For v2 data, see [backing up v2 datastore](../v2/admin_guide.md#backing-up-the-datastore).
+
+#### Mixed versions
+
+While upgrading, an etcd cluster supports mixed versions of etcd members, and operates with the protocol of the lowest common version. The cluster is only considered upgraded once all of its members are upgraded to version 3.1. Internally, etcd members negotiate with each other to determine the overall cluster version, which controls the reported version and the supported features.
+
+#### Limitations
+
+Note: If the cluster only has v3 data and no v2 data, it is not subject to this limitation.
+
+If the cluster is serving a v2 data set larger than 50MB, each newly upgraded member may take up to two minutes to catch up with the existing cluster. Check the size of a recent snapshot to estimate the total data size. In other words, it is safest to wait for 2 minutes between upgrading each member.
+
+For a much larger total data size, 100MB or more, this one-time process might take even more time. Administrators of very large etcd clusters of this magnitude can feel free to contact the [etcd team][etcd-contact] before upgrading, and we'll be happy to provide advice on the procedure.
+
+#### Downgrade
+
+If all members have been upgraded to v3.1, the cluster will be upgraded to v3.1, and downgrade from this completed state is **not possible**. If any single member is still v3.0, however, the cluster and its operations remain "v3.0", and it is possible from this mixed cluster state to return to using a v3.0 etcd binary on all members.
+
+Please [backup the data directory](../op-guide/maintenance.md#snapshot-backup) of all etcd members to make downgrading the cluster possible even after it has been completely upgraded.
+
+### Upgrade procedure
+
+This example shows how to upgrade a 3-member v3.0 etcd cluster running on a local machine.
+
+#### 1. Check upgrade requirements
+
+Is the cluster healthy and running v3.0.x?
+
+```
+$ ETCDCTL_API=3 etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:2379 is healthy: successfully committed proposal: took = 6.600684ms
+localhost:22379 is healthy: successfully committed proposal: took = 8.540064ms
+localhost:32379 is healthy: successfully committed proposal: took = 8.763432ms
+
+$ curl http://localhost:2379/version
+{"etcdserver":"3.0.16","etcdcluster":"3.0.0"}
+```
+
+#### 2. Stop the existing etcd process
+
+When each etcd process is stopped, expected errors will be logged by other cluster members. This is normal since a cluster member connection has been (temporarily) broken:
+
+```
+2017-01-17 09:34:18.352662 I | raft: raft.node: 1640829d9eea5cfb elected leader 1640829d9eea5cfb at term 5
+2017-01-17 09:34:18.359630 W | etcdserver: failed to reach the peerURL(http://localhost:2380) of member fd32987dcd0511e0 (Get http://localhost:2380/version: dial tcp 127.0.0.1:2380: getsockopt: connection refused)
+2017-01-17 09:34:18.359679 W | etcdserver: cannot get the version of member fd32987dcd0511e0 (Get http://localhost:2380/version: dial tcp 127.0.0.1:2380: getsockopt: connection refused)
+2017-01-17 09:34:18.548116 W | rafthttp: lost the TCP streaming connection with peer fd32987dcd0511e0 (stream Message writer)
+2017-01-17 09:34:19.147816 W | rafthttp: lost the TCP streaming connection with peer fd32987dcd0511e0 (stream MsgApp v2 writer)
+2017-01-17 09:34:34.364907 W | etcdserver: failed to reach the peerURL(http://localhost:2380) of member fd32987dcd0511e0 (Get http://localhost:2380/version: dial tcp 127.0.0.1:2380: getsockopt: connection refused)
+```
+
+It's a good idea at this point to [backup the etcd data](../op-guide/maintenance.md#snapshot-backup) to provide a downgrade path should any problems occur:
+
+```
+$ etcdctl snapshot save backup.db
+```
+
+#### 3. Drop-in etcd v3.1 binary and start the new etcd process
+
+The new v3.1 etcd will publish its information to the cluster:
+
+```
+2017-01-17 09:36:00.996590 I | etcdserver: published {Name:my-etcd-1 ClientURLs:[http://localhost:2379]} to cluster 46bc3ce73049e678
+```
+
+Verify that each member, and then the entire cluster, becomes healthy with the new v3.1 etcd binary:
+
+```
+$ ETCDCTL_API=3 /etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:22379 is healthy: successfully committed proposal: took = 5.540129ms
+localhost:32379 is healthy: successfully committed proposal: took = 7.321671ms
+localhost:2379 is healthy: successfully committed proposal: took = 10.629901ms
+```
+
+Upgraded members will log warnings like the following until the entire cluster is upgraded. This is expected and will cease after all etcd cluster members are upgraded to v3.1:
+
+```
+2017-01-17 09:36:38.406268 W | etcdserver: the local etcd version 3.0.16 is not up-to-date
+2017-01-17 09:36:38.406295 W | etcdserver: member fd32987dcd0511e0 has a higher version 3.1.0
+2017-01-17 09:36:42.407695 W | etcdserver: the local etcd version 3.0.16 is not up-to-date
+2017-01-17 09:36:42.407730 W | etcdserver: member fd32987dcd0511e0 has a higher version 3.1.0
+```
+
+#### 4. Repeat step 2 to step 3 for all other members
+
+#### 5. Finish
+
+When all members are upgraded, the cluster will report upgrading to 3.1 successfully:
+
+```
+2017-01-17 09:37:03.100015 I | etcdserver: updating the cluster version from 3.0 to 3.1
+2017-01-17 09:37:03.104263 N | etcdserver/membership: updated the cluster version from 3.0 to 3.1
+2017-01-17 09:37:03.104374 I | etcdserver/api: enabled capabilities for version 3.1
+```
+
+```
+$ ETCDCTL_API=3 /etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:2379 is healthy: successfully committed proposal: took = 2.312897ms
+localhost:22379 is healthy: successfully committed proposal: took = 2.553476ms
+localhost:32379 is healthy: successfully committed proposal: took = 2.516902ms
+```
+
+[etcd-contact]: https://groups.google.com/forum/#!forum/etcd-dev
--- a/Documentation/upgrades/upgrade_3_2.md
+++ b/Documentation/upgrades/upgrade_3_2.md
@@ -0,0 +1,338 @@
+## Upgrade etcd from 3.1 to 3.2
+
+In the general case, upgrading from etcd 3.1 to 3.2 can be a zero-downtime, rolling upgrade:
+ - one by one, stop the etcd v3.1 processes and replace them with etcd v3.2 processes
+ - after running all v3.2 processes, new features in v3.2 are available to the cluster
+
+Before [starting an upgrade](#upgrade-procedure), read through the rest of this guide to prepare.
+
+### Upgrade checklists
+
+**NOTE:** When [migrating from v2 with no v3 data](https://github.com/coreos/etcd/issues/9480), etcd server v3.2+ panics when etcd restores from existing snapshots but there is no v3 `ETCD_DATA_DIR/member/snap/db` file. This happens when the server had migrated from v2 with no previous v3 data. This also prevents accidental v3 data loss (e.g. `db` file might have been moved). etcd requires that post v3 migration can only happen with v3 data. Do not upgrade to newer v3 versions until v3.0 server contains v3 data.
+
+Highlighted breaking changes in 3.2.
+
+#### Change in default `snapshot-count` value
+
+The default value of `--snapshot-count` has [changed from 10,000 to 100,000](https://github.com/coreos/etcd/pull/7160). Higher snapshot count means it holds Raft entries in memory for longer before discarding old entries. It is a trade-off between less frequent snapshotting and [higher memory usage](https://github.com/kubernetes/kubernetes/issues/60589#issuecomment-371977156). Higher `--snapshot-count` will be manifested with higher memory usage, while retaining more Raft entries helps with the availabilities of slow followers: leader is still able to replicate its logs to followers, rather than forcing followers to rebuild their stores from leader snapshots.
+
+#### Change in gRPC dependency (>=3.2.10)
+
+3.2.10 or later now requires [grpc/grpc-go](https://github.com/grpc/grpc-go/releases) `v1.7.5` (<=3.2.9 requires `v1.2.1`).
+
+##### Deprecate `grpclog.Logger`
+
+`grpclog.Logger` has been deprecated in favor of [`grpclog.LoggerV2`](https://github.com/grpc/grpc-go/blob/master/grpclog/loggerv2.go). `clientv3.Logger` is now `grpclog.LoggerV2`.
+
+Before
+
+```go
+import "github.com/coreos/etcd/clientv3"
+clientv3.SetLogger(log.New(os.Stderr, "grpc: ", 0))
+```
+
+After
+
+```go
+import "github.com/coreos/etcd/clientv3"
+import "google.golang.org/grpc/grpclog"
+clientv3.SetLogger(grpclog.NewLoggerV2(os.Stderr, os.Stderr, os.Stderr))
+
+// log.New above cannot be used (not implement grpclog.LoggerV2 interface)
+```
+
+##### Deprecate `grpc.ErrClientConnTimeout`
+
+Previously, `grpc.ErrClientConnTimeout` error was returned on client dial time-outs. 3.2 instead returns `context.DeadlineExceeded` (see [#8504](https://github.com/coreos/etcd/issues/8504)).
+
+Before
+
+```go
+// expect dial time-out on ipv4 blackhole
+_, err := clientv3.New(clientv3.Config{
+    Endpoints:   []string{"http://254.0.0.1:12345"},
+    DialTimeout: 2 * time.Second
+})
+if err == grpc.ErrClientConnTimeout {
+	// handle errors
+}
+```
+
+After
+
+```go
+_, err := clientv3.New(clientv3.Config{
+    Endpoints:   []string{"http://254.0.0.1:12345"},
+    DialTimeout: 2 * time.Second
+})
+if err == context.DeadlineExceeded {
+	// handle errors
+}
+```
+
+#### Change in maximum request size limits (>=3.2.10)
+
+3.2.10 and 3.2.11 allow custom request size limits on the server side. >=3.2.12 allows custom request size limits for both server and **client side**. In previous versions (v3.2.10, v3.2.11), client response size was limited to only 4 MiB.
+
+Server-side request limits can be configured with `--max-request-bytes` flag:
+
+```bash
+# limits request size to 1.5 KiB
+etcd --max-request-bytes 1536
+
+# client writes exceeding 1.5 KiB will be rejected
+etcdctl put foo [LARGE VALUE...]
+# etcdserver: request is too large
+```
+
+Or configure `embed.Config.MaxRequestBytes` field:
+
+```go
+import "github.com/coreos/etcd/embed"
+import "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
+
+// limit requests to 5 MiB
+cfg := embed.NewConfig()
+cfg.MaxRequestBytes = 5 * 1024 * 1024
+
+// client writes exceeding 5 MiB will be rejected
+_, err := cli.Put(ctx, "foo", [LARGE VALUE...])
+err == rpctypes.ErrRequestTooLarge
+```
+
+**If not specified, server-side limit defaults to 1.5 MiB**.
+
+Client-side request limits must be configured based on server-side limits.
+
+```bash
+# limits request size to 1 MiB
+etcd --max-request-bytes 1048576
+```
+
+```go
+import "github.com/coreos/etcd/clientv3"
+
+cli, _ := clientv3.New(clientv3.Config{
+    Endpoints: []string{"127.0.0.1:2379"},
+    MaxCallSendMsgSize: 2 * 1024 * 1024,
+    MaxCallRecvMsgSize: 3 * 1024 * 1024,
+})
+
+
+// client writes exceeding "--max-request-bytes" will be rejected from etcd server
+_, err := cli.Put(ctx, "foo", strings.Repeat("a", 1*1024*1024+5))
+err == rpctypes.ErrRequestTooLarge
+
+
+// client writes exceeding "MaxCallSendMsgSize" will be rejected from client-side
+_, err = cli.Put(ctx, "foo", strings.Repeat("a", 5*1024*1024))
+err.Error() == "rpc error: code = ResourceExhausted desc = grpc: trying to send message larger than max (5242890 vs. 2097152)"
+
+
+// some writes under limits
+for i := range []int{0,1,2,3,4} {
+    _, err = cli.Put(ctx, fmt.Sprintf("foo%d", i), strings.Repeat("a", 1*1024*1024-500))
+    if err != nil {
+        panic(err)
+    }
+}
+// client reads exceeding "MaxCallRecvMsgSize" will be rejected from client-side
+_, err = cli.Get(ctx, "foo", clientv3.WithPrefix())
+err.Error() == "rpc error: code = ResourceExhausted desc = grpc: received message larger than max (5240509 vs. 3145728)"
+```
+
+**If not specified, client-side send limit defaults to 2 MiB (1.5 MiB + gRPC overhead bytes) and receive limit to `math.MaxInt32`**. Please see [clientv3 godoc](https://godoc.org/github.com/coreos/etcd/clientv3#Config) for more detail.
+
+#### Change in raw gRPC client wrappers
+
+3.2.12 or later changes the function signatures of `clientv3` gRPC client wrapper. This change was needed to support [custom `grpc.CallOption` on message size limits](https://github.com/coreos/etcd/pull/9047).
+
+Before and after
+
+```diff
+-func NewKVFromKVClient(remote pb.KVClient) KV {
+func NewKVFromKVClient(remote pb.KVClient, c *Client) KV {
+
+-func NewClusterFromClusterClient(remote pb.ClusterClient) Cluster {
+func NewClusterFromClusterClient(remote pb.ClusterClient, c *Client) Cluster {
+
+-func NewLeaseFromLeaseClient(remote pb.LeaseClient, keepAliveTimeout time.Duration) Lease {
+func NewLeaseFromLeaseClient(remote pb.LeaseClient, c *Client, keepAliveTimeout time.Duration) Lease {
+
+-func NewMaintenanceFromMaintenanceClient(remote pb.MaintenanceClient) Maintenance {
+func NewMaintenanceFromMaintenanceClient(remote pb.MaintenanceClient, c *Client) Maintenance {
+
+-func NewWatchFromWatchClient(wc pb.WatchClient) Watcher {
+func NewWatchFromWatchClient(wc pb.WatchClient, c *Client) Watcher {
+```
+
+#### Change in `clientv3.Lease.TimeToLive` API
+
+Previously, `clientv3.Lease.TimeToLive` API returned `lease.ErrLeaseNotFound` on non-existent lease ID. 3.2 instead returns TTL=-1 in its response and no error (see [#7305](https://github.com/coreos/etcd/pull/7305)).
+
+Before
+
+```go
+// when leaseID does not exist
+resp, err := TimeToLive(ctx, leaseID)
+resp == nil
+err == lease.ErrLeaseNotFound
+```
+
+After
+
+```go
+// when leaseID does not exist
+resp, err := TimeToLive(ctx, leaseID)
+resp.TTL == -1
+err == nil
+```
+
+#### Change in `clientv3.NewFromConfigFile`
+
+`clientv3.NewFromConfigFile` is moved to `yaml.NewConfig`.
+
+Before
+
+```go
+import "github.com/coreos/etcd/clientv3"
+clientv3.NewFromConfigFile
+```
+
+After
+
+```go
+import clientv3yaml "github.com/coreos/etcd/clientv3/yaml"
+clientv3yaml.NewConfig
+```
+
+#### Change in `--listen-peer-urls` and `--listen-client-urls`
+
+3.2 now rejects domain names for `--listen-peer-urls` and `--listen-client-urls` (3.1 only prints out warnings), since a domain name is invalid for network interface binding. Make sure that those URLs are properly formatted as `scheme://IP:port`.
+
+See [issue #6336](https://github.com/coreos/etcd/issues/6336) for more context.
+
+### Server upgrade checklists
+
+#### Upgrade requirements
+
+To upgrade an existing etcd deployment to 3.2, the running cluster must be 3.1 or greater. If it's before 3.1, please [upgrade to 3.1](upgrade_3_1.md) before upgrading to 3.2.
+
+Also, to ensure a smooth rolling upgrade, the running cluster must be healthy. Check the health of the cluster by using the `etcdctl endpoint health` command before proceeding.
+
+#### Preparation
+
+Before upgrading etcd, always test the services relying on etcd in a staging environment before deploying the upgrade to the production environment.
+
+Before beginning, [backup the etcd data](../op-guide/maintenance.md#snapshot-backup). Should something go wrong with the upgrade, it is possible to use this backup to [downgrade](#downgrade) back to existing etcd version. Please note that the `snapshot` command only backs up the v3 data. For v2 data, see [backing up v2 datastore](../v2/admin_guide.md#backing-up-the-datastore).
+
+#### Mixed versions
+
+While upgrading, an etcd cluster supports mixed versions of etcd members, and operates with the protocol of the lowest common version. The cluster is only considered upgraded once all of its members are upgraded to version 3.2. Internally, etcd members negotiate with each other to determine the overall cluster version, which controls the reported version and the supported features.
+
+#### Limitations
+
+Note: If the cluster only has v3 data and no v2 data, it is not subject to this limitation.
+
+If the cluster is serving a v2 data set larger than 50MB, each newly upgraded member may take up to two minutes to catch up with the existing cluster. Check the size of a recent snapshot to estimate the total data size. In other words, it is safest to wait for 2 minutes between upgrading each member.
+
+For a much larger total data size, 100MB or more, this one-time process might take even more time. Administrators of very large etcd clusters of this magnitude can feel free to contact the [etcd team][etcd-contact] before upgrading, and we'll be happy to provide advice on the procedure.
+
+#### Downgrade
+
+If all members have been upgraded to v3.2, the cluster will be upgraded to v3.2, and downgrade from this completed state is **not possible**. If any single member is still v3.1, however, the cluster and its operations remain "v3.1", and it is possible from this mixed cluster state to return to using a v3.1 etcd binary on all members.
+
+Please [backup the data directory](../op-guide/maintenance.md#snapshot-backup) of all etcd members to make downgrading the cluster possible even after it has been completely upgraded.
+
+### Upgrade procedure
+
+This example shows how to upgrade a 3-member v3.1 etcd cluster running on a local machine.
+
+#### 1. Check upgrade requirements
+
+Is the cluster healthy and running v3.1.x?
+
+```
+$ ETCDCTL_API=3 etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:2379 is healthy: successfully committed proposal: took = 6.600684ms
+localhost:22379 is healthy: successfully committed proposal: took = 8.540064ms
+localhost:32379 is healthy: successfully committed proposal: took = 8.763432ms
+
+$ curl http://localhost:2379/version
+{"etcdserver":"3.1.7","etcdcluster":"3.1.0"}
+```
+
+#### 2. Stop the existing etcd process
+
+When each etcd process is stopped, expected errors will be logged by other cluster members. This is normal since a cluster member connection has been (temporarily) broken:
+
+```
+2017-04-27 14:13:31.491746 I | raft: c89feb932daef420 [term 3] received MsgTimeoutNow from 6d4f535bae3ab960 and starts an election to get leadership.
+2017-04-27 14:13:31.491769 I | raft: c89feb932daef420 became candidate at term 4
+2017-04-27 14:13:31.491788 I | raft: c89feb932daef420 received MsgVoteResp from c89feb932daef420 at term 4
+2017-04-27 14:13:31.491797 I | raft: c89feb932daef420 [logterm: 3, index: 9] sent MsgVote request to 6d4f535bae3ab960 at term 4
+2017-04-27 14:13:31.491805 I | raft: c89feb932daef420 [logterm: 3, index: 9] sent MsgVote request to 9eda174c7df8a033 at term 4
+2017-04-27 14:13:31.491815 I | raft: raft.node: c89feb932daef420 lost leader 6d4f535bae3ab960 at term 4
+2017-04-27 14:13:31.524084 I | raft: c89feb932daef420 received MsgVoteResp from 6d4f535bae3ab960 at term 4
+2017-04-27 14:13:31.524108 I | raft: c89feb932daef420 [quorum:2] has received 2 MsgVoteResp votes and 0 vote rejections
+2017-04-27 14:13:31.524123 I | raft: c89feb932daef420 became leader at term 4
+2017-04-27 14:13:31.524136 I | raft: raft.node: c89feb932daef420 elected leader c89feb932daef420 at term 4
+2017-04-27 14:13:31.592650 W | rafthttp: lost the TCP streaming connection with peer 6d4f535bae3ab960 (stream MsgApp v2 reader)
+2017-04-27 14:13:31.592825 W | rafthttp: lost the TCP streaming connection with peer 6d4f535bae3ab960 (stream Message reader)
+2017-04-27 14:13:31.693275 E | rafthttp: failed to dial 6d4f535bae3ab960 on stream Message (dial tcp [::1]:2380: getsockopt: connection refused)
+2017-04-27 14:13:31.693289 I | rafthttp: peer 6d4f535bae3ab960 became inactive
+2017-04-27 14:13:31.936678 W | rafthttp: lost the TCP streaming connection with peer 6d4f535bae3ab960 (stream Message writer)
+```
+
+It's a good idea at this point to [backup the etcd data](../op-guide/maintenance.md#snapshot-backup) to provide a downgrade path should any problems occur:
+
+```
+$ etcdctl snapshot save backup.db
+```
+
+#### 3. Drop-in etcd v3.2 binary and start the new etcd process
+
+The new v3.2 etcd will publish its information to the cluster:
+
+```
+2017-04-27 14:14:25.363225 I | etcdserver: published {Name:s1 ClientURLs:[http://localhost:2379]} to cluster a9ededbffcb1b1f1
+```
+
+Verify that each member, and then the entire cluster, becomes healthy with the new v3.2 etcd binary:
+
+```
+$ ETCDCTL_API=3 /etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:22379 is healthy: successfully committed proposal: took = 5.540129ms
+localhost:32379 is healthy: successfully committed proposal: took = 7.321771ms
+localhost:2379 is healthy: successfully committed proposal: took = 10.629901ms
+```
+
+Upgraded members will log warnings like the following until the entire cluster is upgraded. This is expected and will cease after all etcd cluster members are upgraded to v3.2:
+
+```
+2017-04-27 14:15:17.071804 W | etcdserver: member c89feb932daef420 has a higher version 3.2.0
+2017-04-27 14:15:21.073110 W | etcdserver: the local etcd version 3.1.7 is not up-to-date
+2017-04-27 14:15:21.073142 W | etcdserver: member 6d4f535bae3ab960 has a higher version 3.2.0
+2017-04-27 14:15:21.073157 W | etcdserver: the local etcd version 3.1.7 is not up-to-date
+2017-04-27 14:15:21.073164 W | etcdserver: member c89feb932daef420 has a higher version 3.2.0
+```
+
+#### 4. Repeat step 2 to step 3 for all other members
+
+#### 5. Finish
+
+When all members are upgraded, the cluster will report upgrading to 3.2 successfully:
+
+```
+2017-04-27 14:15:54.536901 N | etcdserver/membership: updated the cluster version from 3.1 to 3.2
+2017-04-27 14:15:54.537035 I | etcdserver/api: enabled capabilities for version 3.2
+```
+
+```
+$ ETCDCTL_API=3 /etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:2379 is healthy: successfully committed proposal: took = 2.312897ms
+localhost:22379 is healthy: successfully committed proposal: took = 2.553476ms
+localhost:32379 is healthy: successfully committed proposal: took = 2.517902ms
+```
+
+[etcd-contact]: https://groups.google.com/forum/#!forum/etcd-dev
--- a/Documentation/upgrades/upgrade_3_3.md
+++ b/Documentation/upgrades/upgrade_3_3.md
@@ -0,0 +1,476 @@
+## Upgrade etcd from 3.2 to 3.3
+
+In the general case, upgrading from etcd 3.2 to 3.3 can be a zero-downtime, rolling upgrade:
+ - one by one, stop the etcd v3.2 processes and replace them with etcd v3.3 processes
+ - after running all v3.3 processes, new features in v3.3 are available to the cluster
+
+Before [starting an upgrade](#upgrade-procedure), read through the rest of this guide to prepare.
+
+### Upgrade checklists
+
+**NOTE:** When [migrating from v2 with no v3 data](https://github.com/coreos/etcd/issues/9480), etcd server v3.2+ panics when etcd restores from existing snapshots but there is no v3 `ETCD_DATA_DIR/member/snap/db` file. This happens when the server had migrated from v2 with no previous v3 data. This also prevents accidental v3 data loss (e.g. `db` file might have been moved). etcd requires that post v3 migration can only happen with v3 data. Do not upgrade to newer v3 versions until v3.0 server contains v3 data.
+
+Highlighted breaking changes in 3.3.
+
+#### Change in `etcdserver.EtcdServer` struct
+
+`etcdserver.EtcdServer` has changed the type of its member field `*etcdserver.ServerConfig` to `etcdserver.ServerConfig`. And `etcdserver.NewServer` now takes `etcdserver.ServerConfig`, instead of `*etcdserver.ServerConfig`.
+
+Before and after (e.g. [k8s.io/kubernetes/test/e2e_node/services/etcd.go](https://github.com/kubernetes/kubernetes/blob/release-1.8/test/e2e_node/services/etcd.go#L50-L55))
+
+```diff
+import "github.com/coreos/etcd/etcdserver"
+
+type EtcdServer struct {
+	*etcdserver.EtcdServer
+-	config *etcdserver.ServerConfig
+	config etcdserver.ServerConfig
+}
+
+func NewEtcd(dataDir string) *EtcdServer {
+-	config := &etcdserver.ServerConfig{
+	config := etcdserver.ServerConfig{
+		DataDir: dataDir,
+        ...
+	}
+	return &EtcdServer{config: config}
+}
+
+func (e *EtcdServer) Start() error {
+	var err error
+	e.EtcdServer, err = etcdserver.NewServer(e.config)
+    ...
+```
+
+#### Change in `embed.EtcdServer` struct
+
+Field `LogOutput` is added to `embed.Config`:
+
+```diff
+package embed
+
+type Config struct {
+ 	Debug bool `json:"debug"`
+ 	LogPkgLevels string `json:"log-package-levels"`
+	LogOutput string `json:"log-output"`
+ 	...
+```
+
+Previously, gRPC server warnings were logged in etcdserver.
+
+```
+WARNING: 2017/11/02 11:35:51 grpc: addrConn.resetTransport failed to create client transport: connection error: desc = "transport: Error while dialing dial tcp: operation was canceled"; Reconnecting to {localhost:2379 <nil>}
+WARNING: 2017/11/02 11:35:51 grpc: addrConn.resetTransport failed to create client transport: connection error: desc = "transport: Error while dialing dial tcp: operation was canceled"; Reconnecting to {localhost:2379 <nil>}
+```
+
+From v3.3, gRPC server logs are disabled by default.
+
+```go
+import "github.com/coreos/etcd/embed"
+
+cfg := &embed.Config{Debug: false}
+cfg.SetupLogging()
+```
+
+Set `embed.Config.Debug` field to `true` to enable gRPC server logs.
+
+#### Change in `/health` endpoint response
+
+Previously, `[endpoint]:[client-port]/health` returned manually marshaled JSON value. 3.3 now defines [`etcdhttp.Health`](https://godoc.org/github.com/coreos/etcd/etcdserver/api/etcdhttp#Health) struct.
+
+Note that in v3.3.0-rc.0, v3.3.0-rc.1, and v3.3.0-rc.2, `etcdhttp.Health` has boolean type `"health"` and `"errors"` fields. For backward compatibility, we reverted `"health"` field to `string` type and removed `"errors"` field. Further health information will be provided in separate APIs.
+
+```bash
+$ curl http://localhost:2379/health
+{"health":"true"}
+```
+
+#### Change in gRPC gateway HTTP endpoints (replaced `/v3alpha` with `/v3beta`)
+
+Before
+
+```bash
+curl -L http://localhost:2379/v3alpha/kv/put \
+	-X POST -d '{"key": "Zm9v", "value": "YmFy"}'
+```
+
+After
+
+```bash
+curl -L http://localhost:2379/v3beta/kv/put \
+	-X POST -d '{"key": "Zm9v", "value": "YmFy"}'
+```
+
+Requests to `/v3alpha` endpoints will redirect to `/v3beta`, and `/v3alpha` will be removed in 3.4 release.
+
+#### Change in maximum request size limits
+
+3.3 now allows custom request size limits for both server and **client side**. In previous versions (v3.2.10, v3.2.11), client response size was limited to only 4 MiB.
+
+Server-side request limits can be configured with `--max-request-bytes` flag:
+
+```bash
+# limits request size to 1.5 KiB
+etcd --max-request-bytes 1536
+
+# client writes exceeding 1.5 KiB will be rejected
+etcdctl put foo [LARGE VALUE...]
+# etcdserver: request is too large
+```
+
+Or configure `embed.Config.MaxRequestBytes` field:
+
+```go
+import "github.com/coreos/etcd/embed"
+import "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
+
+// limit requests to 5 MiB
+cfg := embed.NewConfig()
+cfg.MaxRequestBytes = 5 * 1024 * 1024
+
+// client writes exceeding 5 MiB will be rejected
+_, err := cli.Put(ctx, "foo", [LARGE VALUE...])
+err == rpctypes.ErrRequestTooLarge
+```
+
+**If not specified, server-side limit defaults to 1.5 MiB**.
+
+Client-side request limits must be configured based on server-side limits.
+
+```bash
+# limits request size to 1 MiB
+etcd --max-request-bytes 1048576
+```
+
+```go
+import "github.com/coreos/etcd/clientv3"
+
+cli, _ := clientv3.New(clientv3.Config{
+    Endpoints: []string{"127.0.0.1:2379"},
+    MaxCallSendMsgSize: 2 * 1024 * 1024,
+    MaxCallRecvMsgSize: 3 * 1024 * 1024,
+})
+
+
+// client writes exceeding "--max-request-bytes" will be rejected from etcd server
+_, err := cli.Put(ctx, "foo", strings.Repeat("a", 1*1024*1024+5))
+err == rpctypes.ErrRequestTooLarge
+
+
+// client writes exceeding "MaxCallSendMsgSize" will be rejected from client-side
+_, err = cli.Put(ctx, "foo", strings.Repeat("a", 5*1024*1024))
+err.Error() == "rpc error: code = ResourceExhausted desc = grpc: trying to send message larger than max (5242890 vs. 2097152)"
+
+
+// some writes under limits
+for i := range []int{0,1,2,3,4} {
+    _, err = cli.Put(ctx, fmt.Sprintf("foo%d", i), strings.Repeat("a", 1*1024*1024-500))
+    if err != nil {
+        panic(err)
+    }
+}
+// client reads exceeding "MaxCallRecvMsgSize" will be rejected from client-side
+_, err = cli.Get(ctx, "foo", clientv3.WithPrefix())
+err.Error() == "rpc error: code = ResourceExhausted desc = grpc: received message larger than max (5240509 vs. 3145728)"
+```
+
+**If not specified, client-side send limit defaults to 2 MiB (1.5 MiB + gRPC overhead bytes) and receive limit to `math.MaxInt32`**. Please see [clientv3 godoc](https://godoc.org/github.com/coreos/etcd/clientv3#Config) for more detail.
+
+#### Change in raw gRPC client wrappers
+
+3.3 changes the function signatures of `clientv3` gRPC client wrapper. This change was needed to support [custom `grpc.CallOption` on message size limits](https://github.com/coreos/etcd/pull/9047).
+
+Before and after
+
+```diff
+-func NewKVFromKVClient(remote pb.KVClient) KV {
+func NewKVFromKVClient(remote pb.KVClient, c *Client) KV {
+
+-func NewClusterFromClusterClient(remote pb.ClusterClient) Cluster {
+func NewClusterFromClusterClient(remote pb.ClusterClient, c *Client) Cluster {
+
+-func NewLeaseFromLeaseClient(remote pb.LeaseClient, keepAliveTimeout time.Duration) Lease {
+func NewLeaseFromLeaseClient(remote pb.LeaseClient, c *Client, keepAliveTimeout time.Duration) Lease {
+
+-func NewMaintenanceFromMaintenanceClient(remote pb.MaintenanceClient) Maintenance {
+func NewMaintenanceFromMaintenanceClient(remote pb.MaintenanceClient, c *Client) Maintenance {
+
+-func NewWatchFromWatchClient(wc pb.WatchClient) Watcher {
+func NewWatchFromWatchClient(wc pb.WatchClient, c *Client) Watcher {
+```
+
+#### Change in clientv3 `Snapshot` API error type
+
+Previously, clientv3 `Snapshot` API returned raw [`grpc/*status.statusError`] type error. v3.3 now translates those errors to corresponding public error types, to be consistent with other APIs.
+
+Before
+
+```go
+import "context"
+
+// reading snapshot with canceled context should error out
+ctx, cancel := context.WithCancel(context.Background())
+rc, _ := cli.Snapshot(ctx)
+cancel()
+_, err := io.Copy(f, rc)
+err.Error() == "rpc error: code = Canceled desc = context canceled"
+
+// reading snapshot with deadline exceeded should error out
+ctx, cancel = context.WithTimeout(context.Background(), time.Second)
+defer cancel()
+rc, _ = cli.Snapshot(ctx)
+time.Sleep(2 * time.Second)
+_, err = io.Copy(f, rc)
+err.Error() == "rpc error: code = DeadlineExceeded desc = context deadline exceeded"
+```
+
+After
+
+```go
+import "context"
+
+// reading snapshot with canceled context should error out
+ctx, cancel := context.WithCancel(context.Background())
+rc, _ := cli.Snapshot(ctx)
+cancel()
+_, err := io.Copy(f, rc)
+err == context.Canceled
+
+// reading snapshot with deadline exceeded should error out
+ctx, cancel = context.WithTimeout(context.Background(), time.Second)
+defer cancel()
+rc, _ = cli.Snapshot(ctx)
+time.Sleep(2 * time.Second)
+_, err = io.Copy(f, rc)
+err == context.DeadlineExceeded
+```
+
+#### Change in `etcdctl lease timetolive` command output
+
+Previously, the `lease timetolive LEASE_ID` command on an expired lease printed `-1s` for the remaining seconds. 3.3 now outputs a clearer message.
+
+Before
+
+
+```bash
+lease 2d8257079fa1bc0c granted with TTL(0s), remaining(-1s)
+```
+
+After
+
+```bash
+lease 2d8257079fa1bc0c already expired
+```
+
+#### Change in `golang.org/x/net/context` imports
+
+`clientv3` has deprecated `golang.org/x/net/context`. If a project vendors `golang.org/x/net/context` in other code (e.g. etcd generated protocol buffer code) and imports `github.com/coreos/etcd/clientv3`, it requires Go 1.9+ to compile.
+
+Before
+
+```go
+import "golang.org/x/net/context"
+cli.Put(context.Background(), "f", "v")
+```
+
+After
+
+```go
+import "context"
+cli.Put(context.Background(), "f", "v")
+```
+
+#### Change in gRPC dependency
+
+3.3 now requires [grpc/grpc-go](https://github.com/grpc/grpc-go/releases) `v1.7.5`.
+
+##### Deprecate `grpclog.Logger`
+
+`grpclog.Logger` has been deprecated in favor of [`grpclog.LoggerV2`](https://github.com/grpc/grpc-go/blob/master/grpclog/loggerv2.go). `clientv3.Logger` is now `grpclog.LoggerV2`.
+
+Before
+
+```go
+import "github.com/coreos/etcd/clientv3"
+clientv3.SetLogger(log.New(os.Stderr, "grpc: ", 0))
+```
+
+After
+
+```go
+import "github.com/coreos/etcd/clientv3"
+import "google.golang.org/grpc/grpclog"
+clientv3.SetLogger(grpclog.NewLoggerV2(os.Stderr, os.Stderr, os.Stderr))
+
+// log.New above cannot be used (it does not implement the grpclog.LoggerV2 interface)
+```
+
+##### Deprecate `grpc.ErrClientConnTimeout`
+
+Previously, the `grpc.ErrClientConnTimeout` error was returned on client dial time-outs. 3.3 instead returns `context.DeadlineExceeded` (see [#8504](https://github.com/coreos/etcd/issues/8504)).
+
+Before
+
+```go
+// expect dial time-out on ipv4 blackhole
+_, err := clientv3.New(clientv3.Config{
+    Endpoints:   []string{"http://254.0.0.1:12345"},
+    DialTimeout: 2 * time.Second,
+})
+if err == grpc.ErrClientConnTimeout {
+	// handle errors
+}
+```
+
+After
+
+```go
+_, err := clientv3.New(clientv3.Config{
+    Endpoints:   []string{"http://254.0.0.1:12345"},
+    DialTimeout: 2 * time.Second,
+})
+if err == context.DeadlineExceeded {
+	// handle errors
+}
+```
+
+#### Change in official container registry
+
+etcd now uses [`gcr.io/etcd-development/etcd`](https://gcr.io/etcd-development/etcd) as a primary container registry, and [`quay.io/coreos/etcd`](https://quay.io/coreos/etcd) as secondary.
+
+Before
+
+```bash
+docker pull quay.io/coreos/etcd:v3.2.5
+```
+
+After
+
+```bash
+docker pull gcr.io/etcd-development/etcd:v3.3.0
+```
+
+### Server upgrade checklists
+
+#### Upgrade requirements
+
+To upgrade an existing etcd deployment to 3.3, the running cluster must be 3.2 or greater. If it's before 3.2, please [upgrade to 3.2](upgrade_3_2.md) before upgrading to 3.3.
+
+Also, to ensure a smooth rolling upgrade, the running cluster must be healthy. Check the health of the cluster by using the `etcdctl endpoint health` command before proceeding.
+
+#### Preparation
+
+Before upgrading etcd, always test the services relying on etcd in a staging environment before deploying the upgrade to the production environment.
+
+Before beginning, [backup the etcd data](../op-guide/maintenance.md#snapshot-backup). Should something go wrong with the upgrade, it is possible to use this backup to [downgrade](#downgrade) back to existing etcd version. Please note that the `snapshot` command only backs up the v3 data. For v2 data, see [backing up v2 datastore](../v2/admin_guide.md#backing-up-the-datastore).
+
+#### Mixed versions
+
+While upgrading, an etcd cluster supports mixed versions of etcd members, and operates with the protocol of the lowest common version. The cluster is only considered upgraded once all of its members are upgraded to version 3.3. Internally, etcd members negotiate with each other to determine the overall cluster version, which controls the reported version and the supported features.
+
+#### Limitations
+
+Note: If the cluster only has v3 data and no v2 data, it is not subject to this limitation.
+
+If the cluster is serving a v2 data set larger than 50MB, each newly upgraded member may take up to two minutes to catch up with the existing cluster. Check the size of a recent snapshot to estimate the total data size. In other words, it is safest to wait for 2 minutes between upgrading each member.
+
+For a much larger total data size, 100MB or more, this one-time process might take even more time. Administrators of very large etcd clusters of this magnitude can feel free to contact the [etcd team][etcd-contact] before upgrading, and we'll be happy to provide advice on the procedure.
+
+#### Downgrade
+
+If all members have been upgraded to v3.3, the cluster will be upgraded to v3.3, and downgrade from this completed state is **not possible**. If any single member is still v3.2, however, the cluster and its operations remain "v3.2", and it is possible from this mixed cluster state to return to using a v3.2 etcd binary on all members.
+
+Please [backup the data directory](../op-guide/maintenance.md#snapshot-backup) of all etcd members to make downgrading the cluster possible even after it has been completely upgraded.
+
+### Upgrade procedure
+
+This example shows how to upgrade a 3-member v3.2 etcd cluster running on a local machine.
+
+#### 1. Check upgrade requirements
+
+Is the cluster healthy and running v3.2.x?
+
+```
+$ ETCDCTL_API=3 etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:2379 is healthy: successfully committed proposal: took = 6.600684ms
+localhost:22379 is healthy: successfully committed proposal: took = 8.540064ms
+localhost:32379 is healthy: successfully committed proposal: took = 8.763432ms
+
+$ curl http://localhost:2379/version
+{"etcdserver":"3.2.7","etcdcluster":"3.2.0"}
+```
+
+#### 2. Stop the existing etcd process
+
+When each etcd process is stopped, expected errors will be logged by other cluster members. This is normal since a cluster member connection has been (temporarily) broken:
+
+```
+14:13:31.491746 I | raft: c89feb932daef420 [term 3] received MsgTimeoutNow from 6d4f535bae3ab960 and starts an election to get leadership.
+14:13:31.491769 I | raft: c89feb932daef420 became candidate at term 4
+14:13:31.491788 I | raft: c89feb932daef420 received MsgVoteResp from c89feb932daef420 at term 4
+14:13:31.491797 I | raft: c89feb932daef420 [logterm: 3, index: 9] sent MsgVote request to 6d4f535bae3ab960 at term 4
+14:13:31.491805 I | raft: c89feb932daef420 [logterm: 3, index: 9] sent MsgVote request to 9eda174c7df8a033 at term 4
+14:13:31.491815 I | raft: raft.node: c89feb932daef420 lost leader 6d4f535bae3ab960 at term 4
+14:13:31.524084 I | raft: c89feb932daef420 received MsgVoteResp from 6d4f535bae3ab960 at term 4
+14:13:31.524108 I | raft: c89feb932daef420 [quorum:2] has received 2 MsgVoteResp votes and 0 vote rejections
+14:13:31.524123 I | raft: c89feb932daef420 became leader at term 4
+14:13:31.524136 I | raft: raft.node: c89feb932daef420 elected leader c89feb932daef420 at term 4
+14:13:31.592650 W | rafthttp: lost the TCP streaming connection with peer 6d4f535bae3ab960 (stream MsgApp v2 reader)
+14:13:31.592825 W | rafthttp: lost the TCP streaming connection with peer 6d4f535bae3ab960 (stream Message reader)
+14:13:31.693275 E | rafthttp: failed to dial 6d4f535bae3ab960 on stream Message (dial tcp [::1]:2380: getsockopt: connection refused)
+14:13:31.693289 I | rafthttp: peer 6d4f535bae3ab960 became inactive
+14:13:31.936678 W | rafthttp: lost the TCP streaming connection with peer 6d4f535bae3ab960 (stream Message writer)
+```
+
+It's a good idea at this point to [backup the etcd data](../op-guide/maintenance.md#snapshot-backup) to provide a downgrade path should any problems occur:
+
+```
+$ etcdctl snapshot save backup.db
+```
+
+#### 3. Drop-in etcd v3.3 binary and start the new etcd process
+
+The new v3.3 etcd will publish its information to the cluster:
+
+```
+14:14:25.363225 I | etcdserver: published {Name:s1 ClientURLs:[http://localhost:2379]} to cluster a9ededbffcb1b1f1
+```
+
+Verify that each member, and then the entire cluster, becomes healthy with the new v3.3 etcd binary:
+
+```
+$ ETCDCTL_API=3 /etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:22379 is healthy: successfully committed proposal: took = 5.540129ms
+localhost:32379 is healthy: successfully committed proposal: took = 7.321771ms
+localhost:2379 is healthy: successfully committed proposal: took = 10.629901ms
+```
+
+Upgraded members will log warnings like the following until the entire cluster is upgraded. This is expected and will cease after all etcd cluster members are upgraded to v3.3:
+
+```
+14:15:17.071804 W | etcdserver: member c89feb932daef420 has a higher version 3.3.0
+14:15:21.073110 W | etcdserver: the local etcd version 3.2.7 is not up-to-date
+14:15:21.073142 W | etcdserver: member 6d4f535bae3ab960 has a higher version 3.3.0
+14:15:21.073157 W | etcdserver: the local etcd version 3.2.7 is not up-to-date
+14:15:21.073164 W | etcdserver: member c89feb932daef420 has a higher version 3.3.0
+```
+
+#### 4. Repeat step 2 to step 3 for all other members
+
+#### 5. Finish
+
+When all members are upgraded, the cluster will report upgrading to 3.3 successfully:
+
+```
+14:15:54.536901 N | etcdserver/membership: updated the cluster version from 3.2 to 3.3
+14:15:54.537035 I | etcdserver/api: enabled capabilities for version 3.3
+```
+
+```
+$ ETCDCTL_API=3 /etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:2379 is healthy: successfully committed proposal: took = 2.312897ms
+localhost:22379 is healthy: successfully committed proposal: took = 2.553476ms
+localhost:32379 is healthy: successfully committed proposal: took = 2.517902ms
+```
+
+[etcd-contact]: https://groups.google.com/forum/#!forum/etcd-dev
--- a/Documentation/upgrades/upgrade_3_4.md
+++ b/Documentation/upgrades/upgrade_3_4.md
@@ -0,0 +1,171 @@
+## Upgrade etcd from 3.3 to 3.4
+
+In the general case, upgrading from etcd 3.3 to 3.4 can be a zero-downtime, rolling upgrade:
+ - one by one, stop the etcd v3.3 processes and replace them with etcd v3.4 processes
+ - after running all v3.4 processes, new features in v3.4 are available to the cluster
+
+Before [starting an upgrade](#upgrade-procedure), read through the rest of this guide to prepare.
+
+### Upgrade checklists
+
+**NOTE:** When [migrating from v2 with no v3 data](https://github.com/coreos/etcd/issues/9480), etcd server v3.2+ panics if it restores from existing snapshots but finds no v3 `ETCD_DATA_DIR/member/snap/db` file. This happens when the server has migrated from v2 with no previous v3 data. The panic also prevents accidental v3 data loss (e.g. the `db` file might have been moved). etcd requires that upgrades after the v3 migration happen only with v3 data present. Do not upgrade to newer v3 versions until the v3.0 server contains v3 data.
+
+Highlighted breaking changes in 3.4.
+
+#### Change in `etcd` flags
+
+`--ca-file` and `--peer-ca-file` flags are deprecated; they have been deprecated since v2.1.
+
+```diff
+-etcd --ca-file ca-client.crt
+etcd --trusted-ca-file ca-client.crt
+```
+
+```diff
+-etcd --peer-ca-file ca-peer.crt
+etcd --peer-trusted-ca-file ca-peer.crt
+```
+
+#### Change in `pkg/transport`
+
+Deprecated `pkg/transport.TLSInfo.CAFile` field.
+
+```diff
+import "github.com/coreos/etcd/pkg/transport"
+
+tlsInfo := transport.TLSInfo{
+    CertFile: "/tmp/test-certs/test.pem",
+    KeyFile: "/tmp/test-certs/test-key.pem",
+-   CAFile: "/tmp/test-certs/trusted-ca.pem",
+   TrustedCAFile: "/tmp/test-certs/trusted-ca.pem",
+}
+tlsConfig, err := tlsInfo.ClientConfig()
+if err != nil {
+    panic(err)
+}
+```
+
+### Server upgrade checklists
+
+#### Upgrade requirements
+
+To upgrade an existing etcd deployment to 3.4, the running cluster must be 3.3 or greater. If it's before 3.3, please [upgrade to 3.3](upgrade_3_3.md) before upgrading to 3.4.
+
+Also, to ensure a smooth rolling upgrade, the running cluster must be healthy. Check the health of the cluster by using the `etcdctl endpoint health` command before proceeding.
+
+#### Preparation
+
+Before upgrading etcd, always test the services relying on etcd in a staging environment before deploying the upgrade to the production environment.
+
+Before beginning, [backup the etcd data](../op-guide/maintenance.md#snapshot-backup). Should something go wrong with the upgrade, it is possible to use this backup to [downgrade](#downgrade) back to existing etcd version. Please note that the `snapshot` command only backs up the v3 data. For v2 data, see [backing up v2 datastore](../v2/admin_guide.md#backing-up-the-datastore).
+
+#### Mixed versions
+
+While upgrading, an etcd cluster supports mixed versions of etcd members, and operates with the protocol of the lowest common version. The cluster is only considered upgraded once all of its members are upgraded to version 3.4. Internally, etcd members negotiate with each other to determine the overall cluster version, which controls the reported version and the supported features.
+
+#### Limitations
+
+Note: If the cluster only has v3 data and no v2 data, it is not subject to this limitation.
+
+If the cluster is serving a v2 data set larger than 50MB, each newly upgraded member may take up to two minutes to catch up with the existing cluster. Check the size of a recent snapshot to estimate the total data size. In other words, it is safest to wait for 2 minutes between upgrading each member.
+
+For a much larger total data size, 100MB or more, this one-time process might take even more time. Administrators of very large etcd clusters of this magnitude can feel free to contact the [etcd team][etcd-contact] before upgrading, and we'll be happy to provide advice on the procedure.
+
+#### Downgrade
+
+If all members have been upgraded to v3.4, the cluster will be upgraded to v3.4, and downgrade from this completed state is **not possible**. If any single member is still v3.3, however, the cluster and its operations remain "v3.3", and it is possible from this mixed cluster state to return to using a v3.3 etcd binary on all members.
+
+Please [backup the data directory](../op-guide/maintenance.md#snapshot-backup) of all etcd members to make downgrading the cluster possible even after it has been completely upgraded.
+
+### Upgrade procedure
+
+This example shows how to upgrade a 3-member v3.3 etcd cluster running on a local machine.
+
+#### 1. Check upgrade requirements
+
+Is the cluster healthy and running v3.3.x?
+
+```
+$ ETCDCTL_API=3 etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:2379 is healthy: successfully committed proposal: took = 6.600684ms
+localhost:22379 is healthy: successfully committed proposal: took = 8.540064ms
+localhost:32379 is healthy: successfully committed proposal: took = 8.763432ms
+
+$ curl http://localhost:2379/version
+{"etcdserver":"3.3.0","etcdcluster":"3.3.0"}
+```
+
+#### 2. Stop the existing etcd process
+
+When each etcd process is stopped, expected errors will be logged by other cluster members. This is normal since a cluster member connection has been (temporarily) broken:
+
+```
+14:13:31.491746 I | raft: c89feb932daef420 [term 3] received MsgTimeoutNow from 6d4f535bae3ab960 and starts an election to get leadership.
+14:13:31.491769 I | raft: c89feb932daef420 became candidate at term 4
+14:13:31.491788 I | raft: c89feb932daef420 received MsgVoteResp from c89feb932daef420 at term 4
+14:13:31.491797 I | raft: c89feb932daef420 [logterm: 3, index: 9] sent MsgVote request to 6d4f535bae3ab960 at term 4
+14:13:31.491805 I | raft: c89feb932daef420 [logterm: 3, index: 9] sent MsgVote request to 9eda174c7df8a033 at term 4
+14:13:31.491815 I | raft: raft.node: c89feb932daef420 lost leader 6d4f535bae3ab960 at term 4
+14:13:31.524084 I | raft: c89feb932daef420 received MsgVoteResp from 6d4f535bae3ab960 at term 4
+14:13:31.524108 I | raft: c89feb932daef420 [quorum:2] has received 2 MsgVoteResp votes and 0 vote rejections
+14:13:31.524123 I | raft: c89feb932daef420 became leader at term 4
+14:13:31.524136 I | raft: raft.node: c89feb932daef420 elected leader c89feb932daef420 at term 4
+14:13:31.592650 W | rafthttp: lost the TCP streaming connection with peer 6d4f535bae3ab960 (stream MsgApp v2 reader)
+14:13:31.592825 W | rafthttp: lost the TCP streaming connection with peer 6d4f535bae3ab960 (stream Message reader)
+14:13:31.693275 E | rafthttp: failed to dial 6d4f535bae3ab960 on stream Message (dial tcp [::1]:2380: getsockopt: connection refused)
+14:13:31.693289 I | rafthttp: peer 6d4f535bae3ab960 became inactive
+14:13:31.936678 W | rafthttp: lost the TCP streaming connection with peer 6d4f535bae3ab960 (stream Message writer)
+```
+
+It's a good idea at this point to [backup the etcd data](../op-guide/maintenance.md#snapshot-backup) to provide a downgrade path should any problems occur:
+
+```
+$ etcdctl snapshot save backup.db
+```
+
+#### 3. Drop-in etcd v3.4 binary and start the new etcd process
+
+The new v3.4 etcd will publish its information to the cluster:
+
+```
+14:14:25.363225 I | etcdserver: published {Name:s1 ClientURLs:[http://localhost:2379]} to cluster a9ededbffcb1b1f1
+```
+
+Verify that each member, and then the entire cluster, becomes healthy with the new v3.4 etcd binary:
+
+```
+$ ETCDCTL_API=3 /etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:22379 is healthy: successfully committed proposal: took = 5.540129ms
+localhost:32379 is healthy: successfully committed proposal: took = 7.321771ms
+localhost:2379 is healthy: successfully committed proposal: took = 10.629901ms
+```
+
+Upgraded members will log warnings like the following until the entire cluster is upgraded. This is expected and will cease after all etcd cluster members are upgraded to v3.4:
+
+```
+14:15:17.071804 W | etcdserver: member c89feb932daef420 has a higher version 3.4.0
+14:15:21.073110 W | etcdserver: the local etcd version 3.3.0 is not up-to-date
+14:15:21.073142 W | etcdserver: member 6d4f535bae3ab960 has a higher version 3.4.0
+14:15:21.073157 W | etcdserver: the local etcd version 3.3.0 is not up-to-date
+14:15:21.073164 W | etcdserver: member c89feb932daef420 has a higher version 3.4.0
+```
+
+#### 4. Repeat step 2 to step 3 for all other members
+
+#### 5. Finish
+
+When all members are upgraded, the cluster will report upgrading to 3.4 successfully:
+
+```
+14:15:54.536901 N | etcdserver/membership: updated the cluster version from 3.3 to 3.4
+14:15:54.537035 I | etcdserver/api: enabled capabilities for version 3.4
+```
+
+```
+$ ETCDCTL_API=3 /etcdctl endpoint health --endpoints=localhost:2379,localhost:22379,localhost:32379
+localhost:2379 is healthy: successfully committed proposal: took = 2.312897ms
+localhost:22379 is healthy: successfully committed proposal: took = 2.553476ms
+localhost:32379 is healthy: successfully committed proposal: took = 2.517902ms
+```
+
+[etcd-contact]: https://groups.google.com/forum/#!forum/etcd-dev
--- a/Documentation/upgrades/upgrading-etcd.md
+++ b/Documentation/upgrades/upgrading-etcd.md
@@ -0,0 +1,19 @@
+# Upgrading etcd clusters and applications
+
+This section contains documents specific to upgrading etcd clusters and applications.
+
+## Moving from etcd API v2 to API v3
+* [Migrate applications from using API v2 to API v3][migrate-apps]
+
+## Upgrading an etcd v3.x cluster
+* [Upgrade etcd from 3.0 to 3.1][upgrade-3-1]
+* [Upgrade etcd from 3.1 to 3.2][upgrade-3-2]
+
+## Upgrading from etcd v2.3
+* [Upgrade a v2.3 cluster to v3.0][upgrade-cluster]
+
+
+[migrate-apps]: ../op-guide/v2-migration.md
+[upgrade-cluster]: upgrade_3_0.md
+[upgrade-3-1]: upgrade_3_1.md
+[upgrade-3-2]: upgrade_3_2.md
--- a/Documentation/v2/README.md
+++ b/Documentation/v2/README.md
@@ -67,13 +67,13 @@ You have successfully started an etcd and written a key to the store.

 The [official etcd ports][iana-ports] are 2379 for client requests, and 2380 for peer communication. To maintain compatibility, some etcd configuration and documentation continues to refer to the legacy ports 4001 and 7001, but all new etcd use and discussion should adopt the IANA-assigned ports. The legacy ports 4001 and 7001 will be fully deprecated, and support for their use removed, in future etcd releases.

-[iana-ports]: https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml?search=etcd
+[iana-ports]: http://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.txt

 ### Running local etcd cluster

 First install [goreman](https://github.com/mattn/goreman), which manages Procfile-based applications.

-Our [Procfile script](./Procfile) will set up a local example cluster. You can start it with:
+Our [Procfile script](../../V2Procfile) will set up a local example cluster. You can start it with:

 ```sh
 goreman start
@@ -162,4 +162,4 @@ Currently only the amd64 architecture is officially supported by `etcd`.

 ### License

-etcd is under the Apache 2.0 license. See the [LICENSE](LICENSE) file for details.
+etcd is under the Apache 2.0 license. See the [LICENSE](../../LICENSE) file for details.
--- a/Documentation/v2/api_v3.md
+++ b/Documentation/v2/api_v3.md
@@ -18,7 +18,7 @@ A key’s lifetime spans a generation. Each key may have one or multiple generat

 ### Physical View

-etcd stores the physical data as key-value pairs in a persistent [b+tree][b+tree]. Each revision of the store’s state only contains the delta from its previous revision to be efficient. A single revision may correspond to multiple keys in the tree. 
+etcd stores the physical data as key-value pairs in a persistent [b+tree][b+tree]. Each revision of the store’s state only contains the delta from its previous revision to be efficient. A single revision may correspond to multiple keys in the tree.

 The key of key-value pair is a 3-tuple (major, sub, type). Major is the store revision holding the key. Sub differentiates among  keys within the same revision. Type is an optional suffix for special value (e.g., `t` if the value contains a tombstone). The value of the key-value pair contains the modification from previous revision, thus one delta from previous revision. The b+tree is ordered by key in lexical byte-order. Ranged lookups over revision deltas are fast; this enables quickly finding modifications from one specific revision to another. Compaction removes out-of-date keys-value pairs.

@@ -73,7 +73,7 @@ Any completed operations are durable. All accessible data is also durable data.

 #### Linearizability

-Linearizability (also known as Atomic Consistency or External Consistency) is a consistency level between strict consistency and sequential consistency. 
+Linearizability (also known as Atomic Consistency or External Consistency) is a consistency level between strict consistency and sequential consistency.

 For linearizability, suppose each operation receives a timestamp from a loosely synchronized global clock. Operations are linearized if and only if they always complete as though they were executed in a sequential order and each operation appears to complete in the order specified by the program. Likewise, if an operation’s timestamp precedes another, that operation must also precede the other operation in the sequence.

@@ -83,10 +83,10 @@ etcd does not ensure linearizability for watch operations. Users are expected to

 etcd ensures linearizability for all other operations by default. Linearizability comes with a cost, however, because linearized requests must go through the Raft consensus process. To obtain lower latencies and higher throughput for read requests, clients can configure a request’s consistency mode to `serializable`, which may access stale data with respect to quorum, but removes the performance penalty of linearized accesses' reliance on live consensus.

-[persistent-ds]: [https://en.wikipedia.org/wiki/Persistent_data_structure]
-[btree]: [https://en.wikipedia.org/wiki/B-tree]
-[b+tree]: [https://en.wikipedia.org/wiki/B%2B_tree]
-[seq_consistency]: [https://en.wikipedia.org/wiki/Consistency_model#Sequential_consistency]
-[strict_consistency]: [https://en.wikipedia.org/wiki/Consistency_model#Strict_consistency]
-[serializable_isolation]: [https://en.wikipedia.org/wiki/Isolation_(database_systems)#Serializable]
-[Linearizability]: [#Linearizability]
+[persistent-ds]: https://en.wikipedia.org/wiki/Persistent_data_structure
+[btree]: https://en.wikipedia.org/wiki/B-tree
+[b+tree]: https://en.wikipedia.org/wiki/B%2B_tree
+[seq_consistency]: https://en.wikipedia.org/wiki/Consistency_model#Sequential_consistency
+[strict_consistency]: https://en.wikipedia.org/wiki/Consistency_model#Strict_consistency
+[serializable_isolation]: https://en.wikipedia.org/wiki/Isolation_(database_systems)#Serializable
+[Linearizability]: #linearizability
--- a/Documentation/v2/backward_compatibility.md
+++ b/Documentation/v2/backward_compatibility.md
@@ -32,7 +32,7 @@ The consistent flag for read operations is removed in etcd 2.0.0. The normal rea

 The read consistency guarantees are:

-The consistent read guarantees the sequential consistency within one client that talks to one etcd server. Read/Write from one client to one etcd member should be observed in order. If one client write a value to an etcd server successfully, it should be able to get the value out of the server immediately. 
+The consistent read guarantees the sequential consistency within one client that talks to one etcd server. Read/Write from one client to one etcd member should be observed in order. If one client write a value to an etcd server successfully, it should be able to get the value out of the server immediately.

 Each etcd member will proxy the request to leader and only return the result to user after the result is applied on the local member. Thus after the write succeed, the user is guaranteed to see the value on the member it sent the request to.

@@ -56,6 +56,7 @@ Proxy mode in 2.0 will provide similar functionality, and with improved control
 ## Discovery Service

 A size key needs to be provided inside a [discovery token][discoverytoken].
+
 [discoverytoken]: clustering.md#custom-etcd-discovery-service

 ## HTTP Admin API
--- a/Documentation/v2/benchmarks/etcd-2-1-0-alpha-benchmarks.md
+++ b/Documentation/v2/benchmarks/etcd-2-1-0-alpha-benchmarks.md
@@ -49,4 +49,4 @@ Bootstrap another machine and use the [boom HTTP benchmark tool][boom] to send r
 | 256               | 256               | all servers        | 3061      | 119.3 |

 [boom]: https://github.com/rakyll/boom
-[hack-benchmark]: /hack/benchmark/
+[hack-benchmark]: ../../../hack/benchmark/
--- a/Documentation/v2/benchmarks/etcd-2-2-0-benchmarks.md
+++ b/Documentation/v2/benchmarks/etcd-2-2-0-benchmarks.md
@@ -24,7 +24,7 @@ Go OS/Arch: linux/amd64

 ## Testing

-Bootstrap another machine, outside of the etcd cluster, and run the [`boom` HTTP benchmark tool](https://github.com/rakyll/boom) with a connection reuse patch to send requests to each etcd cluster member. See the [benchmark instructions](../../hack/benchmark/) for the patch and the steps to reproduce our procedures.
+Bootstrap another machine, outside of the etcd cluster, and run the [`boom` HTTP benchmark tool][boom] with a connection reuse patch to send requests to each etcd cluster member. See the [benchmark instructions][hack] for the patch and the steps to reproduce our procedures.

 The performance is calulated through results of 100 benchmark rounds.

@@ -66,4 +66,7 @@ The performance is calulated through results of 100 benchmark rounds.

 - Write QPS to cluster leaders seems to be increased by a small margin. This is because the main loop and entry apply loops were decoupled in the etcd raft logic, eliminating several blocks between them.

- Write QPS to all members seems to be increased by a significant margin, because followers now receive the latest commit index sooner, and commit proposals more quickly.
+- Write QPS to all members seems to be increased by a significant margin, because followers now receive the latest commit index sooner, and commit proposals more quickly.
+
+[boom]: https://github.com/rakyll/boom
+[hack]: ../../../hack/benchmark/
--- a/Documentation/v2/benchmarks/etcd-2-2-0-rc-benchmarks.md
+++ b/Documentation/v2/benchmarks/etcd-2-2-0-rc-benchmarks.md
@@ -69,4 +69,4 @@ Bootstrap another machine and use the [boom HTTP benchmark tool][boom] to send r
 [boom]: https://github.com/rakyll/boom
 [c7146bd5]: https://github.com/coreos/etcd/commits/c7146bd5f2c73716091262edc638401bb8229144
 [etcd-2.1-benchmark]: etcd-2-1-0-alpha-benchmarks.md
-[hack-benchmark]: /hack/benchmark/
+[hack-benchmark]: ../../../hack/benchmark/
--- a/Documentation/v2/benchmarks/etcd-3-demo-benchmarks.md
+++ b/Documentation/v2/benchmarks/etcd-3-demo-benchmarks.md
@@ -39,4 +39,4 @@ The performance is nearly the same as the one with empty server handler.
 The performance with empty server handler is not affected by one put. So the
 performance downgrade should be caused by storage package.

-[etcd-v3-benchmark]: /tools/benchmark/
+[etcd-v3-benchmark]: ../../../tools/benchmark/
--- a/Documentation/v2/clustering.md
+++ b/Documentation/v2/clustering.md
@@ -423,7 +423,7 @@ To make understanding this feature easier, we changed the naming of some flags,
 |-peers      |none      |Deprecated. The --initial-cluster flag provides a similar concept with different semantics. Please read this guide on cluster startup.|
 |-peers-file    |none      |Deprecated. The --initial-cluster flag provides a similar concept with different semantics. Please read this guide on cluster startup.|

-[client]: /client
+[client]: ../../client
 [client-discoverer]: https://godoc.org/github.com/coreos/etcd/client#Discoverer
 [conf-adv-client]: configuration.md#-advertise-client-urls
 [conf-listen-client]: configuration.md#-listen-client-urls
--- a/Documentation/v2/configuration.md
+++ b/Documentation/v2/configuration.md
@@ -234,7 +234,7 @@ The security flags help to [build a secure etcd cluster][security].
 + env variable: ETCD_DEBUG

 ### --log-package-levels
-+ Set individual etcd subpackages to specific log levels. An example being `etcdserver=WARNING,security=DEBUG` 
+ Set individual etcd subpackages to specific log levels. An example being `etcdserver=WARNING,security=DEBUG`
 + default: none (INFO for all packages)
 + env variable: ETCD_LOG_PACKAGE_LEVELS

@@ -272,7 +272,7 @@ Follow the instructions when using these flags.
 [build-cluster]: clustering.md#static
 [reconfig]: runtime-configuration.md
 [discovery]: clustering.md#discovery
-[iana-ports]: https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml?search=etcd
+[iana-ports]: http://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.txt
 [proxy]: proxy.md
 [reconfig]: runtime-configuration.md
 [restore]: admin_guide.md#restoring-a-backup
--- a/Documentation/v2/libraries-and-tools.md
+++ b/Documentation/v2/libraries-and-tools.md
@@ -112,7 +112,6 @@
 - [mattn/etcdenv](https://github.com/mattn/etcdenv) - "env" shebang with etcd integration
 - [kelseyhightower/confd](https://github.com/kelseyhightower/confd) - Manage local app config files using templates and data from etcd
 - [configdb](https://git.autistici.org/ai/configdb/tree/master) - A REST relational abstraction on top of arbitrary database backends, aimed at storing configs and inventories.
- [scrz](https://github.com/scrz/scrz) - Container manager, stores configuration in etcd.
 - [fleet](https://github.com/coreos/fleet) - Distributed init system
 - [kubernetes/kubernetes](https://github.com/kubernetes/kubernetes) - Container cluster manager introduced by Google.
 - [mailgun/vulcand](https://github.com/mailgun/vulcand) - HTTP proxy that uses etcd as a configuration backend.
--- a/Documentation/v2/reporting_bugs.md
+++ b/Documentation/v2/reporting_bugs.md
@@ -1,6 +1,6 @@
 # Reporting Bugs

-If you find bugs or documentation mistakes in the etcd project, please let us know by [opening an issue][issue]. We treat bugs and mistakes very seriously and believe no issue is too small. Before creating a bug report, please check that an issue reporting the same problem does not already exist.
+If you find bugs or documentation mistakes in the etcd project, please let us know by [opening an issue][etcd-issue]. We treat bugs and mistakes very seriously and believe no issue is too small. Before creating a bug report, please check that an issue reporting the same problem does not already exist.

 To make your bug report accurate and easy to understand, please try to create bug reports that are:

--- a/Documentation/v2/rfc/v3api.md
+++ b/Documentation/v2/rfc/v3api.md
@@ -7,25 +7,25 @@ To prove out the design of the v3 API the team has also built [a number of examp
 # Design

 1. Flatten binary key-value space
-    
+
 2. Keep the event history until compaction
    - access to old version of keys
    - user controlled history compaction
-    
+
 3. Support range query
    - Pagination support with limit argument
    - Support consistency guarantee across multiple range queries
-    
+
 4. Replace TTL key with Lease
    - more efficient/ low cost keep alive
    - a logical group of TTL keys
-    
+
 5. Replace CAS/CAD with multi-object Txn
    - MUCH MORE powerful and flexible
-    
+
 6. Support efficient watching with multiple ranges

-7. RPC API supports the completed set of APIs. 
+7. RPC API supports the completed set of APIs.
    - more efficient than JSON/HTTP
    - additional txn/lease support

@@ -56,7 +56,7 @@ the size in the future a little bit or make it configurable.
 // A put is always successful
 Put( PutRequest { key = foo, value = bar } )

-PutResponse { 
+PutResponse {
    cluster_id = 0x1000,
    member_id = 0x1,
    revision = 1,
@@ -119,7 +119,7 @@ RangeResponse {
 Txn(TxnRequest {
    // mod_revision of foo0 is equal to 1, mod_revision of foo1 is greater than 1
    compare = {
-        {compareType = equal, key = foo0, mod_revision = 1}, 
+        {compareType = equal, key = foo0, mod_revision = 1},
        {compareType = greater, key = foo1, mod_revision = 1}}
    },
    // if the comparison succeeds, put foo2 = bar2
@@ -156,7 +156,7 @@ Watch( WatchRequest{
           end_revision = 10000,
           // server decided notification frequency
           progress_notification = true,
-       } 
+       }
       … // this can be a watch request stream
      )

@@ -176,7 +176,7 @@ WatchResponse {
          },
    }
    …
-    
+
    // a notification at 2000
    WatchResponse {
        cluster_id = 0x1000,
@@ -185,9 +185,9 @@ WatchResponse {
        raft_term = 0x1,
        // nil event as notification
    }
-    
-    … 
-    
+
+    …
+
    // put (foo0=bar3000) event at 3000
    WatchResponse {
        cluster_id = 0x1000,
@@ -204,8 +204,8 @@ WatchResponse {
          },
    }
    …
-    
+
 ```

-[api-protobuf]: https://github.com/coreos/etcd/blob/master/etcdserver/etcdserverpb/rpc.proto
-[kv-protobuf]: https://github.com/coreos/etcd/blob/master/storage/storagepb/kv.proto
+[api-protobuf]: https://github.com/coreos/etcd/blob/release-2.3/etcdserver/etcdserverpb/rpc.proto
+[kv-protobuf]: https://github.com/coreos/etcd/blob/release-2.3/storage/storagepb/kv.proto
--- a/Documentation/v2/security.md
+++ b/Documentation/v2/security.md
@@ -188,6 +188,6 @@ Make sure that you sign your certificates with a Subject Name your member's publ
 If you need your certificate to be signed for your member's FQDN in its Subject Name then you could use Subject Alternative Names (short IP SANs) to add your IP address. The `etcd-ca` tool provides `--domain=` option for its `new-cert` command, and openssl can make [it][alt-name] too.

 [cfssl]: https://github.com/cloudflare/cfssl
-[tls-setup]: /hack/tls-setup
+[tls-setup]: ../../hack/tls-setup
 [tls-guide]: https://github.com/coreos/docs/blob/master/os/generate-self-signed-certificates.md
 [alt-name]: http://wiki.cacert.org/FAQ/subjectAltName
--- a/39
+++ b/39
@@ -1,3 +1,36 @@
+etcd v3.1.0 (2017-01-20)
+- faster linearizable reads (implements Raft read-index)
+- automatic leadership transfer when leader steps down
+- etcd uses default route IP if advertise URL is not given
+- cluster rejects removing members if quorum will be lost
+- SRV records (e.g., infra1.example.com) must match the discovery domain
+  (i.e., example.com) if no custom certificate authority is given
+  - TLSConfig ServerName is ignored with user-provided certificates
+    for backwards compatibility; to be deprecated in 3.2
+- discovery now has upper limit for waiting on retries
+- etcd flags
+  - --strict-reconfig-check flag is set by default
+  - add --log-output flag
+  - add --metrics flag
+- v3 authentication API is now stable
+- v3 client
+  - add SetEndpoints method; update endpoints at runtime
+  - add Sync method; auto-update endpoints at runtime
+  - add Lease TimeToLive API; fetch lease information
+  - replace Config.Logger field with global logger
+  - Get API responses are sorted in ascending order by default
+- v3 etcdctl
+  - add lease timetolive command
+  - add --print-value-only flag to get command
+  - add --dest-prefix flag to make-mirror command
+  - command get responses are sorted in ascending order by default
+- recipes now conform to sessions defined in clientv3/concurrency
+- ACI has symlinks to /usr/local/bin/etcd*
+- warn on binding listeners through domain names; to be deprecated in 3.2
+- experimental gRPC proxy feature
+
+etcd v3.0.16 (2017-01-13)
+
 etcd v3.0.15 (2016-11-11)
 - fix cancel watch request with wrong range end

@@ -11,7 +44,7 @@ etcd v3.0.12 (2016-10-07)
 etcd v3.0.11 (2016-10-07)
 - server returns previous key-value (optional)
  - clientv3 WithPrevKV option
-  - v3 etcdctl prev-kv flag
+  - v3 etcdctl put,watch,del --prev-kv flag

 etcd v3.0.10 (2016-09-23)

@@ -28,7 +61,7 @@ etcd v3.0.6 (2016-08-19)

 etcd v3.0.5 (2016-08-19)
 - SRV records (e.g., infra1.example.com) must match the discovery domain
-  (i.e., example.com) when using the default certificate authority.
+  (i.e., example.com) if no custom certificate authority is given

 etcd v3.0.4 (2016-07-27)
 - v2 auth can now use common name from TLS certificate when --client-cert-auth is enabled
@@ -44,3 +77,5 @@ etcd v3.0.2 (2016-07-08)
 - Dockerfile uses ENTRYPOINT, instead of CMD, to run etcd without binary path specified

 etcd v3.0.1 (2016-07-01)
+
+etcd v3.0.0 (2016-06-30)
--- a/README.md
+++ b/README.md
@@ -37,13 +37,14 @@ See [etcdctl][etcdctl] for a simple command line client.

 ### Getting etcd

-The easiest way to get etcd is to use one of the pre-built release binaries which are available for OSX, Linux, Windows, AppC (ACI), and Docker. Instructions for using these binaries are on the [GitHub releases page][github-release].
+The easiest way to get etcd is to use one of the pre-built release binaries which are available for OSX, Linux, Windows, [rkt][rkt], and Docker. Instructions for using these binaries are on the [GitHub releases page][github-release].

 For those wanting to try the very latest version, you can [build the latest version of etcd][dl-build] from the `master` branch.
-You will first need [*Go*](https://golang.org/) installed on your machine (version 1.6+ is required).
+You will first need [*Go*](https://golang.org/) installed on your machine (version 1.7+ is required).
 All development occurs on `master`, including new features and bug fixes.
 Bug fixes are first targeted at `master` and subsequently ported to release branches, as described in the [branch management][branch-management] guide.

+[rkt]: https://github.com/coreos/rkt/releases/
 [github-release]: https://github.com/coreos/etcd/releases/
 [branch-management]: ./Documentation/branch_management.md
 [dl-build]: ./Documentation/dl_build.md#build-the-latest-version
@@ -77,7 +78,7 @@ That's it! etcd is now running and serving client requests. For more

 The [official etcd ports][iana-ports] are 2379 for client requests, and 2380 for peer communication. 

-[iana-ports]: https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml?search=etcd
+[iana-ports]: http://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.txt

 ### Running a local etcd cluster

@@ -135,5 +136,3 @@ See [reporting bugs](Documentation/reporting_bugs.md) for details about reportin
 ### License

 etcd is under the Apache 2.0 license. See the [LICENSE](LICENSE) file for details.
-
-
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -6,19 +6,18 @@ This document defines a high level roadmap for etcd development.

 The dates below should not be considered authoritative, but rather indicative of the projected timeline of the project. The [milestones defined in GitHub](https://github.com/coreos/etcd/milestones) represent the most up-to-date and issue-for-issue plans.

-etcd 3.0 is our current stable branch. The roadmap below outlines new features that will be added to etcd, and while subject to change, define what future stable will look like.
+etcd 3.1 is our current stable branch. The roadmap below outlines new features that will be added to etcd, and while subject to change, define what future stable will look like.

-### etcd 3.1 (2016-Oct)
- Stable L4 gateway
- Experimental support for scalable proxy
- Automatic leadership transfer for the rolling upgrade
- V3 API improvements
-  - Get previous key-value pair
-  - Get only keys (ignore values)
-  - Get only key count
-
-### etcd 3.2 (2017-Feb)
+### etcd 3.2 (2017-May)
 - Stable scalable proxy
- JWT token based auth
+- Proxy-as-client interface passthrough
+- Lock service
+- Namespacing proxy
+- TLS Common Name and JWT token based authentication
+- Read-modify-write V3 Put
 - Improved watch performance
- ...
+- Support non-blocking concurrent read
+
+### etcd 3.3 (?)
+- TBD
+
--- a/auth/range_perm_cache.go
+++ b/auth/range_perm_cache.go
@@ -49,38 +49,30 @@ func isRangeEqual(a, b *rangePerm) bool {

 // removeSubsetRangePerms removes any rangePerms that are subsets of other rangePerms.
 // If there are equal ranges, removeSubsetRangePerms only keeps one of them.
-func removeSubsetRangePerms(perms []*rangePerm) []*rangePerm {
-	// TODO(mitake): currently it is O(n^2), we need a better algorithm
-	var newp []*rangePerm
-
+// It returns a sorted rangePerm slice.
+func removeSubsetRangePerms(perms []*rangePerm) (newp []*rangePerm) {
+	sort.Sort(RangePermSliceByBegin(perms))
+	var prev *rangePerm
 	for i := range perms {
-		skip := false
-
-		for j := range perms {
-			if i == j {
-				continue
-			}
-
-			if isRangeEqual(perms[i], perms[j]) {
-				// if ranges are equal, we only keep the first range.
-				if i > j {
-					skip = true
-					break
-				}
-			} else if isSubset(perms[i], perms[j]) {
-				// if a range is a strict subset of the other one, we skip the subset.
-				skip = true
-				break
-			}
-		}
-
-		if skip {
+		if i == 0 {
+			prev = perms[i]
+			newp = append(newp, perms[i])
 			continue
 		}
-
+		if isRangeEqual(perms[i], prev) {
+			continue
+		}
+		if isSubset(perms[i], prev) {
+			continue
+		}
+		if isSubset(prev, perms[i]) {
+			prev = perms[i]
+			newp[len(newp)-1] = perms[i]
+			continue
+		}
+		prev = perms[i]
 		newp = append(newp, perms[i])
 	}
-
 	return newp
 }

@@ -88,7 +80,6 @@ func removeSubsetRangePerms(perms []*rangePerm) []*rangePerm {
 func mergeRangePerms(perms []*rangePerm) []*rangePerm {
 	var merged []*rangePerm
 	perms = removeSubsetRangePerms(perms)
-	sort.Sort(RangePermSliceByBegin(perms))

 	i := 0
 	for i < len(perms) {
--- a/auth/range_perm_cache_test.go
+++ b/auth/range_perm_cache_test.go
@@ -16,6 +16,8 @@ package auth

 import (
 	"bytes"
+	"fmt"
+	"reflect"
 	"testing"
 )

@@ -131,3 +133,47 @@ func TestGetMergedPerms(t *testing.T) {
 		}
 	}
 }
+
+func TestRemoveSubsetRangePerms(t *testing.T) {
+	tests := []struct {
+		perms  []*rangePerm
+		expect []*rangePerm
+	}{
+		{ // subsets converge
+			[]*rangePerm{{[]byte{2}, []byte{3}}, {[]byte{2}, []byte{5}}, {[]byte{1}, []byte{4}}},
+			[]*rangePerm{{[]byte{1}, []byte{4}}, {[]byte{2}, []byte{5}}},
+		},
+		{ // subsets converge
+			[]*rangePerm{{[]byte{0}, []byte{3}}, {[]byte{0}, []byte{1}}, {[]byte{2}, []byte{4}}, {[]byte{0}, []byte{2}}},
+			[]*rangePerm{{[]byte{0}, []byte{3}}, {[]byte{2}, []byte{4}}},
+		},
+		{ // biggest range at the end
+			[]*rangePerm{{[]byte{2}, []byte{3}}, {[]byte{0}, []byte{2}}, {[]byte{1}, []byte{4}}, {[]byte{0}, []byte{5}}},
+			[]*rangePerm{{[]byte{0}, []byte{5}}},
+		},
+		{ // biggest range at the beginning
+			[]*rangePerm{{[]byte{0}, []byte{5}}, {[]byte{2}, []byte{3}}, {[]byte{0}, []byte{2}}, {[]byte{1}, []byte{4}}},
+			[]*rangePerm{{[]byte{0}, []byte{5}}},
+		},
+		{ // no overlapping ranges
+			[]*rangePerm{{[]byte{2}, []byte{3}}, {[]byte{0}, []byte{1}}, {[]byte{4}, []byte{7}}, {[]byte{8}, []byte{15}}},
+			[]*rangePerm{{[]byte{0}, []byte{1}}, {[]byte{2}, []byte{3}}, {[]byte{4}, []byte{7}}, {[]byte{8}, []byte{15}}},
+		},
+	}
+	for i, tt := range tests {
+		rs := removeSubsetRangePerms(tt.perms)
+		if !reflect.DeepEqual(rs, tt.expect) { // compares both contents and order of the returned slice
+			t.Fatalf("#%d: unexpected rangePerms, expected %q, got %q", i, printPerms(tt.expect), printPerms(rs))
+		}
+	}
+}
+
+func printPerms(rs []*rangePerm) (txt string) {
+	for i, p := range rs {
+		if i != 0 {
+			txt += ","
+		}
+		txt += fmt.Sprintf("%+v", *p)
+	}
+	return
+}
--- a/auth/simple_token.go
+++ b/auth/simple_token.go
@@ -21,6 +21,8 @@ import (
 	"crypto/rand"
 	"math/big"
 	"strings"
+	"sync"
+	"time"
 )

 const (
@@ -28,6 +30,83 @@ const (
 	defaultSimpleTokenLength = 16
 )

+// var for testing purposes
+var (
+	simpleTokenTTL           = 5 * time.Minute
+	simpleTokenTTLResolution = 1 * time.Second
+)
+
+type simpleTokenTTLKeeper struct {
+	tokens          map[string]time.Time
+	donec           chan struct{}
+	stopc           chan struct{}
+	deleteTokenFunc func(string)
+	mu              *sync.Mutex
+}
+
+func (tm *simpleTokenTTLKeeper) stop() {
+	select {
+	case tm.stopc <- struct{}{}:
+	case <-tm.donec:
+	}
+	<-tm.donec
+}
+
+func (tm *simpleTokenTTLKeeper) addSimpleToken(token string) {
+	tm.tokens[token] = time.Now().Add(simpleTokenTTL)
+}
+
+func (tm *simpleTokenTTLKeeper) resetSimpleToken(token string) {
+	if _, ok := tm.tokens[token]; ok {
+		tm.tokens[token] = time.Now().Add(simpleTokenTTL)
+	}
+}
+
+func (tm *simpleTokenTTLKeeper) deleteSimpleToken(token string) {
+	delete(tm.tokens, token)
+}
+
+func (tm *simpleTokenTTLKeeper) run() {
+	tokenTicker := time.NewTicker(simpleTokenTTLResolution)
+	defer func() {
+		tokenTicker.Stop()
+		close(tm.donec)
+	}()
+	for {
+		select {
+		case <-tokenTicker.C:
+			nowtime := time.Now()
+			tm.mu.Lock()
+			for t, tokenendtime := range tm.tokens {
+				if nowtime.After(tokenendtime) {
+					tm.deleteTokenFunc(t)
+					delete(tm.tokens, t)
+				}
+			}
+			tm.mu.Unlock()
+		case <-tm.stopc:
+			return
+		}
+	}
+}
+
+func (as *authStore) enable() {
+	delf := func(tk string) {
+		if username, ok := as.simpleTokens[tk]; ok {
+			plog.Infof("deleting token %s for user %s", tk, username)
+			delete(as.simpleTokens, tk)
+		}
+	}
+	as.simpleTokenKeeper = &simpleTokenTTLKeeper{
+		tokens:          make(map[string]time.Time),
+		donec:           make(chan struct{}),
+		stopc:           make(chan struct{}),
+		deleteTokenFunc: delf,
+		mu:              &as.simpleTokensMu,
+	}
+	go as.simpleTokenKeeper.run()
+}
+
 func (as *authStore) GenSimpleToken() (string, error) {
 	ret := make([]byte, defaultSimpleTokenLength)

@@ -45,23 +124,26 @@ func (as *authStore) GenSimpleToken() (string, error) {

 func (as *authStore) assignSimpleTokenToUser(username, token string) {
 	as.simpleTokensMu.Lock()
-
 	_, ok := as.simpleTokens[token]
 	if ok {
 		plog.Panicf("token %s is alredy used", token)
 	}

 	as.simpleTokens[token] = username
+	as.simpleTokenKeeper.addSimpleToken(token)
 	as.simpleTokensMu.Unlock()
 }

 func (as *authStore) invalidateUser(username string) {
+	if as.simpleTokenKeeper == nil {
+		return
+	}
 	as.simpleTokensMu.Lock()
-	defer as.simpleTokensMu.Unlock()
-
 	for token, name := range as.simpleTokens {
 		if strings.Compare(name, username) == 0 {
 			delete(as.simpleTokens, token)
+			as.simpleTokenKeeper.deleteSimpleToken(token)
 		}
 	}
+	as.simpleTokensMu.Unlock()
 }
--- a/auth/store.go
+++ b/auth/store.go
@@ -20,6 +20,7 @@ import (
 	"errors"
 	"fmt"
 	"sort"
+	"strconv"
 	"strings"
 	"sync"

@@ -29,6 +30,7 @@ import (
 	"github.com/coreos/pkg/capnslog"
 	"golang.org/x/crypto/bcrypt"
 	"golang.org/x/net/context"
+	"google.golang.org/grpc/metadata"
 )

 var (
@@ -57,6 +59,7 @@ var (
 	ErrPermissionNotGranted = errors.New("auth: permission is not granted to the role")
 	ErrAuthNotEnabled       = errors.New("auth: authentication is not enabled")
 	ErrAuthOldRevision      = errors.New("auth: revision in header is old")
+	ErrInvalidAuthToken     = errors.New("auth: invalid auth token")

 	// BcryptCost is the algorithm cost / strength for hashing auth passwords
 	BcryptCost = bcrypt.DefaultCost
@@ -150,6 +153,12 @@ type AuthStore interface {

 	// CheckPassword checks a given pair of username and password is correct
 	CheckPassword(username, password string) (uint64, error)
+
+	// Close does cleanup of AuthStore
+	Close() error
+
+	// AuthInfoFromCtx gets AuthInfo from gRPC's context
+	AuthInfoFromCtx(ctx context.Context) (*AuthInfo, error)
 }

 type authStore struct {
@@ -159,13 +168,33 @@ type authStore struct {

 	rangePermCache map[string]*unifiedRangePermissions // username -> unifiedRangePermissions

-	simpleTokensMu sync.RWMutex
-	simpleTokens   map[string]string // token -> username
-
 	revision uint64
+
+	// tokenSimple in v3.2+
+	indexWaiter       func(uint64) <-chan struct{}
+	simpleTokenKeeper *simpleTokenTTLKeeper
+	simpleTokensMu    sync.Mutex
+	simpleTokens      map[string]string // token -> username
+}
+
+func newDeleterFunc(as *authStore) func(string) {
+	return func(t string) {
+		as.simpleTokensMu.Lock()
+		defer as.simpleTokensMu.Unlock()
+		if username, ok := as.simpleTokens[t]; ok {
+			plog.Infof("deleting token %s for user %s", t, username)
+			delete(as.simpleTokens, t)
+		}
+	}
 }

 func (as *authStore) AuthEnable() error {
+	as.enabledMu.Lock()
+	defer as.enabledMu.Unlock()
+	if as.enabled {
+		plog.Noticef("Authentication already enabled")
+		return nil
+	}
 	b := as.be
 	tx := b.BatchTx()
 	tx.Lock()
@@ -185,9 +214,8 @@ func (as *authStore) AuthEnable() error {

 	tx.UnsafePut(authBucketName, enableFlagKey, authEnabled)

-	as.enabledMu.Lock()
 	as.enabled = true
-	as.enabledMu.Unlock()
+	as.enable()

 	as.rangePermCache = make(map[string]*unifiedRangePermissions)

@@ -199,6 +227,11 @@ func (as *authStore) AuthEnable() error {
 }

 func (as *authStore) AuthDisable() {
+	as.enabledMu.Lock()
+	defer as.enabledMu.Unlock()
+	if !as.enabled {
+		return
+	}
 	b := as.be
 	tx := b.BatchTx()
 	tx.Lock()
@@ -207,17 +240,33 @@ func (as *authStore) AuthDisable() {
 	tx.Unlock()
 	b.ForceCommit()

-	as.enabledMu.Lock()
 	as.enabled = false
-	as.enabledMu.Unlock()

 	as.simpleTokensMu.Lock()
+	tk := as.simpleTokenKeeper
+	as.simpleTokenKeeper = nil
 	as.simpleTokens = make(map[string]string) // invalidate all tokens
 	as.simpleTokensMu.Unlock()
+	if tk != nil {
+		tk.stop()
+	}

 	plog.Noticef("Authentication disabled")
 }

+func (as *authStore) Close() error {
+	as.enabledMu.Lock()
+	defer as.enabledMu.Unlock()
+	if !as.enabled {
+		return nil
+	}
+	if as.simpleTokenKeeper != nil {
+		as.simpleTokenKeeper.stop()
+		as.simpleTokenKeeper = nil
+	}
+	return nil
+}
+
 func (as *authStore) Authenticate(ctx context.Context, username, password string) (*pb.AuthenticateResponse, error) {
 	if !as.isAuthEnabled() {
 		return nil, ErrAuthNotEnabled
@@ -421,11 +470,7 @@ func (as *authStore) UserGet(r *pb.AuthUserGetRequest) (*pb.AuthUserGetResponse,
 	if user == nil {
 		return nil, ErrUserNotFound
 	}
-
-	for _, role := range user.Roles {
-		resp.Roles = append(resp.Roles, role)
-	}
-
+	resp.Roles = append(resp.Roles, user.Roles...)
 	return &resp, nil
 }

@@ -491,11 +536,7 @@ func (as *authStore) RoleGet(r *pb.AuthRoleGetRequest) (*pb.AuthRoleGetResponse,
 	if role == nil {
 		return nil, ErrRoleNotFound
 	}
-
-	for _, perm := range role.KeyPermission {
-		resp.Perm = append(resp.Perm, perm)
-	}
-
+	resp.Perm = append(resp.Perm, role.KeyPermission...)
 	return &resp, nil
 }

@@ -605,10 +646,14 @@ func (as *authStore) RoleAdd(r *pb.AuthRoleAddRequest) (*pb.AuthRoleAddResponse,
 }

 func (as *authStore) AuthInfoFromToken(token string) (*AuthInfo, bool) {
-	as.simpleTokensMu.RLock()
-	defer as.simpleTokensMu.RUnlock()
-	t, ok := as.simpleTokens[token]
-	return &AuthInfo{Username: t, Revision: as.revision}, ok
+	// same as '(t *tokenSimple) info' in v3.2+
+	as.simpleTokensMu.Lock()
+	username, ok := as.simpleTokens[token]
+	if ok && as.simpleTokenKeeper != nil {
+		as.simpleTokenKeeper.resetSimpleToken(token)
+	}
+	as.simpleTokensMu.Unlock()
+	return &AuthInfo{Username: username, Revision: as.revision}, ok
 }

 type permSlice []*authpb.Permission
@@ -673,6 +718,11 @@ func (as *authStore) isOpPermitted(userName string, revision uint64, key, rangeE
 		return nil
 	}

+	// only gets rev == 0 when passed AuthInfo{}; no user given
+	if revision == 0 {
+		return ErrUserEmpty
+	}
+
 	if revision < as.revision {
 		return ErrAuthOldRevision
 	}
@@ -715,6 +765,9 @@ func (as *authStore) IsAdminPermitted(authInfo *AuthInfo) error {
 	if !as.isAuthEnabled() {
 		return nil
 	}
+	if authInfo == nil {
+		return ErrUserEmpty
+	}

 	tx := as.be.BatchTx()
 	tx.Lock()
@@ -833,7 +886,7 @@ func (as *authStore) isAuthEnabled() bool {
 	return as.enabled
 }

-func NewAuthStore(be backend.Backend) *authStore {
+func NewAuthStore(be backend.Backend, indexWaiter func(uint64) <-chan struct{}) *authStore {
 	tx := be.BatchTx()
 	tx.Lock()

@@ -841,13 +894,30 @@ func NewAuthStore(be backend.Backend) *authStore {
 	tx.UnsafeCreateBucket(authUsersBucketName)
 	tx.UnsafeCreateBucket(authRolesBucketName)

-	as := &authStore{
-		be:           be,
-		simpleTokens: make(map[string]string),
-		revision:     0,
+	enabled := false
+	_, vs := tx.UnsafeRange(authBucketName, enableFlagKey, nil, 0)
+	if len(vs) == 1 {
+		if bytes.Equal(vs[0], authEnabled) {
+			enabled = true
+		}
 	}

-	as.commitRevision(tx)
+	as := &authStore{
+		be:             be,
+		simpleTokens:   make(map[string]string),
+		revision:       getRevision(tx),
+		indexWaiter:    indexWaiter,
+		enabled:        enabled,
+		rangePermCache: make(map[string]*unifiedRangePermissions),
+	}
+
+	if enabled {
+		as.enable()
+	}
+
+	if as.revision == 0 {
+		as.commitRevision(tx)
+	}

 	tx.Unlock()
 	be.ForceCommit()
@@ -874,7 +944,8 @@ func (as *authStore) commitRevision(tx backend.BatchTx) {
 func getRevision(tx backend.BatchTx) uint64 {
 	_, vs := tx.UnsafeRange(authBucketName, []byte(revisionKey), nil, 0)
 	if len(vs) != 1 {
-		plog.Panicf("failed to get the key of auth store revision")
+		// this can happen in the initialization phase
+		return 0
 	}

 	return binary.BigEndian.Uint64(vs[0])
@@ -883,3 +954,46 @@ func getRevision(tx backend.BatchTx) uint64 {
 func (as *authStore) Revision() uint64 {
 	return as.revision
 }
+
+func (as *authStore) isValidSimpleToken(token string, ctx context.Context) bool {
+	splitted := strings.Split(token, ".") // a valid token has exactly two "."-separated parts
+	if len(splitted) != 2 {
+		return false
+	}
+	index, err := strconv.ParseUint(splitted[1], 10, 64) // unsigned 64-bit parse: negative values are rejected instead of wrapping
+	if err != nil {
+		return false
+	}
+
+	select {
+	case <-as.indexWaiter(index): // channel obtained from indexWaiter for the index carried in the token
+		return true
+	case <-ctx.Done(): // context ended before the waiter channel delivered
+	}
+
+	return false
+}
+
+func (as *authStore) AuthInfoFromCtx(ctx context.Context) (*AuthInfo, error) {
+	md, ok := metadata.FromContext(ctx)
+	if !ok { // no metadata on the context: no auth info and no error
+		return nil, nil
+	}
+
+	ts, tok := md["token"]
+	if !tok || len(ts) == 0 { // key missing or present with no values: treat both as "no token" rather than panic on ts[0]
+		return nil, nil
+	}
+
+	token := ts[0]
+	if !as.isValidSimpleToken(token, ctx) {
+		return nil, ErrInvalidAuthToken
+	}
+
+	authInfo, uok := as.AuthInfoFromToken(token)
+	if !uok {
+		plog.Warningf("invalid auth token: %s", token)
+		return nil, ErrInvalidAuthToken
+	}
+	return authInfo, nil
+}
--- a/auth/store_test.go
+++ b/auth/store_test.go
@@ -26,31 +26,38 @@ import (

 func init() { BcryptCost = bcrypt.MinCost }

-func TestUserAdd(t *testing.T) {
-	b, tPath := backend.NewDefaultTmpBackend()
-	defer func() {
-		b.Close()
-		os.Remove(tPath)
+func dummyIndexWaiter(index uint64) <-chan struct{} {
+	ch := make(chan struct{})
+	go func() {
+		ch <- struct{}{}
 	}()
+	return ch
+}

-	as := NewAuthStore(b)
-	ua := &pb.AuthUserAddRequest{Name: "foo"}
-	_, err := as.UserAdd(ua) // add a non-existing user
+// TestNewAuthStoreRevision ensures a newly created auth store
+// keeps the old revision when there are no changes.
+func TestNewAuthStoreRevision(t *testing.T) {
+	b, tPath := backend.NewDefaultTmpBackend()
+	defer os.Remove(tPath)
+
+	as := NewAuthStore(b, dummyIndexWaiter)
+	err := enableAuthAndCreateRoot(as)
 	if err != nil {
 		t.Fatal(err)
 	}
-	_, err = as.UserAdd(ua) // add an existing user
-	if err == nil {
-		t.Fatalf("expected %v, got %v", ErrUserAlreadyExist, err)
-	}
-	if err != ErrUserAlreadyExist {
-		t.Fatalf("expected %v, got %v", ErrUserAlreadyExist, err)
-	}
+	old := as.Revision()
+	b.Close()
+	as.Close()

-	ua = &pb.AuthUserAddRequest{Name: ""}
-	_, err = as.UserAdd(ua) // add a user with empty name
-	if err != ErrUserEmpty {
-		t.Fatal(err)
+	// no changes to commit
+	b2 := backend.NewDefaultBackend(tPath)
+	as = NewAuthStore(b2, dummyIndexWaiter)
+	new := as.Revision()
+	b2.Close()
+	as.Close()
+
+	if old != new {
+		t.Fatalf("expected revision %d, got %d", old, new)
 	}
 }

@@ -80,7 +87,8 @@ func TestCheckPassword(t *testing.T) {
 		os.Remove(tPath)
 	}()

-	as := NewAuthStore(b)
+	as := NewAuthStore(b, dummyIndexWaiter)
+	defer as.Close()
 	err := enableAuthAndCreateRoot(as)
 	if err != nil {
 		t.Fatal(err)
@@ -124,7 +132,8 @@ func TestUserDelete(t *testing.T) {
 		os.Remove(tPath)
 	}()

-	as := NewAuthStore(b)
+	as := NewAuthStore(b, dummyIndexWaiter)
+	defer as.Close()
 	err := enableAuthAndCreateRoot(as)
 	if err != nil {
 		t.Fatal(err)
@@ -160,7 +169,8 @@ func TestUserChangePassword(t *testing.T) {
 		os.Remove(tPath)
 	}()

-	as := NewAuthStore(b)
+	as := NewAuthStore(b, dummyIndexWaiter)
+	defer as.Close()
 	err := enableAuthAndCreateRoot(as)
 	if err != nil {
 		t.Fatal(err)
@@ -205,7 +215,8 @@ func TestRoleAdd(t *testing.T) {
 		os.Remove(tPath)
 	}()

-	as := NewAuthStore(b)
+	as := NewAuthStore(b, dummyIndexWaiter)
+	defer as.Close()
 	err := enableAuthAndCreateRoot(as)
 	if err != nil {
 		t.Fatal(err)
@@ -225,7 +236,8 @@ func TestUserGrant(t *testing.T) {
 		os.Remove(tPath)
 	}()

-	as := NewAuthStore(b)
+	as := NewAuthStore(b, dummyIndexWaiter)
+	defer as.Close()
 	err := enableAuthAndCreateRoot(as)
 	if err != nil {
 		t.Fatal(err)
@@ -256,4 +268,93 @@ func TestUserGrant(t *testing.T) {
 	if err != ErrUserNotFound {
 		t.Fatalf("expected %v, got %v", ErrUserNotFound, err)
 	}
+
+	// non-admin user
+	err = as.IsAdminPermitted(&AuthInfo{Username: "foo", Revision: 1})
+	if err != ErrPermissionDenied {
+		t.Errorf("expected %v, got %v", ErrPermissionDenied, err)
+	}
+
+	// disabled auth should return nil
+	as.AuthDisable()
+	err = as.IsAdminPermitted(&AuthInfo{Username: "root", Revision: 1})
+	if err != nil {
+		t.Errorf("expected nil, got %v", err)
+	}
+}
+
+func TestRecoverFromSnapshot(t *testing.T) {
+	as, _ := setupAuthStore(t)
+
+	ua := &pb.AuthUserAddRequest{Name: "foo"}
+	_, err := as.UserAdd(ua) // add an existing user
+	if err == nil {
+		t.Fatalf("expected %v, got %v", ErrUserAlreadyExist, err)
+	}
+	if err != ErrUserAlreadyExist {
+		t.Fatalf("expected %v, got %v", ErrUserAlreadyExist, err)
+	}
+
+	ua = &pb.AuthUserAddRequest{Name: ""}
+	_, err = as.UserAdd(ua) // add a user with empty name
+	if err != ErrUserEmpty {
+		t.Fatal(err)
+	}
+
+	as.Close()
+
+	as2 := NewAuthStore(as.be, dummyIndexWaiter) // second store built on the same backend
+	defer func(a *authStore) {
+		a.Close()
+	}(as2)
+
+	if !as2.isAuthEnabled() {
+		t.Fatal("recovering authStore from existing backend failed")
+	}
+
+	ul, err := as2.UserList(&pb.AuthUserListRequest{}) // query the recovered store, not the original one
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !contains(ul.Users, "root") {
+		t.Errorf("expected %v in %v", "root", ul.Users)
+	}
+}
+
+func contains(array []string, str string) bool {
+	for _, s := range array {
+		if s == str {
+			return true
+		}
+	}
+	return false
+}
+
+func setupAuthStore(t *testing.T) (store *authStore, teardownfunc func(t *testing.T)) {
+	b, tPath := backend.NewDefaultTmpBackend()
+
+	as := NewAuthStore(b, dummyIndexWaiter)
+	err := enableAuthAndCreateRoot(as)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// adds a new role
+	_, err = as.RoleAdd(&pb.AuthRoleAddRequest{Name: "role-test"})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	ua := &pb.AuthUserAddRequest{Name: "foo", Password: "bar"}
+	_, err = as.UserAdd(ua) // add a non-existing user
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tearDown := func(t *testing.T) {
+		b.Close()
+		os.Remove(tPath)
+		as.Close()
+	}
+	return as, tearDown
 }
--- a/2
+++ b/2
@@ -48,7 +48,7 @@ etcd_setup_gopath() {
 		GOPATH=":$GOPATH"
 	fi
 	export GOPATH=${etcdGOPATH}$GOPATH
-	rm -f ${etcdGOPATH}/src
+	rm -rf ${etcdGOPATH}/src
 	mkdir -p ${etcdGOPATH}
 	ln -s ${CDIR}/cmd/vendor ${etcdGOPATH}/src
 }
--- a/client/README.md
+++ b/client/README.md
@@ -114,4 +114,4 @@ if err != nil {

 3. Default etcd/client cannot handle the case that the remote server is SIGSTOPed now. TCP keepalive mechanism doesn't help in this scenario because operating system may still send TCP keep-alive packets. Over time we'd like to improve this functionality, but solving this issue isn't high priority because a real-life case in which a server is stopped, but the connection is kept alive, hasn't been brought to our attention.

-4. etcd/client cannot detect whether the member in use is healthy when doing read requests. If the member is isolated from the cluster, etcd/client may retrieve outdated data. As a workaround, users could monitor experimental /health endpoint for member healthy information. We are improving it at [#3265](https://github.com/coreos/etcd/issues/3265).
+4. etcd/client cannot detect whether a member is healthy with watches and non-quorum read requests. If the member is isolated from the cluster, etcd/client may retrieve outdated data. Instead, users can either issue quorum read requests or monitor the /health endpoint for member health information.
--- a/client/integration/client_test.go
+++ b/client/integration/client_test.go
@@ -34,7 +34,7 @@ import (
 func TestV2NoRetryEOF(t *testing.T) {
 	defer testutil.AfterTest(t)
 	// generate an EOF response; specify address so appears first in sorted ep list
-	lEOF := integration.NewListenerWithAddr(t, fmt.Sprintf("eof:123.%d.sock", os.Getpid()))
+	lEOF := integration.NewListenerWithAddr(t, fmt.Sprintf("127.0.0.1:%05d", os.Getpid()))
 	defer lEOF.Close()
 	tries := uint32(0)
 	go func() {
@@ -65,8 +65,7 @@ func TestV2NoRetryEOF(t *testing.T) {
 // TestV2NoRetryNoLeader tests destructive api calls won't retry if given an error code.
 func TestV2NoRetryNoLeader(t *testing.T) {
 	defer testutil.AfterTest(t)
-
-	lHttp := integration.NewListenerWithAddr(t, fmt.Sprintf("errHttp:123.%d.sock", os.Getpid()))
+	lHttp := integration.NewListenerWithAddr(t, fmt.Sprintf("127.0.0.1:%05d", os.Getpid()))
 	eh := &errHandler{errCode: http.StatusServiceUnavailable}
 	srv := httptest.NewUnstartedServer(eh)
 	defer lHttp.Close()
--- a/client/keys.generated.go
+++ b/client/keys.generated.go
--- a/clientv3/balancer_test.go
+++ b/clientv3/balancer_test.go
@@ -42,14 +42,14 @@ func TestBalancerGetUnblocking(t *testing.T) {
 	if err != nil {
 		t.Errorf("Get() with up endpoints should success, got %v", err)
 	}
-	if addrFirst.Addr != endpoints[1] && addrFirst.Addr != endpoints[2] {
+	if addrFirst.Addr != endpoints[1] {
 		t.Errorf("Get() didn't return expected address, got %v", addrFirst)
 	}
 	if putFun == nil {
 		t.Errorf("Get() returned unexpected nil put function")
 	}
 	addrSecond, _, _ := sb.Get(context.Background(), unblockingOpts)
-	if addrSecond.Addr != addrSecond.Addr {
+	if addrFirst.Addr != addrSecond.Addr {
 		t.Errorf("Get() didn't return the same address as previous call, got %v and %v", addrFirst, addrSecond)
 	}

@@ -92,7 +92,7 @@ func TestBalancerGetBlocking(t *testing.T) {

 	down2 := sb.Up(grpc.Address{Addr: endpoints[2]})
 	addrSecond, _, _ := sb.Get(context.Background(), blockingOpts)
-	if addrSecond.Addr != addrSecond.Addr {
+	if addrFirst.Addr != addrSecond.Addr {
 		t.Errorf("Get() didn't return the same address as previous call, got %v and %v", addrFirst, addrSecond)
 	}

--- a/clientv3/client.go
+++ b/clientv3/client.go
@@ -21,6 +21,7 @@ import (
 	"net"
 	"net/url"
 	"strings"
+	"sync"
 	"time"

 	"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
@@ -46,11 +47,12 @@ type Client struct {
 	Auth
 	Maintenance

-	conn         *grpc.ClientConn
-	cfg          Config
-	creds        *credentials.TransportCredentials
-	balancer     *simpleBalancer
-	retryWrapper retryRpcFunc
+	conn             *grpc.ClientConn
+	cfg              Config
+	creds            *credentials.TransportCredentials
+	balancer         *simpleBalancer
+	retryWrapper     retryRpcFunc
+	retryAuthWrapper retryRpcFunc

 	ctx    context.Context
 	cancel context.CancelFunc
@@ -59,6 +61,8 @@ type Client struct {
 	Username string
 	// Password is a password for authentication
 	Password string
+	// tokenCred is an instance of WithPerRPCCredentials()'s argument
+	tokenCred *authTokenCredential
 }

 // New creates a new etcdv3 client from a given configuration.
@@ -88,6 +92,7 @@ func NewFromConfigFile(path string) (*Client, error) {
 func (c *Client) Close() error {
 	c.cancel()
 	c.Watcher.Close()
+	c.Lease.Close()
 	return toErr(c.ctx, c.conn.Close())
 }

@@ -97,7 +102,12 @@ func (c *Client) Close() error {
 func (c *Client) Ctx() context.Context { return c.ctx }

 // Endpoints lists the registered endpoints for the client.
-func (c *Client) Endpoints() []string { return c.cfg.Endpoints }
+func (c *Client) Endpoints() (eps []string) {
+	// copy the slice; protect original endpoints from being changed
+	eps = make([]string, len(c.cfg.Endpoints))
+	copy(eps, c.cfg.Endpoints)
+	return
+}

 // SetEndpoints updates client's endpoints.
 func (c *Client) SetEndpoints(eps ...string) {
@@ -138,7 +148,8 @@ func (c *Client) autoSync() {
 }

 type authTokenCredential struct {
-	token string
+	token   string
+	tokenMu *sync.RWMutex
 }

 func (cred authTokenCredential) RequireTransportSecurity() bool {
@@ -146,6 +157,8 @@ func (cred authTokenCredential) RequireTransportSecurity() bool {
 }

 func (cred authTokenCredential) GetRequestMetadata(ctx context.Context, s ...string) (map[string]string, error) {
+	cred.tokenMu.RLock()
+	defer cred.tokenMu.RUnlock()
 	return map[string]string{
 		"token": cred.token,
 	}, nil
@@ -208,7 +221,8 @@ func (c *Client) dialSetupOpts(endpoint string, dopts ...grpc.DialOption) (opts
 			return nil, c.ctx.Err()
 		default:
 		}
-		return net.DialTimeout(proto, host, t)
+		dialer := &net.Dialer{Timeout: t}
+		return dialer.DialContext(c.ctx, proto, host)
 	}
 	opts = append(opts, grpc.WithDialer(f))

@@ -230,22 +244,58 @@ func (c *Client) Dial(endpoint string) (*grpc.ClientConn, error) {
 	return c.dial(endpoint)
 }

+// getToken authenticates with c.Username/c.Password against each configured
+// endpoint in order and stores the first token obtained in c.tokenCred. It
+// returns nil on success, otherwise the error from the last endpoint tried.
+func (c *Client) getToken(ctx context.Context) error {
+	var err error // last error seen; returned if every endpoint fails
+	var auth *authenticator
+	for i := 0; i < len(c.cfg.Endpoints); i++ {
+		endpoint := c.cfg.Endpoints[i]
+		host := getHost(endpoint)
+		// use dial options without dopts to avoid reusing the client balancer
+		auth, err = newAuthenticator(host, c.dialSetupOpts(endpoint))
+		if err != nil {
+			continue
+		}
+		var resp *AuthenticateResponse
+		resp, err = auth.authenticate(ctx, c.Username, c.Password)
+		// close now rather than defer: a defer inside the loop would keep every
+		// failed endpoint's authenticator open until getToken returns
+		auth.close()
+		if err != nil {
+			continue
+		}
+		c.tokenCred.tokenMu.Lock()
+		c.tokenCred.token = resp.Token
+		c.tokenCred.tokenMu.Unlock()
+		return nil
+	}
+	return err
+}
+
 func (c *Client) dial(endpoint string, dopts ...grpc.DialOption) (*grpc.ClientConn, error) {
 	opts := c.dialSetupOpts(endpoint, dopts...)
 	host := getHost(endpoint)
 	if c.Username != "" && c.Password != "" {
-		// use dial options without dopts to avoid reusing the client balancer
-		auth, err := newAuthenticator(host, c.dialSetupOpts(endpoint))
-		if err != nil {
-			return nil, err
+		c.tokenCred = &authTokenCredential{
+			tokenMu: &sync.RWMutex{},
 		}
-		defer auth.close()

-		resp, err := auth.authenticate(c.ctx, c.Username, c.Password)
-		if err != nil {
+		ctx := c.ctx
+		if c.cfg.DialTimeout > 0 {
+			cctx, cancel := context.WithTimeout(ctx, c.cfg.DialTimeout)
+			defer cancel()
+			ctx = cctx
+		}
+		if err := c.getToken(ctx); err != nil {
+			if err == ctx.Err() && ctx.Err() != c.ctx.Err() {
+				err = grpc.ErrClientConnTimeout
+			}
 			return nil, err
 		}
-		opts = append(opts, grpc.WithPerRPCCredentials(authTokenCredential{token: resp.Token}))
+
+		opts = append(opts, grpc.WithPerRPCCredentials(c.tokenCred))
 	}

 	// add metrics options
@@ -293,10 +343,13 @@ func newClient(cfg *Config) (*Client, error) {
 	client.balancer = newSimpleBalancer(cfg.Endpoints)
 	conn, err := client.dial(cfg.Endpoints[0], grpc.WithBalancer(client.balancer))
 	if err != nil {
+		client.cancel()
+		client.balancer.Close()
 		return nil, err
 	}
 	client.conn = conn
 	client.retryWrapper = client.newRetryWrapper()
+	client.retryAuthWrapper = client.newAuthRetryWrapper()

 	// wait for a connection
 	if cfg.DialTimeout > 0 {
@@ -310,6 +363,7 @@ func newClient(cfg *Config) (*Client, error) {
 		}
 		if !hasConn {
 			client.cancel()
+			client.balancer.Close()
 			conn.Close()
 			return nil, grpc.ErrClientConnTimeout
 		}
--- a/clientv3/client_test.go
+++ b/clientv3/client_test.go
@@ -16,6 +16,7 @@ package clientv3

 import (
 	"fmt"
+	"net"
 	"testing"
 	"time"

@@ -25,36 +26,89 @@ import (
 	"google.golang.org/grpc"
 )

-func TestDialTimeout(t *testing.T) {
+func TestDialCancel(t *testing.T) {
 	defer testutil.AfterTest(t)

-	donec := make(chan error)
-	go func() {
-		// without timeout, grpc keeps redialing if connection refused
-		cfg := Config{
-			Endpoints:   []string{"localhost:12345"},
-			DialTimeout: 2 * time.Second}
-		c, err := New(cfg)
-		if c != nil || err == nil {
-			t.Errorf("new client should fail")
-		}
-		donec <- err
-	}()
-
-	time.Sleep(10 * time.Millisecond)
-
-	select {
-	case err := <-donec:
-		t.Errorf("dial didn't wait (%v)", err)
-	default:
+	// accept first connection so client is created with dial timeout
+	ln, err := net.Listen("unix", "dialcancel:12345")
+	if err != nil {
+		t.Fatal(err)
 	}
+	defer ln.Close()
+
+	ep := "unix://dialcancel:12345"
+	cfg := Config{
+		Endpoints:   []string{ep},
+		DialTimeout: 30 * time.Second}
+	c, err := New(cfg)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// connect to ipv4 blackhole so dial blocks
+	c.SetEndpoints("http://254.0.0.1:12345")
+
+	// issue Get to force redial attempts
+	go c.Get(context.TODO(), "abc")
+
+	// wait a little bit so client close is after dial starts
+	time.Sleep(100 * time.Millisecond)
+
+	donec := make(chan struct{})
+	go func() {
+		defer close(donec)
+		c.Close()
+	}()

 	select {
 	case <-time.After(5 * time.Second):
-		t.Errorf("failed to timeout dial on time")
-	case err := <-donec:
-		if err != grpc.ErrClientConnTimeout {
-			t.Errorf("unexpected error %v, want %v", err, grpc.ErrClientConnTimeout)
+		t.Fatalf("failed to close")
+	case <-donec:
+	}
+}
+
+// TestDialTimeout ensures New fails with grpc.ErrClientConnTimeout when dialing an ipv4 blackhole.
+func TestDialTimeout(t *testing.T) {
+	defer testutil.AfterTest(t)
+	testCfgs := []Config{
+		{
+			Endpoints:   []string{"http://254.0.0.1:12345"},
+			DialTimeout: 2 * time.Second,
+		},
+		{
+			Endpoints:   []string{"http://254.0.0.1:12345"},
+			DialTimeout: time.Second,
+			Username:    "abc",
+			Password:    "def",
+		},
+	}
+
+	for i, cfg := range testCfgs {
+		// buffered so the goroutine can finish even if the test stops waiting for it
+		donec := make(chan error, 1)
+		// pass i and cfg as arguments; the loop variables are reused across iterations
+		go func(i int, cfg Config) {
+			// without timeout, dial continues forever on ipv4 blackhole
+			c, err := New(cfg)
+			if c != nil || err == nil {
+				t.Errorf("#%d: new client should fail", i)
+			}
+			donec <- err
+		}(i, cfg)
+		time.Sleep(10 * time.Millisecond)
+		select {
+		case err := <-donec:
+			t.Errorf("#%d: dial didn't wait (%v)", i, err)
+		default:
+		}
+
+		select {
+		case <-time.After(5 * time.Second):
+			t.Errorf("#%d: failed to timeout dial on time", i)
+		case err := <-donec:
+			if err != grpc.ErrClientConnTimeout {
+				t.Errorf("#%d: unexpected error %v, want %v", i, err, grpc.ErrClientConnTimeout)
+			}
+		}
 	}
 }
--- a/clientv3/concurrency/stm.go
+++ b/clientv3/concurrency/stm.go
@@ -249,11 +249,10 @@ func (s *stmReadCommitted) commit() *v3.TxnResponse {
 }

 func isKeyCurrent(k string, r *v3.GetResponse) v3.Cmp {
-	rev := r.Header.Revision + 1
 	if len(r.Kvs) != 0 {
-		rev = r.Kvs[0].ModRevision + 1
+		return v3.Compare(v3.ModRevision(k), "=", r.Kvs[0].ModRevision)
 	}
-	return v3.Compare(v3.ModRevision(k), "<", rev)
+	return v3.Compare(v3.ModRevision(k), "=", 0)
 }

 func respToValue(resp *v3.GetResponse) string {
--- a/clientv3/integration/lease_test.go
+++ b/clientv3/integration/lease_test.go
@@ -17,10 +17,12 @@ package integration
 import (
 	"reflect"
 	"sort"
+	"sync"
 	"testing"
 	"time"

 	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/clientv3/concurrency"
 	"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
 	"github.com/coreos/etcd/integration"
 	"github.com/coreos/etcd/pkg/testutil"
@@ -154,6 +156,30 @@ func TestLeaseKeepAlive(t *testing.T) {
 	}
 }

+// TestLeaseKeepAliveOneSecond ensures a lease granted with a one second TTL
+// keeps receiving keepalive responses.
+func TestLeaseKeepAliveOneSecond(t *testing.T) {
+	defer testutil.AfterTest(t)
+	clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1})
+	defer clus.Terminate(t)
+
+	cli := clus.Client(0)
+	resp, err := cli.Grant(context.Background(), 1)
+	if err != nil {
+		// resp may be nil on error; stop here instead of dereferencing it below
+		t.Fatalf("failed to create lease %v", err)
+	}
+	rc, kerr := cli.KeepAlive(context.Background(), resp.ID)
+	if kerr != nil {
+		t.Fatalf("failed to keepalive lease %v", kerr)
+	}
+	for i := 0; i < 3; i++ {
+		if _, ok := <-rc; !ok {
+			t.Errorf("chan is closed, want not closed")
+		}
+	}
+}
+
 // TODO: add a client that can connect to all the members of cluster via unix sock.
 // TODO: test handle more complicated failures.
 func TestLeaseKeepAliveHandleFailure(t *testing.T) {
@@ -510,3 +536,121 @@ func TestLeaseTimeToLive(t *testing.T) {
 		t.Fatalf("unexpected keys %+v", lresp.Keys)
 	}
 }
+
+// TestLeaseRenewLostQuorum ensures keepalives work after losing quorum
+// for a while.
+func TestLeaseRenewLostQuorum(t *testing.T) {
+	defer testutil.AfterTest(t)
+
+	clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 3})
+	defer clus.Terminate(t)
+
+	cli := clus.Client(0)
+	r, err := cli.Grant(context.TODO(), 4)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	kctx, kcancel := context.WithCancel(context.Background())
+	defer kcancel()
+	ka, err := cli.KeepAlive(kctx, r.ID)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// consume first keepalive so next message sends when cluster is down
+	<-ka
+
+	// force keepalive stream message to timeout
+	clus.Members[1].Stop(t)
+	clus.Members[2].Stop(t)
+	// Use TTL-1 since the client closes the keepalive channel if no
+	// keepalive arrives before the lease deadline.
+	// The cluster has 1 second to recover and reply to the keepalive.
+	time.Sleep(time.Duration(r.TTL-1) * time.Second)
+	clus.Members[1].Restart(t)
+	clus.Members[2].Restart(t)
+
+	select {
+	case _, ok := <-ka:
+		if !ok {
+			t.Fatalf("keepalive closed")
+		}
+	case <-time.After(time.Duration(r.TTL) * time.Second):
+		t.Fatalf("timed out waiting for keepalive")
+	}
+}
+
+// TestLeaseKeepAliveLoopExit ensures KeepAlive returns ErrKeepAliveHalted
+// once the client's Lease has been closed.
+func TestLeaseKeepAliveLoopExit(t *testing.T) {
+	defer testutil.AfterTest(t)
+	clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1})
+	defer clus.Terminate(t)
+
+	ctx := context.Background()
+	cli := clus.Client(0)
+	resp, err := cli.Grant(ctx, 5)
+	if err != nil {
+		t.Fatal(err)
+	}
+	cli.Lease.Close()
+
+	_, err = cli.KeepAlive(ctx, resp.ID)
+	if _, ok := err.(clientv3.ErrKeepAliveHalted); !ok {
+		t.Fatalf("expected %T, got %v(%T)", clientv3.ErrKeepAliveHalted{}, err, err)
+	}
+}
+
+// TestV3LeaseFailureOverlap issues Grant and Keepalive requests to a cluster
+// before, during, and after quorum loss to confirm Grant/Keepalive tolerates
+// transient cluster failure.
+func TestV3LeaseFailureOverlap(t *testing.T) {
+	clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 2})
+	defer clus.Terminate(t)
+	numReqs := 5
+	cli := clus.Client(0)
+
+	// bring up a session, tear it down. updown runs on spawned goroutines, so it
+	// reports failure with Errorf; Fatal-style calls must stay on the test goroutine.
+	updown := func(i int) error {
+		sess, err := concurrency.NewSession(cli)
+		if err != nil {
+			return err
+		}
+		ch := make(chan struct{})
+		go func() {
+			defer close(ch)
+			sess.Close()
+		}()
+		select {
+		case <-ch:
+		case <-time.After(time.Minute / 4):
+			t.Errorf("timeout %d", i)
+		}
+		return nil
+	}
+
+	var wg sync.WaitGroup
+	mkReqs := func(n int) {
+		wg.Add(numReqs)
+		for i := 0; i < numReqs; i++ {
+			go func() {
+				defer wg.Done()
+				err := updown(n)
+				if err == nil || err == rpctypes.ErrTimeoutDueToConnectionLost {
+					return
+				}
+				t.Error(err)
+			}()
+		}
+	}
+
+	mkReqs(1)
+	clus.Members[1].Stop(t)
+	mkReqs(2)
+	time.Sleep(time.Second)
+	mkReqs(3)
+	clus.Members[1].Restart(t)
+	mkReqs(4)
+	wg.Wait()
+}
--- a/clientv3/integration/watch_test.go
+++ b/clientv3/integration/watch_test.go
@@ -347,7 +347,57 @@ func putAndWatch(t *testing.T, wctx *watchctx, key, val string) {
 	}
 }

-// TestWatchResumeComapcted checks that the watcher gracefully closes in case
+// TestWatchResumeInitRev ensures a watch created at an explicit revision still
+// delivers the earliest matching event (a=3) after its connection is dropped and resumed.
+func TestWatchResumeInitRev(t *testing.T) {
+	defer testutil.AfterTest(t)
+	clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1})
+	defer clus.Terminate(t)
+	cli := clus.Client(0)
+	if _, err := cli.Put(context.TODO(), "b", "2"); err != nil {
+		t.Fatal(err)
+	}
+	if _, err := cli.Put(context.TODO(), "a", "3"); err != nil {
+		t.Fatal(err)
+	}
+	// if resume is broken, it'll pick up this key first instead of a=3
+	if _, err := cli.Put(context.TODO(), "a", "4"); err != nil {
+		t.Fatal(err)
+	}
+
+	wch := clus.Client(0).Watch(context.Background(), "a", clientv3.WithRev(1), clientv3.WithCreatedNotify())
+	if resp, ok := <-wch; !ok || resp.Header.Revision != 4 {
+		t.Fatalf("got (%v, %v), expected create notification rev=4", resp, ok)
+	}
+	// pause wch
+	clus.Members[0].DropConnections()
+	clus.Members[0].PauseConnections()
+
+	select {
+	case resp, ok := <-wch:
+		t.Skipf("wch should block, got (%+v, %v); drop not fast enough", resp, ok)
+	case <-time.After(100 * time.Millisecond):
+	}
+
+	// resume wch
+	clus.Members[0].UnpauseConnections()
+	select {
+	case resp, ok := <-wch:
+		if !ok {
+			t.Fatal("unexpected watch close")
+		}
+		if len(resp.Events) == 0 {
+			t.Fatal("expected event on watch")
+		}
+		if string(resp.Events[0].Kv.Value) != "3" {
+			t.Fatalf("expected value=3, got event %+v", resp.Events[0])
+		}
+	case <-time.After(5 * time.Second):
+		t.Fatal("watch timed out")
+	}
+}
+
+// TestWatchResumeCompacted checks that the watcher gracefully closes in case
 // that it tries to resume to a revision that's been compacted out of the store.
 // Since the watcher's server restarts with stale data, the watcher will receive
 // either a compaction error or all keys by staying in sync before the compaction
--- a/clientv3/lease.go
+++ b/clientv3/lease.go
@@ -69,6 +69,21 @@ const (
 	NoLease LeaseID = 0
 )

+// ErrKeepAliveHalted is returned if client keep alive loop halts with an unexpected error.
+//
+// This usually means that automatic lease renewal via KeepAlive is broken, but KeepAliveOnce will still work as expected.
+type ErrKeepAliveHalted struct {
+	Reason error
+}
+
+func (e ErrKeepAliveHalted) Error() string {
+	s := "etcdclient: leases keep alive halted"
+	if e.Reason != nil {
+		s += ": " + e.Reason.Error()
+	}
+	return s
+}
+
 type Lease interface {
 	// Grant creates a new lease.
 	Grant(ctx context.Context, ttl int64) (*LeaseGrantResponse, error)
@@ -94,8 +109,9 @@ type Lease interface {
 type lessor struct {
 	mu sync.Mutex // guards all fields

-	// donec is closed when recvKeepAliveLoop stops
-	donec chan struct{}
+	// donec is closed and loopErr is set when recvKeepAliveLoop stops
+	donec   chan struct{}
+	loopErr error

 	remote pb.LeaseClient

@@ -161,9 +177,6 @@ func (l *lessor) Grant(ctx context.Context, ttl int64) (*LeaseGrantResponse, err
 		if isHaltErr(cctx, err) {
 			return nil, toErr(cctx, err)
 		}
-		if nerr := l.newStream(); nerr != nil {
-			return nil, nerr
-		}
 	}
 }

@@ -182,9 +195,6 @@ func (l *lessor) Revoke(ctx context.Context, id LeaseID) (*LeaseRevokeResponse,
 		if isHaltErr(ctx, err) {
 			return nil, toErr(ctx, err)
 		}
-		if nerr := l.newStream(); nerr != nil {
-			return nil, nerr
-		}
 	}
 }

@@ -216,6 +226,15 @@ func (l *lessor) KeepAlive(ctx context.Context, id LeaseID) (<-chan *LeaseKeepAl
 	ch := make(chan *LeaseKeepAliveResponse, leaseResponseChSize)

 	l.mu.Lock()
+	// ensure that recvKeepAliveLoop is still running
+	select {
+	case <-l.donec:
+		err := l.loopErr
+		l.mu.Unlock()
+		close(ch)
+		return ch, ErrKeepAliveHalted{Reason: err}
+	default:
+	}
 	ka, ok := l.keepAlives[id]
 	if !ok {
 		// create fresh keep alive
@@ -255,10 +274,6 @@ func (l *lessor) KeepAliveOnce(ctx context.Context, id LeaseID) (*LeaseKeepAlive
 		if isHaltErr(ctx, err) {
 			return nil, toErr(ctx, err)
 		}
-
-		if nerr := l.newStream(); nerr != nil {
-			return nil, nerr
-		}
 	}
 }

@@ -327,10 +342,11 @@ func (l *lessor) keepAliveOnce(ctx context.Context, id LeaseID) (*LeaseKeepAlive
 	return karesp, nil
 }

-func (l *lessor) recvKeepAliveLoop() {
+func (l *lessor) recvKeepAliveLoop() (gerr error) {
 	defer func() {
 		l.mu.Lock()
 		close(l.donec)
+		l.loopErr = gerr
 		for _, ka := range l.keepAlives {
 			ka.Close()
 		}
@@ -343,21 +359,35 @@ func (l *lessor) recvKeepAliveLoop() {
 		resp, err := stream.Recv()
 		if err != nil {
 			if isHaltErr(l.stopCtx, err) {
-				return
+				return err
 			}
 			stream, serr = l.resetRecv()
 			continue
 		}
 		l.recvKeepAlive(resp)
 	}
+	return serr
 }

 // resetRecv opens a new lease stream and starts sending LeaseKeepAliveRequests
 func (l *lessor) resetRecv() (pb.Lease_LeaseKeepAliveClient, error) {
-	if err := l.newStream(); err != nil {
+	sctx, cancel := context.WithCancel(l.stopCtx)
+	stream, err := l.remote.LeaseKeepAlive(sctx, grpc.FailFast(false))
+	if err = toErr(sctx, err); err != nil {
+		cancel()
 		return nil, err
 	}
-	stream := l.getKeepAliveStream()
+
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	if l.stream != nil && l.streamCancel != nil {
+		l.stream.CloseSend()
+		l.streamCancel()
+	}
+
+	l.streamCancel = cancel
+	l.stream = stream
+
 	go l.sendKeepAliveLoop(stream)
 	return stream, nil
 }
@@ -386,7 +416,7 @@ func (l *lessor) recvKeepAlive(resp *pb.LeaseKeepAliveResponse) {
 	}

 	// send update to all channels
-	nextKeepAlive := time.Now().Add(1 + time.Duration(karesp.TTL/3)*time.Second)
+	nextKeepAlive := time.Now().Add((time.Duration(karesp.TTL) * time.Second) / 3.0)
 	ka.deadline = time.Now().Add(time.Duration(karesp.TTL) * time.Second)
 	for _, ch := range ka.chs {
 		select {
@@ -453,32 +483,6 @@ func (l *lessor) sendKeepAliveLoop(stream pb.Lease_LeaseKeepAliveClient) {
 	}
 }

-func (l *lessor) getKeepAliveStream() pb.Lease_LeaseKeepAliveClient {
-	l.mu.Lock()
-	defer l.mu.Unlock()
-	return l.stream
-}
-
-func (l *lessor) newStream() error {
-	sctx, cancel := context.WithCancel(l.stopCtx)
-	stream, err := l.remote.LeaseKeepAlive(sctx, grpc.FailFast(false))
-	if err != nil {
-		cancel()
-		return toErr(sctx, err)
-	}
-
-	l.mu.Lock()
-	defer l.mu.Unlock()
-	if l.stream != nil && l.streamCancel != nil {
-		l.stream.CloseSend()
-		l.streamCancel()
-	}
-
-	l.streamCancel = cancel
-	l.stream = stream
-	return nil
-}
-
 func (ka *keepAlive) Close() {
 	close(ka.donec)
 	for _, ch := range ka.chs {
--- a/clientv3/retry.go
+++ b/clientv3/retry.go
@@ -33,13 +33,14 @@ func (c *Client) newRetryWrapper() retryRpcFunc {
 				return nil
 			}

-			// only retry if unavailable
-			if grpc.Code(err) != codes.Unavailable {
+			eErr := rpctypes.Error(err)
+			// always stop retry on etcd errors
+			if _, ok := eErr.(rpctypes.EtcdError); ok {
 				return err
 			}
-			// always stop retry on etcd errors
-			eErr := rpctypes.Error(err)
-			if _, ok := eErr.(rpctypes.EtcdError); ok {
+
+			// only retry if unavailable
+			if grpc.Code(err) != codes.Unavailable {
 				return err
 			}

@@ -54,17 +55,52 @@ func (c *Client) newRetryWrapper() retryRpcFunc {
 	}
 }

-type retryKVClient struct {
-	pb.KVClient
-	retryf retryRpcFunc
+func (c *Client) newAuthRetryWrapper() retryRpcFunc {
+	return func(rpcCtx context.Context, f rpcFunc) error {
+		for {
+			err := f(rpcCtx)
+			if err == nil {
+				return nil
+			}
+
+			// always stop retry on etcd errors other than invalid auth token
+			if rpctypes.Error(err) == rpctypes.ErrInvalidAuthToken {
+				gterr := c.getToken(rpcCtx)
+				if gterr != nil {
+					return err // return the original error for simplicity
+				}
+				continue
+			}
+
+			return err
+		}
+	}
 }

 // RetryKVClient implements a KVClient that uses the client's FailFast retry policy.
 func RetryKVClient(c *Client) pb.KVClient {
-	return &retryKVClient{pb.NewKVClient(c.conn), c.retryWrapper}
+	retryWrite := &retryWriteKVClient{pb.NewKVClient(c.conn), c.retryWrapper}
+	return &retryKVClient{&retryWriteKVClient{retryWrite, c.retryAuthWrapper}}
 }

-func (rkv *retryKVClient) Put(ctx context.Context, in *pb.PutRequest, opts ...grpc.CallOption) (resp *pb.PutResponse, err error) {
+type retryKVClient struct {
+	*retryWriteKVClient
+}
+
+func (rkv *retryKVClient) Range(ctx context.Context, in *pb.RangeRequest, opts ...grpc.CallOption) (resp *pb.RangeResponse, err error) {
+	err = rkv.retryf(ctx, func(rctx context.Context) error {
+		resp, err = rkv.retryWriteKVClient.Range(rctx, in, opts...)
+		return err
+	})
+	return resp, err
+}
+
+type retryWriteKVClient struct {
+	pb.KVClient
+	retryf retryRpcFunc
+}
+
+func (rkv *retryWriteKVClient) Put(ctx context.Context, in *pb.PutRequest, opts ...grpc.CallOption) (resp *pb.PutResponse, err error) {
 	err = rkv.retryf(ctx, func(rctx context.Context) error {
 		resp, err = rkv.KVClient.Put(rctx, in, opts...)
 		return err
@@ -72,7 +108,7 @@ func (rkv *retryKVClient) Put(ctx context.Context, in *pb.PutRequest, opts ...gr
 	return resp, err
 }

-func (rkv *retryKVClient) DeleteRange(ctx context.Context, in *pb.DeleteRangeRequest, opts ...grpc.CallOption) (resp *pb.DeleteRangeResponse, err error) {
+func (rkv *retryWriteKVClient) DeleteRange(ctx context.Context, in *pb.DeleteRangeRequest, opts ...grpc.CallOption) (resp *pb.DeleteRangeResponse, err error) {
 	err = rkv.retryf(ctx, func(rctx context.Context) error {
 		resp, err = rkv.KVClient.DeleteRange(rctx, in, opts...)
 		return err
@@ -80,7 +116,7 @@ func (rkv *retryKVClient) DeleteRange(ctx context.Context, in *pb.DeleteRangeReq
 	return resp, err
 }

-func (rkv *retryKVClient) Txn(ctx context.Context, in *pb.TxnRequest, opts ...grpc.CallOption) (resp *pb.TxnResponse, err error) {
+func (rkv *retryWriteKVClient) Txn(ctx context.Context, in *pb.TxnRequest, opts ...grpc.CallOption) (resp *pb.TxnResponse, err error) {
 	err = rkv.retryf(ctx, func(rctx context.Context) error {
 		resp, err = rkv.KVClient.Txn(rctx, in, opts...)
 		return err
@@ -88,7 +124,7 @@ func (rkv *retryKVClient) Txn(ctx context.Context, in *pb.TxnRequest, opts ...gr
 	return resp, err
 }

-func (rkv *retryKVClient) Compact(ctx context.Context, in *pb.CompactionRequest, opts ...grpc.CallOption) (resp *pb.CompactionResponse, err error) {
+func (rkv *retryWriteKVClient) Compact(ctx context.Context, in *pb.CompactionRequest, opts ...grpc.CallOption) (resp *pb.CompactionResponse, err error) {
 	err = rkv.retryf(ctx, func(rctx context.Context) error {
 		resp, err = rkv.KVClient.Compact(rctx, in, opts...)
 		return err
@@ -103,7 +139,8 @@ type retryLeaseClient struct {

 // RetryLeaseClient implements a LeaseClient that uses the client's FailFast retry policy.
 func RetryLeaseClient(c *Client) pb.LeaseClient {
-	return &retryLeaseClient{pb.NewLeaseClient(c.conn), c.retryWrapper}
+	retry := &retryLeaseClient{pb.NewLeaseClient(c.conn), c.retryWrapper}
+	return &retryLeaseClient{retry, c.retryAuthWrapper}
 }

 func (rlc *retryLeaseClient) LeaseGrant(ctx context.Context, in *pb.LeaseGrantRequest, opts ...grpc.CallOption) (resp *pb.LeaseGrantResponse, err error) {
--- a/clientv3/watch.go
+++ b/clientv3/watch.go
@@ -132,6 +132,8 @@ type watchGrpcStream struct {
 	errc chan error
 	// closingc gets the watcherStream of closing watchers
 	closingc chan *watcherStream
+	// wg is Done when all substream goroutines have exited
+	wg sync.WaitGroup

 	// resumec closes to signal that all substreams should begin resuming
 	resumec chan struct{}
@@ -406,7 +408,7 @@ func (w *watchGrpcStream) run() {
 		for range closing {
 			w.closeSubstream(<-w.closingc)
 		}
-
+		w.wg.Wait()
 		w.owner.closeStream(w)
 	}()

@@ -431,6 +433,7 @@ func (w *watchGrpcStream) run() {
 			}

 			ws.donec = make(chan struct{})
+			w.wg.Add(1)
 			go w.serveSubstream(ws, w.resumec)

 			// queue up for watcher creation/resume
@@ -576,6 +579,7 @@ func (w *watchGrpcStream) serveSubstream(ws *watcherStream, resumec chan struct{
 		if !resuming {
 			w.closingc <- ws
 		}
+		w.wg.Done()
 	}()

 	emptyWr := &WatchResponse{}
@@ -612,10 +616,24 @@ func (w *watchGrpcStream) serveSubstream(ws *watcherStream, resumec chan struct{
 					if ws.initReq.createdNotify {
 						ws.outc <- *wr
 					}
+					// once the watch channel is returned, a current revision
+					// watch must resume at the store revision. This is necessary
+					// for the following case to work as expected:
+					//	wch := m1.Watch("a")
+					//	m2.Put("a", "b")
+					//	<-wch
+					// If the revision is only bound on the first observed event,
+					// if wch is disconnected before the Put is issued, then reconnects
+					// after it is committed, it'll miss the Put.
+					if ws.initReq.rev == 0 {
+						nextRev = wr.Header.Revision
+					}
 				}
+			} else {
+				// current progress of watch; <= store revision
+				nextRev = wr.Header.Revision
 			}

-			nextRev = wr.Header.Revision
 			if len(wr.Events) > 0 {
 				nextRev = wr.Events[len(wr.Events)-1].Kv.ModRevision + 1
 			}
@@ -674,6 +692,7 @@ func (w *watchGrpcStream) newWatchClient() (pb.Watch_WatchClient, error) {
 			continue
 		}
 		ws.donec = make(chan struct{})
+		w.wg.Add(1)
 		go w.serveSubstream(ws, w.resumec)
 	}

@@ -694,6 +713,10 @@ func (w *watchGrpcStream) waitCancelSubstreams(stopc <-chan struct{}) <-chan str
 		go func(ws *watcherStream) {
 			defer wg.Done()
 			if ws.closing {
+				if ws.initReq.ctx.Err() != nil && ws.outc != nil {
+					close(ws.outc)
+					ws.outc = nil
+				}
 				return
 			}
 			select {
--- a/cmd/vendor/github.com/beorn7/perks/LICENSE
+++ b/cmd/vendor/github.com/beorn7/perks/LICENSE
@@ -0,0 +1,20 @@
+Copyright (C) 2013 Blake Mizerany
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/cmd/vendor/github.com/beorn7/perks/quantile/stream.go
+++ b/cmd/vendor/github.com/beorn7/perks/quantile/stream.go
@@ -133,7 +133,7 @@ func (s *Stream) Query(q float64) float64 {
 		if l == 0 {
 			return 0
 		}
-		i := int(float64(l) * q)
+		i := int(math.Ceil(float64(l) * q))
 		if i > 0 {
 			i -= 1
 		}
--- a/cmd/vendor/github.com/boltdb/bolt/bolt_amd64.go
+++ b/cmd/vendor/github.com/boltdb/bolt/bolt_amd64.go
@@ -1,7 +0,0 @@
-package bolt
-
-// maxMapSize represents the largest mmap size supported by Bolt.
-const maxMapSize = 0xFFFFFFFFFFFF // 256TB
-
-// maxAllocSize is the size used when creating array pointers.
-const maxAllocSize = 0x7FFFFFFF
--- a/cmd/vendor/github.com/coreos/bbolt/LICENSE
+++ b/cmd/vendor/github.com/coreos/bbolt/LICENSE
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_386.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_386.go
@@ -5,3 +5,6 @@ const maxMapSize = 0x7FFFFFFF // 2GB

 // maxAllocSize is the size used when creating array pointers.
 const maxAllocSize = 0xFFFFFFF
+
+// Are unaligned load/stores broken on this arch?
+var brokenUnaligned = false
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_amd64.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_amd64.go
@@ -0,0 +1,10 @@
+package bolt
+
+// maxMapSize represents the largest mmap size supported by Bolt.
+const maxMapSize = 0xFFFFFFFFFFFF // 256TB
+
+// maxAllocSize is the size used when creating array pointers.
+const maxAllocSize = 0x7FFFFFFF
+
+// Are unaligned load/stores broken on this arch?
+var brokenUnaligned = false
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_arm.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_arm.go
@@ -0,0 +1,28 @@
+package bolt
+
+import "unsafe"
+
+// maxMapSize represents the largest mmap size supported by Bolt.
+const maxMapSize = 0x7FFFFFFF // 2GB
+
+// maxAllocSize is the size used when creating array pointers.
+const maxAllocSize = 0xFFFFFFF
+
+// Are unaligned load/stores broken on this arch?
+var brokenUnaligned bool
+
+func init() {
+	// Simple check to see whether this arch handles unaligned load/stores
+	// correctly.
+
+	// ARM9 and older devices require load/stores to be from/to aligned
+	// addresses. If not, the lower 2 bits are cleared and that address is
+	// read in a jumbled up order.
+
+	// See http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka15414.html
+
+	raw := [6]byte{0xfe, 0xef, 0x11, 0x22, 0x22, 0x11}
+	val := *(*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(&raw)) + 2))
+
+	brokenUnaligned = val != 0x11222211
+}
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_arm64.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_arm64.go
@@ -7,3 +7,6 @@ const maxMapSize = 0xFFFFFFFFFFFF // 256TB

 // maxAllocSize is the size used when creating array pointers.
 const maxAllocSize = 0x7FFFFFFF
+
+// Are unaligned load/stores broken on this arch?
+var brokenUnaligned = false
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_linux.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_linux.go
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_mips64x.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_mips64x.go
@@ -0,0 +1,12 @@
+// +build mips64 mips64le
+
+package bolt
+
+// maxMapSize represents the largest mmap size supported by Bolt.
+const maxMapSize = 0x8000000000 // 512GB
+
+// maxAllocSize is the size used when creating array pointers.
+const maxAllocSize = 0x7FFFFFFF
+
+// Are unaligned load/stores broken on this arch?
+var brokenUnaligned = false
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_mipsx.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_mipsx.go
@@ -1,7 +1,12 @@
+// +build mips mipsle
+
 package bolt

 // maxMapSize represents the largest mmap size supported by Bolt.
-const maxMapSize = 0x7FFFFFFF // 2GB
+const maxMapSize = 0x40000000 // 1GB

 // maxAllocSize is the size used when creating array pointers.
 const maxAllocSize = 0xFFFFFFF
+
+// Are unaligned load/stores broken on this arch?
+var brokenUnaligned = false
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_openbsd.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_openbsd.go
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_ppc.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_ppc.go
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_ppc64.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_ppc64.go
@@ -7,3 +7,6 @@ const maxMapSize = 0xFFFFFFFFFFFF // 256TB

 // maxAllocSize is the size used when creating array pointers.
 const maxAllocSize = 0x7FFFFFFF
+
+// Are unaligned load/stores broken on this arch?
+var brokenUnaligned = false
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_ppc64le.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_ppc64le.go
@@ -7,3 +7,6 @@ const maxMapSize = 0xFFFFFFFFFFFF // 256TB

 // maxAllocSize is the size used when creating array pointers.
 const maxAllocSize = 0x7FFFFFFF
+
+// Are unaligned load/stores broken on this arch?
+var brokenUnaligned = false
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_s390x.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_s390x.go
@@ -7,3 +7,6 @@ const maxMapSize = 0xFFFFFFFFFFFF // 256TB

 // maxAllocSize is the size used when creating array pointers.
 const maxAllocSize = 0x7FFFFFFF
+
+// Are unaligned load/stores broken on this arch?
+var brokenUnaligned = false
--- a/cmd/vendor/github.com/coreos/bbolt/bolt_unix.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_unix.go
@@ -13,29 +13,32 @@ import (
 // flock acquires an advisory lock on a file descriptor.
 func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error {
 	var t time.Time
+	if timeout != 0 {
+		t = time.Now()
+	}
+	fd := db.file.Fd()
+	flag := syscall.LOCK_NB
+	if exclusive {
+		flag |= syscall.LOCK_EX
+	} else {
+		flag |= syscall.LOCK_SH
+	}
 	for {
-		// If we're beyond our timeout then return an error.
-		// This can only occur after we've attempted a flock once.
-		if t.IsZero() {
-			t = time.Now()
-		} else if timeout > 0 && time.Since(t) > timeout {
-			return ErrTimeout
-		}
-		flag := syscall.LOCK_SH
-		if exclusive {
-			flag = syscall.LOCK_EX
-		}
-
-		// Otherwise attempt to obtain an exclusive lock.
-		err := syscall.Flock(int(db.file.Fd()), flag|syscall.LOCK_NB)
+		// Attempt to obtain the requested lock (exclusive or shared).
+		err := syscall.Flock(int(fd), flag)
 		if err == nil {
 			return nil
 		} else if err != syscall.EWOULDBLOCK {
 			return err
 		}

+		// If we timed out then return an error.
+		if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout {
+			return ErrTimeout
+		}
+
 		// Wait for a bit and try again.
-		time.Sleep(50 * time.Millisecond)
+		time.Sleep(flockRetryTimeout)
 	}
 }

--- a/cmd/vendor/github.com/coreos/bbolt/bolt_unix_solaris.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_unix_solaris.go
@@ -13,34 +13,33 @@ import (
 // flock acquires an advisory lock on a file descriptor.
 func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) error {
 	var t time.Time
+	if timeout != 0 {
+		t = time.Now()
+	}
+	fd := db.file.Fd()
+	var lockType int16
+	if exclusive {
+		lockType = syscall.F_WRLCK
+	} else {
+		lockType = syscall.F_RDLCK
+	}
 	for {
-		// If we're beyond our timeout then return an error.
-		// This can only occur after we've attempted a flock once.
-		if t.IsZero() {
-			t = time.Now()
-		} else if timeout > 0 && time.Since(t) > timeout {
-			return ErrTimeout
-		}
-		var lock syscall.Flock_t
-		lock.Start = 0
-		lock.Len = 0
-		lock.Pid = 0
-		lock.Whence = 0
-		lock.Pid = 0
-		if exclusive {
-			lock.Type = syscall.F_WRLCK
-		} else {
-			lock.Type = syscall.F_RDLCK
-		}
-		err := syscall.FcntlFlock(db.file.Fd(), syscall.F_SETLK, &lock)
+		// Attempt to obtain the requested lock (exclusive or shared).
+		lock := syscall.Flock_t{Type: lockType}
+		err := syscall.FcntlFlock(fd, syscall.F_SETLK, &lock)
 		if err == nil {
 			return nil
 		} else if err != syscall.EAGAIN {
 			return err
 		}

+		// If we timed out then return an error.
+		if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout {
+			return ErrTimeout
+		}
+
 		// Wait for a bit and try again.
-		time.Sleep(50 * time.Millisecond)
+		time.Sleep(flockRetryTimeout)
 	}
 }

--- a/cmd/vendor/github.com/coreos/bbolt/bolt_windows.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bolt_windows.go
@@ -59,29 +59,30 @@ func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) erro
 	db.lockfile = f

 	var t time.Time
+	if timeout != 0 {
+		t = time.Now()
+	}
+	fd := f.Fd()
+	var flag uint32 = flagLockFailImmediately
+	if exclusive {
+		flag |= flagLockExclusive
+	}
 	for {
-		// If we're beyond our timeout then return an error.
-		// This can only occur after we've attempted a flock once.
-		if t.IsZero() {
-			t = time.Now()
-		} else if timeout > 0 && time.Since(t) > timeout {
-			return ErrTimeout
-		}
-
-		var flag uint32 = flagLockFailImmediately
-		if exclusive {
-			flag |= flagLockExclusive
-		}
-
-		err := lockFileEx(syscall.Handle(db.lockfile.Fd()), flag, 0, 1, 0, &syscall.Overlapped{})
+		// Attempt to obtain the requested lock (exclusive or shared).
+		err := lockFileEx(syscall.Handle(fd), flag, 0, 1, 0, &syscall.Overlapped{})
 		if err == nil {
 			return nil
 		} else if err != errLockViolation {
 			return err
 		}

+		// If we timed out then return an error.
+		if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout {
+			return ErrTimeout
+		}
+
 		// Wait for a bit and try again.
-		time.Sleep(50 * time.Millisecond)
+		time.Sleep(flockRetryTimeout)
 	}
 }

@@ -89,7 +90,7 @@ func flock(db *DB, mode os.FileMode, exclusive bool, timeout time.Duration) erro
 func funlock(db *DB) error {
 	err := unlockFileEx(syscall.Handle(db.lockfile.Fd()), 0, 1, 0, &syscall.Overlapped{})
 	db.lockfile.Close()
-	os.Remove(db.path+lockExt)
+	os.Remove(db.path + lockExt)
 	return err
 }

--- a/cmd/vendor/github.com/coreos/bbolt/boltsync_unix.go
+++ b/cmd/vendor/github.com/coreos/bbolt/boltsync_unix.go
--- a/cmd/vendor/github.com/coreos/bbolt/bucket.go
+++ b/cmd/vendor/github.com/coreos/bbolt/bucket.go
@@ -14,13 +14,6 @@ const (
 	MaxValueSize = (1 << 31) - 2
 )

-const (
-	maxUint = ^uint(0)
-	minUint = 0
-	maxInt  = int(^uint(0) >> 1)
-	minInt  = -maxInt - 1
-)
-
 const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))

 const (
@@ -130,9 +123,17 @@ func (b *Bucket) Bucket(name []byte) *Bucket {
 func (b *Bucket) openBucket(value []byte) *Bucket {
 	var child = newBucket(b.tx)

+	// If unaligned load/stores are broken on this arch and value is
+	// unaligned simply clone to an aligned byte array.
+	unaligned := brokenUnaligned && uintptr(unsafe.Pointer(&value[0]))&3 != 0
+
+	if unaligned {
+		value = cloneBytes(value)
+	}
+
 	// If this is a writable transaction then we need to copy the bucket entry.
 	// Read-only transactions can point directly at the mmap entry.
-	if b.tx.writable {
+	if b.tx.writable && !unaligned {
 		child.bucket = &bucket{}
 		*child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
 	} else {
@@ -167,9 +168,8 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
 	if bytes.Equal(key, k) {
 		if (flags & bucketLeafFlag) != 0 {
 			return nil, ErrBucketExists
-		} else {
-			return nil, ErrIncompatibleValue
 		}
+		return nil, ErrIncompatibleValue
 	}

 	// Create empty, inline bucket.
@@ -316,7 +316,12 @@ func (b *Bucket) Delete(key []byte) error {

 	// Move cursor to correct position.
 	c := b.Cursor()
-	_, _, flags := c.seek(key)
+	k, _, flags := c.seek(key)
+
+	// Return nil if the key doesn't exist.
+	if !bytes.Equal(key, k) {
+		return nil
+	}

 	// Return an error if there is already existing bucket value.
 	if (flags & bucketLeafFlag) != 0 {
@@ -329,6 +334,28 @@ func (b *Bucket) Delete(key []byte) error {
 	return nil
 }

+// Sequence returns the current integer for the bucket without incrementing it.
+func (b *Bucket) Sequence() uint64 { return b.bucket.sequence }
+
+// SetSequence updates the sequence number for the bucket.
+func (b *Bucket) SetSequence(v uint64) error {
+	if b.tx.db == nil {
+		return ErrTxClosed
+	} else if !b.Writable() {
+		return ErrTxNotWritable
+	}
+
+	// Materialize the root node if it hasn't been already so that the
+	// bucket will be saved during commit.
+	if b.rootNode == nil {
+		_ = b.node(b.root, nil)
+	}
+
+	// Set the sequence to the given value.
+	b.bucket.sequence = v
+	return nil
+}
+
 // NextSequence returns an autoincrementing integer for the bucket.
 func (b *Bucket) NextSequence() (uint64, error) {
 	if b.tx.db == nil {
--- a/cmd/vendor/github.com/coreos/bbolt/cursor.go
+++ b/cmd/vendor/github.com/coreos/bbolt/cursor.go
--- a/cmd/vendor/github.com/coreos/bbolt/db.go
+++ b/cmd/vendor/github.com/coreos/bbolt/db.go
@@ -7,8 +7,7 @@ import (
 	"log"
 	"os"
 	"runtime"
-	"runtime/debug"
-	"strings"
+	"sort"
 	"sync"
 	"time"
 	"unsafe"
@@ -23,6 +22,8 @@ const version = 2
 // Represents a marker value to indicate that a file is a Bolt DB.
 const magic uint32 = 0xED0CDAED

+const pgidNoFreelist pgid = 0xffffffffffffffff
+
 // IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
 // syncing changes to a file.  This is required as some operating systems,
 // such as OpenBSD, do not have a unified buffer cache (UBC) and writes
@@ -39,6 +40,9 @@ const (
 // default page size for db is set to the OS page size.
 var defaultPageSize = os.Getpagesize()

+// The time elapsed between consecutive file locking attempts.
+const flockRetryTimeout = 50 * time.Millisecond
+
 // DB represents a collection of buckets persisted to a file on disk.
 // All data access is performed through transactions which can be obtained through the DB.
 // All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called.
@@ -61,6 +65,11 @@ type DB struct {
 	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
 	NoSync bool

+	// When true, skips syncing freelist to disk. This improves the database
+	// write performance under normal operation, but requires a full database
+	// re-sync during recovery.
+	NoFreelistSync bool
+
 	// When true, skips the truncate call when growing the database.
 	// Setting this to true is only safe on non-ext3/ext4 systems.
 	// Skipping truncation avoids preallocation of hard drive space and
@@ -107,9 +116,11 @@ type DB struct {
 	opened   bool
 	rwtx     *Tx
 	txs      []*Tx
-	freelist *freelist
 	stats    Stats

+	freelist     *freelist
+	freelistLoad sync.Once
+
 	pagePool sync.Pool

 	batchMu sync.Mutex
@@ -148,14 +159,17 @@ func (db *DB) String() string {
 // If the file does not exist then it will be created automatically.
 // Passing in nil options will cause Bolt to open the database with the default options.
 func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
-	var db = &DB{opened: true}
-
+	db := &DB{
+		opened: true,
+	}
 	// Set default options if no options are provided.
 	if options == nil {
 		options = DefaultOptions
 	}
+	db.NoSync = options.NoSync
 	db.NoGrowSync = options.NoGrowSync
 	db.MmapFlags = options.MmapFlags
+	db.NoFreelistSync = options.NoFreelistSync

 	// Set default values for later DB operations.
 	db.MaxBatchSize = DefaultMaxBatchSize
@@ -184,6 +198,7 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
 	// The database file is locked using the shared lock (more than one process may
 	// hold a lock at the same time) otherwise (options.ReadOnly is set).
 	if err := flock(db, mode, !db.readOnly, options.Timeout); err != nil {
+		db.lockfile = nil // make 'unused' happy. TODO: rework locks
 		_ = db.close()
 		return nil, err
 	}
@@ -191,6 +206,11 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
 	// Default values for test hooks
 	db.ops.writeAt = db.file.WriteAt

+	if db.pageSize = options.PageSize; db.pageSize == 0 {
+		// Set the default page size to the OS page size.
+		db.pageSize = defaultPageSize
+	}
+
 	// Initialize the database if it doesn't exist.
 	if info, err := db.file.Stat(); err != nil {
 		return nil, err
@@ -202,20 +222,21 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
 	} else {
 		// Read the first meta page to determine the page size.
 		var buf [0x1000]byte
-		if _, err := db.file.ReadAt(buf[:], 0); err == nil {
-			m := db.pageInBuffer(buf[:], 0).meta()
-			if err := m.validate(); err != nil {
-				// If we can't read the page size, we can assume it's the same
-				// as the OS -- since that's how the page size was chosen in the
-				// first place.
-				//
-				// If the first page is invalid and this OS uses a different
-				// page size than what the database was created with then we
-				// are out of luck and cannot access the database.
-				db.pageSize = os.Getpagesize()
-			} else {
+		// If we can't read the page size, but can read a page, assume
+		// it's the same as the OS or one given -- since that's how the
+		// page size was chosen in the first place.
+		//
+		// If the first page is invalid and this OS uses a different
+		// page size than what the database was created with then we
+		// are out of luck and cannot access the database.
+		//
+		// TODO: scan for next page
+		if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
+			if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil {
 				db.pageSize = int(m.pageSize)
 			}
+		} else {
+			return nil, ErrInvalid
 		}
 	}

@@ -232,14 +253,50 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
 		return nil, err
 	}

-	// Read in the freelist.
-	db.freelist = newFreelist()
-	db.freelist.read(db.page(db.meta().freelist))
+	if db.readOnly {
+		return db, nil
+	}
+
+	db.loadFreelist()
+
+	// Flush freelist when transitioning from no sync to sync so
+	// NoFreelistSync unaware boltdb can open the db later.
+	if !db.NoFreelistSync && !db.hasSyncedFreelist() {
+		tx, err := db.Begin(true)
+		if tx != nil {
+			err = tx.Commit()
+		}
+		if err != nil {
+			_ = db.close()
+			return nil, err
+		}
+	}

 	// Mark the database as opened and return.
 	return db, nil
 }

+// loadFreelist reads the freelist if it is synced, or reconstructs it
+// by scanning the DB if it is not synced. It assumes there are no
+// concurrent accesses being made to the freelist.
+func (db *DB) loadFreelist() {
+	db.freelistLoad.Do(func() {
+		db.freelist = newFreelist()
+		if !db.hasSyncedFreelist() {
+			// Reconstruct free list by scanning the DB.
+			db.freelist.readIDs(db.freepages())
+		} else {
+			// Read free list from freelist page.
+			db.freelist.read(db.page(db.meta().freelist))
+		}
+		db.stats.FreePageN = len(db.freelist.ids)
+	})
+}
+
+func (db *DB) hasSyncedFreelist() bool {
+	return db.meta().freelist != pgidNoFreelist
+}
+
 // mmap opens the underlying memory-mapped file and initializes the meta references.
 // minsz is the minimum size that the new mmap can be.
 func (db *DB) mmap(minsz int) error {
@@ -341,9 +398,6 @@ func (db *DB) mmapSize(size int) (int, error) {

 // init creates a new database file and initializes its meta pages.
 func (db *DB) init() error {
-	// Set the page size to the OS page size.
-	db.pageSize = os.Getpagesize()
-
 	// Create two meta pages on a buffer.
 	buf := make([]byte, db.pageSize*4)
 	for i := 0; i < 2; i++ {
@@ -526,21 +580,36 @@ func (db *DB) beginRWTx() (*Tx, error) {
 	t := &Tx{writable: true}
 	t.init(db)
 	db.rwtx = t
+	db.freePages()
+	return t, nil
+}

-	// Free any pages associated with closed read-only transactions.
-	var minid txid = 0xFFFFFFFFFFFFFFFF
-	for _, t := range db.txs {
-		if t.meta.txid < minid {
-			minid = t.meta.txid
-		}
+// freePages releases any pages associated with closed read-only transactions.
+func (db *DB) freePages() {
+	// Free all pending pages prior to earliest open transaction.
+	sort.Sort(txsById(db.txs))
+	minid := txid(0xFFFFFFFFFFFFFFFF)
+	if len(db.txs) > 0 {
+		minid = db.txs[0].meta.txid
 	}
 	if minid > 0 {
 		db.freelist.release(minid - 1)
 	}
-
-	return t, nil
+	// Release unused txid extents.
+	for _, t := range db.txs {
+		db.freelist.releaseRange(minid, t.meta.txid-1)
+		minid = t.meta.txid + 1
+	}
+	db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
+	// Any page both allocated and freed in an extent is safe to release.
 }

+type txsById []*Tx
+
+func (t txsById) Len() int           { return len(t) }
+func (t txsById) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
+func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }
+
 // removeTx removes a transaction from the database.
 func (db *DB) removeTx(tx *Tx) {
 	// Release the read lock on the mmap.
@@ -552,7 +621,10 @@ func (db *DB) removeTx(tx *Tx) {
 	// Remove the transaction.
 	for i, t := range db.txs {
 		if t == tx {
-			db.txs = append(db.txs[:i], db.txs[i+1:]...)
+			last := len(db.txs) - 1
+			db.txs[i] = db.txs[last]
+			db.txs[last] = nil
+			db.txs = db.txs[:last]
 			break
 		}
 	}
@@ -630,11 +702,7 @@ func (db *DB) View(fn func(*Tx) error) error {
 		return err
 	}

-	if err := t.Rollback(); err != nil {
-		return err
-	}
-
-	return nil
+	return t.Rollback()
 }

 // Batch calls fn as part of a batch. It behaves similar to Update,
@@ -823,7 +891,7 @@ func (db *DB) meta() *meta {
 }

 // allocate returns a contiguous block of memory starting at a given page.
-func (db *DB) allocate(count int) (*page, error) {
+func (db *DB) allocate(txid txid, count int) (*page, error) {
 	// Allocate a temporary buffer for the page.
 	var buf []byte
 	if count == 1 {
@@ -835,7 +903,7 @@ func (db *DB) allocate(count int) (*page, error) {
 	p.overflow = uint32(count - 1)

 	// Use pages from the freelist if they are available.
-	if p.id = db.freelist.allocate(count); p.id != 0 {
+	if p.id = db.freelist.allocate(txid, count); p.id != 0 {
 		return p, nil
 	}

@@ -890,6 +958,38 @@ func (db *DB) IsReadOnly() bool {
 	return db.readOnly
 }

+func (db *DB) freepages() []pgid {
+	tx, err := db.beginTx()
+	defer func() {
+		err = tx.Rollback()
+		if err != nil {
+			panic("freepages: failed to rollback tx")
+		}
+	}()
+	if err != nil {
+		panic("freepages: failed to open read only tx")
+	}
+
+	reachable := make(map[pgid]*page)
+	nofreed := make(map[pgid]bool)
+	ech := make(chan error)
+	go func() {
+		for e := range ech {
+			panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e))
+		}
+	}()
+	tx.checkBucket(&tx.root, reachable, nofreed, ech)
+	close(ech)
+
+	var fids []pgid
+	for i := pgid(2); i < db.meta().pgid; i++ {
+		if _, ok := reachable[i]; !ok {
+			fids = append(fids, i)
+		}
+	}
+	return fids
+}
+
 // Options represents the options that can be set when opening a database.
 type Options struct {
 	// Timeout is the amount of time to wait to obtain a file lock.
@@ -900,6 +1000,10 @@ type Options struct {
 	// Sets the DB.NoGrowSync flag before memory mapping the file.
 	NoGrowSync bool

+	// Do not sync freelist to disk. This improves the database write performance
+	// under normal operation, but requires a full database re-sync during recovery.
+	NoFreelistSync bool
+
 	// Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to
 	// grab a shared lock (UNIX).
 	ReadOnly bool
@@ -916,6 +1020,14 @@ type Options struct {
 	// If initialMmapSize is smaller than the previous database size,
 	// it takes no effect.
 	InitialMmapSize int
+
+	// PageSize overrides the default OS page size.
+	PageSize int
+
+	// NoSync sets the initial value of DB.NoSync. Normally this can just be
+	// set directly on the DB itself when returned from Open(), but this option
+	// is useful in APIs which expose Options but not the underlying DB.
+	NoSync bool
 }

 // DefaultOptions represent the options used if nil options are passed into Open().
@@ -952,15 +1064,11 @@ func (s *Stats) Sub(other *Stats) Stats {
 	diff.PendingPageN = s.PendingPageN
 	diff.FreeAlloc = s.FreeAlloc
 	diff.FreelistInuse = s.FreelistInuse
-	diff.TxN = other.TxN - s.TxN
+	diff.TxN = s.TxN - other.TxN
 	diff.TxStats = s.TxStats.Sub(&other.TxStats)
 	return diff
 }

-func (s *Stats) add(other *Stats) {
-	s.TxStats.add(&other.TxStats)
-}
-
 type Info struct {
 	Data     uintptr
 	PageSize int
@@ -999,7 +1107,8 @@ func (m *meta) copy(dest *meta) {
 func (m *meta) write(p *page) {
 	if m.root.root >= m.pgid {
 		panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
-	} else if m.freelist >= m.pgid {
+	} else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist {
+		// TODO: reject pgidNoFreelist if !NoFreelistSync
 		panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
 	}

@@ -1026,11 +1135,3 @@ func _assert(condition bool, msg string, v ...interface{}) {
 		panic(fmt.Sprintf("assertion failed: "+msg, v...))
 	}
 }
-
-func warn(v ...interface{})              { fmt.Fprintln(os.Stderr, v...) }
-func warnf(msg string, v ...interface{}) { fmt.Fprintf(os.Stderr, msg+"\n", v...) }
-
-func printstack() {
-	stack := strings.Join(strings.Split(string(debug.Stack()), "\n")[2:], "\n")
-	fmt.Fprintln(os.Stderr, stack)
-}
--- a/cmd/vendor/github.com/coreos/bbolt/doc.go
+++ b/cmd/vendor/github.com/coreos/bbolt/doc.go
--- a/cmd/vendor/github.com/coreos/bbolt/errors.go
+++ b/cmd/vendor/github.com/coreos/bbolt/errors.go
--- a/cmd/vendor/github.com/coreos/bbolt/freelist.go
+++ b/cmd/vendor/github.com/coreos/bbolt/freelist.go
@@ -6,25 +6,40 @@ import (
 	"unsafe"
 )

+// txPending holds a list of pgids and corresponding allocation txns
+// that are pending to be freed.
+type txPending struct {
+	ids              []pgid
+	alloctx          []txid // txids allocating the ids
+	lastReleaseBegin txid   // beginning txid of last matching releaseRange
+}
+
 // freelist represents a list of all pages that are available for allocation.
 // It also tracks pages that have been freed but are still in use by open transactions.
 type freelist struct {
-	ids     []pgid          // all free and available free page ids.
-	pending map[txid][]pgid // mapping of soon-to-be free page ids by tx.
-	cache   map[pgid]bool   // fast lookup of all free and pending page ids.
+	ids     []pgid              // all free and available free page ids.
+	allocs  map[pgid]txid       // mapping of txid that allocated a pgid.
+	pending map[txid]*txPending // mapping of soon-to-be free page ids by tx.
+	cache   map[pgid]bool       // fast lookup of all free and pending page ids.
 }

 // newFreelist returns an empty, initialized freelist.
 func newFreelist() *freelist {
 	return &freelist{
-		pending: make(map[txid][]pgid),
+		allocs:  make(map[pgid]txid),
+		pending: make(map[txid]*txPending),
 		cache:   make(map[pgid]bool),
 	}
 }

 // size returns the size of the page after serialization.
 func (f *freelist) size() int {
-	return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * f.count())
+	n := f.count()
+	if n >= 0xFFFF {
+		// The first element will be used to store the count. See freelist.write.
+		n++
+	}
+	return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * n)
 }

 // count returns count of pages on the freelist
@@ -40,27 +55,26 @@ func (f *freelist) free_count() int {
 // pending_count returns count of pending pages
 func (f *freelist) pending_count() int {
 	var count int
-	for _, list := range f.pending {
-		count += len(list)
+	for _, txp := range f.pending {
+		count += len(txp.ids)
 	}
 	return count
 }

-// all returns a list of all free ids and all pending ids in one sorted list.
-func (f *freelist) all() []pgid {
-	m := make(pgids, 0)
-
-	for _, list := range f.pending {
-		m = append(m, list...)
+// copyall copies into dst a list of all free ids and all pending ids in one sorted list.
+// f.count returns the minimum length required for dst.
+func (f *freelist) copyall(dst []pgid) {
+	m := make(pgids, 0, f.pending_count())
+	for _, txp := range f.pending {
+		m = append(m, txp.ids...)
 	}
-
 	sort.Sort(m)
-	return pgids(f.ids).merge(m)
+	mergepgids(dst, f.ids, m)
 }

 // allocate returns the starting page id of a contiguous list of pages of a given size.
 // If a contiguous block cannot be found then 0 is returned.
-func (f *freelist) allocate(n int) pgid {
+func (f *freelist) allocate(txid txid, n int) pgid {
 	if len(f.ids) == 0 {
 		return 0
 	}
@@ -93,7 +107,7 @@ func (f *freelist) allocate(n int) pgid {
 			for i := pgid(0); i < pgid(n); i++ {
 				delete(f.cache, initial+i)
 			}
-
+			f.allocs[initial] = txid
 			return initial
 		}

@@ -110,28 +124,73 @@ func (f *freelist) free(txid txid, p *page) {
 	}

 	// Free page and all its overflow pages.
-	var ids = f.pending[txid]
+	txp := f.pending[txid]
+	if txp == nil {
+		txp = &txPending{}
+		f.pending[txid] = txp
+	}
+	allocTxid, ok := f.allocs[p.id]
+	if ok {
+		delete(f.allocs, p.id)
+	} else if (p.flags & freelistPageFlag) != 0 {
+		// Freelist is always allocated by prior tx.
+		allocTxid = txid - 1
+	}
+
 	for id := p.id; id <= p.id+pgid(p.overflow); id++ {
 		// Verify that page is not already free.
 		if f.cache[id] {
 			panic(fmt.Sprintf("page %d already freed", id))
 		}
-
 		// Add to the freelist and cache.
-		ids = append(ids, id)
+		txp.ids = append(txp.ids, id)
+		txp.alloctx = append(txp.alloctx, allocTxid)
 		f.cache[id] = true
 	}
-	f.pending[txid] = ids
 }

 // release moves all page ids for a transaction id (or older) to the freelist.
 func (f *freelist) release(txid txid) {
 	m := make(pgids, 0)
-	for tid, ids := range f.pending {
+	for tid, txp := range f.pending {
 		if tid <= txid {
 			// Move transaction's pending pages to the available freelist.
 			// Don't remove from the cache since the page is still free.
-			m = append(m, ids...)
+			m = append(m, txp.ids...)
+			delete(f.pending, tid)
+		}
+	}
+	sort.Sort(m)
+	f.ids = pgids(f.ids).merge(m)
+}
+
+// releaseRange moves pending pages allocated within an extent [begin,end] to the free list.
+func (f *freelist) releaseRange(begin, end txid) {
+	if begin > end {
+		return
+	}
+	var m pgids
+	for tid, txp := range f.pending {
+		if tid < begin || tid > end {
+			continue
+		}
+		// Don't recompute freed pages if ranges haven't updated.
+		if txp.lastReleaseBegin == begin {
+			continue
+		}
+		for i := 0; i < len(txp.ids); i++ {
+			if atx := txp.alloctx[i]; atx < begin || atx > end {
+				continue
+			}
+			m = append(m, txp.ids[i])
+			txp.ids[i] = txp.ids[len(txp.ids)-1]
+			txp.ids = txp.ids[:len(txp.ids)-1]
+			txp.alloctx[i] = txp.alloctx[len(txp.alloctx)-1]
+			txp.alloctx = txp.alloctx[:len(txp.alloctx)-1]
+			i--
+		}
+		txp.lastReleaseBegin = begin
+		if len(txp.ids) == 0 {
 			delete(f.pending, tid)
 		}
 	}
@@ -142,12 +201,29 @@ func (f *freelist) release(txid txid) {
 // rollback removes the pages from a given pending tx.
 func (f *freelist) rollback(txid txid) {
 	// Remove page ids from cache.
-	for _, id := range f.pending[txid] {
-		delete(f.cache, id)
+	txp := f.pending[txid]
+	if txp == nil {
+		return
 	}
-
-	// Remove pages from pending list.
+	var m pgids
+	for i, pgid := range txp.ids {
+		delete(f.cache, pgid)
+		tx := txp.alloctx[i]
+		if tx == 0 {
+			continue
+		}
+		if tx != txid {
+			// Pending free aborted; restore page back to alloc list.
+			f.allocs[pgid] = tx
+		} else {
+			// Freed page was allocated by this txn; OK to throw away.
+			m = append(m, pgid)
+		}
+	}
+	// Remove pages from pending list and mark as free if allocated by txid.
 	delete(f.pending, txid)
+	sort.Sort(m)
+	f.ids = pgids(f.ids).merge(m)
 }

 // freed returns whether a given page is in the free list.
@@ -157,6 +233,9 @@ func (f *freelist) freed(pgid pgid) bool {

 // read initializes the freelist from a freelist page.
 func (f *freelist) read(p *page) {
+	if (p.flags & freelistPageFlag) == 0 {
+		panic(fmt.Sprintf("invalid freelist page: %d, page type is %s", p.id, p.typ()))
+	}
 	// If the page.count is at the max uint16 value (64k) then it's considered
 	// an overflow and the size of the freelist is stored as the first element.
 	idx, count := 0, int(p.count)
@@ -169,7 +248,7 @@ func (f *freelist) read(p *page) {
 	if count == 0 {
 		f.ids = nil
 	} else {
-		ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx:count]
+		ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx : idx+count]
 		f.ids = make([]pgid, len(ids))
 		copy(f.ids, ids)

@@ -181,27 +260,33 @@ func (f *freelist) read(p *page) {
 	f.reindex()
 }

+// readIDs initializes the freelist from a given list of ids.
+func (f *freelist) readIDs(ids []pgid) {
+	f.ids = ids
+	f.reindex()
+}
+
 // write writes the page ids onto a freelist page. All free and pending ids are
 // saved to disk since in the event of a program crash, all pending ids will
 // become free.
 func (f *freelist) write(p *page) error {
 	// Combine the old free pgids and pgids waiting on an open transaction.
-	ids := f.all()

 	// Update the header flag.
 	p.flags |= freelistPageFlag

 	// The page.count can only hold up to 64k elements so if we overflow that
 	// number then we handle it by putting the size in the first element.
-	if len(ids) == 0 {
-		p.count = uint16(len(ids))
-	} else if len(ids) < 0xFFFF {
-		p.count = uint16(len(ids))
-		copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:], ids)
+	lenids := f.count()
+	if lenids == 0 {
+		p.count = uint16(lenids)
+	} else if lenids < 0xFFFF {
+		p.count = uint16(lenids)
+		f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:])
 	} else {
 		p.count = 0xFFFF
-		((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(len(ids))
-		copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:], ids)
+		((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(lenids)
+		f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:])
 	}

 	return nil
@@ -213,8 +298,8 @@ func (f *freelist) reload(p *page) {

 	// Build a cache of only pending pages.
 	pcache := make(map[pgid]bool)
-	for _, pendingIDs := range f.pending {
-		for _, pendingID := range pendingIDs {
+	for _, txp := range f.pending {
+		for _, pendingID := range txp.ids {
 			pcache[pendingID] = true
 		}
 	}
@@ -236,12 +321,12 @@ func (f *freelist) reload(p *page) {

 // reindex rebuilds the free cache based on available and pending free lists.
 func (f *freelist) reindex() {
-	f.cache = make(map[pgid]bool)
+	f.cache = make(map[pgid]bool, len(f.ids))
 	for _, id := range f.ids {
 		f.cache[id] = true
 	}
-	for _, pendingIDs := range f.pending {
-		for _, pendingID := range pendingIDs {
+	for _, txp := range f.pending {
+		for _, pendingID := range txp.ids {
 			f.cache[pendingID] = true
 		}
 	}
--- a/cmd/vendor/github.com/coreos/bbolt/node.go
+++ b/cmd/vendor/github.com/coreos/bbolt/node.go
@@ -365,7 +365,7 @@ func (n *node) spill() error {
 		}

 		// Allocate contiguous space for the node.
-		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
+		p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
 		if err != nil {
 			return err
 		}
--- a/cmd/vendor/github.com/coreos/bbolt/page.go
+++ b/cmd/vendor/github.com/coreos/bbolt/page.go
@@ -145,12 +145,33 @@ func (a pgids) merge(b pgids) pgids {
 	// Return the opposite slice if one is nil.
 	if len(a) == 0 {
 		return b
-	} else if len(b) == 0 {
+	}
+	if len(b) == 0 {
 		return a
 	}
+	merged := make(pgids, len(a)+len(b))
+	mergepgids(merged, a, b)
+	return merged
+}

-	// Create a list to hold all elements from both lists.
-	merged := make(pgids, 0, len(a)+len(b))
+// mergepgids copies the sorted union of a and b into dst.
+// If dst is too small, it panics.
+func mergepgids(dst, a, b pgids) {
+	if len(dst) < len(a)+len(b) {
+		panic(fmt.Errorf("mergepgids bad len %d < %d + %d", len(dst), len(a), len(b)))
+	}
+	// Copy in the opposite slice if one is nil.
+	if len(a) == 0 {
+		copy(dst, b)
+		return
+	}
+	if len(b) == 0 {
+		copy(dst, a)
+		return
+	}
+
+	// Merged will hold all elements from both lists.
+	merged := dst[:0]

 	// Assign lead to the slice with a lower starting value, follow to the higher value.
 	lead, follow := a, b
@@ -172,7 +193,5 @@ func (a pgids) merge(b pgids) pgids {
 	}

 	// Append what's left in follow.
-	merged = append(merged, follow...)
-
-	return merged
+	_ = append(merged, follow...)
 }
--- a/cmd/vendor/github.com/coreos/bbolt/tx.go
+++ b/cmd/vendor/github.com/coreos/bbolt/tx.go
@@ -126,10 +126,7 @@ func (tx *Tx) DeleteBucket(name []byte) error {
 // the error is returned to the caller.
 func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error {
 	return tx.root.ForEach(func(k, v []byte) error {
-		if err := fn(k, tx.root.Bucket(k)); err != nil {
-			return err
-		}
-		return nil
+		return fn(k, tx.root.Bucket(k))
 	})
 }

@@ -169,28 +166,18 @@ func (tx *Tx) Commit() error {
 	// Free the old root bucket.
 	tx.meta.root.root = tx.root.root

-	opgid := tx.meta.pgid
-
-	// Free the freelist and allocate new pages for it. This will overestimate
-	// the size of the freelist but not underestimate the size (which would be bad).
-	tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist))
-	p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1)
-	if err != nil {
-		tx.rollback()
-		return err
+	// Free the old freelist because commit writes out a fresh freelist.
+	if tx.meta.freelist != pgidNoFreelist {
+		tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist))
 	}
-	if err := tx.db.freelist.write(p); err != nil {
-		tx.rollback()
-		return err
-	}
-	tx.meta.freelist = p.id

-	// If the high water mark has moved up then attempt to grow the database.
-	if tx.meta.pgid > opgid {
-		if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil {
-			tx.rollback()
+	if !tx.db.NoFreelistSync {
+		err := tx.commitFreelist()
+		if err != nil {
 			return err
 		}
+	} else {
+		tx.meta.freelist = pgidNoFreelist
 	}

 	// Write dirty pages to disk.
@@ -235,6 +222,31 @@ func (tx *Tx) Commit() error {
 	return nil
 }

+func (tx *Tx) commitFreelist() error {
+	// Allocate new pages for the new free list. This will overestimate
+	// the size of the freelist but not underestimate the size (which would be bad).
+	opgid := tx.meta.pgid
+	p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1)
+	if err != nil {
+		tx.rollback()
+		return err
+	}
+	if err := tx.db.freelist.write(p); err != nil {
+		tx.rollback()
+		return err
+	}
+	tx.meta.freelist = p.id
+	// If the high water mark has moved up then attempt to grow the database.
+	if tx.meta.pgid > opgid {
+		if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil {
+			tx.rollback()
+			return err
+		}
+	}
+
+	return nil
+}
+
 // Rollback closes the transaction and ignores all previous updates. Read-only
 // transactions must be rolled back and not committed.
 func (tx *Tx) Rollback() error {
@@ -305,7 +317,11 @@ func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
 	if err != nil {
 		return 0, err
 	}
-	defer func() { _ = f.Close() }()
+	defer func() {
+		if cerr := f.Close(); err == nil {
+			err = cerr
+		}
+	}()

 	// Generate a meta page. We use the same page data for both meta pages.
 	buf := make([]byte, tx.db.pageSize)
@@ -333,7 +349,7 @@ func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
 	}

 	// Move past the meta pages in the file.
-	if _, err := f.Seek(int64(tx.db.pageSize*2), os.SEEK_SET); err != nil {
+	if _, err := f.Seek(int64(tx.db.pageSize*2), io.SeekStart); err != nil {
 		return n, fmt.Errorf("seek: %s", err)
 	}

@@ -344,7 +360,7 @@ func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
 		return n, err
 	}

-	return n, f.Close()
+	return n, nil
 }

 // CopyFile copies the entire database to file at the given path.
@@ -379,9 +395,14 @@ func (tx *Tx) Check() <-chan error {
 }

 func (tx *Tx) check(ch chan error) {
+	// Force loading free list if opened in ReadOnly mode.
+	tx.db.loadFreelist()
+
 	// Check if any pages are double freed.
 	freed := make(map[pgid]bool)
-	for _, id := range tx.db.freelist.all() {
+	all := make([]pgid, tx.db.freelist.count())
+	tx.db.freelist.copyall(all)
+	for _, id := range all {
 		if freed[id] {
 			ch <- fmt.Errorf("page %d: already freed", id)
 		}
@@ -392,8 +413,10 @@ func (tx *Tx) check(ch chan error) {
 	reachable := make(map[pgid]*page)
 	reachable[0] = tx.page(0) // meta0
 	reachable[1] = tx.page(1) // meta1
-	for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ {
-		reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist)
+	if tx.meta.freelist != pgidNoFreelist {
+		for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ {
+			reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist)
+		}
 	}

 	// Recursively check buckets.
@@ -451,7 +474,7 @@ func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bo

 // allocate returns a contiguous block of memory starting at a given page.
 func (tx *Tx) allocate(count int) (*page, error) {
-	p, err := tx.db.allocate(count)
+	p, err := tx.db.allocate(tx.meta.txid, count)
 	if err != nil {
 		return nil, err
 	}
--- a/Show More
+++ b/Show More