Compare commits

..

28 Commits

Author SHA1 Message Date
Yicheng Qin
ff8d1ecb9f *: bump to v2.1.2 2015-08-21 16:19:55 -07:00
Yicheng Qin
ccb67a691b pkg/netutil: stop resolving in place
It helps to copy out a and b, and not modify the original a and b.
2015-08-21 15:39:57 -07:00
Yicheng Qin
059233768e pkg/netutil: not introduce empty url when converting
It should not make slices with length and append elements at the same
time.
2015-08-21 15:39:48 -07:00
Yicheng Qin
c530acf6a4 pkg/netutil: not export resolve and urlsEqual functions
They are only used in this package, so there is no need to public them.
2015-08-21 15:39:38 -07:00
Yicheng Qin
bad1b20620 pkg/netutil: fix false negative comparison
Sort the resolved URLs before DeepEqual, so it will not compare URLs
that may be out of order due to resolution.
2015-08-21 15:39:29 -07:00
Yicheng Qin
89640cf08f etcdserver: remove TODO to delete URLStringsEqual
Discovery SRV supports to compare IP addresses with domain names,
so we need URLStringsEqual function.
2015-08-21 15:39:19 -07:00
Yicheng Qin
bbefb0ad0b Revert "Revert "Treat URLs have same IP address as same""
This reverts commit 3153e635d5.

Conflicts:
	etcdserver/config.go
2015-08-21 15:39:10 -07:00
Xiang Li
8e0706583c etcdmain: print out version information on startup
Conflicts:
	etcdmain/etcd.go
2015-08-21 15:38:34 -07:00
Yicheng Qin
cd2a2182cf etcdctl/cluster_health: set health var when checked healthy
This was a typo.
2015-08-21 15:37:26 -07:00
Xiang Li
8d410bdfcb etcdctl: use health endpoint to greatly simplify health checking 2015-08-21 15:22:19 -07:00
Xiang Li
0a2d2b8b9d etcdctl: cluster-health supports forever flag
cluster-health command supports checking the cluster health
forever.
2015-08-21 15:22:13 -07:00
Yicheng Qin
6c9e876d7a etcdctl: refactor the way to check cluster health
This method uses raft status exposed at /debug/varz to determine the
health of the cluster. It uses whether commit index increases to
determine the cluster health, and uses whether match index increases to
determine the member health.

This could fix the bug #2711 that fails to detect follower is unhealthy
because it doesn't rely on whether message in long-polling connection is sent.

This health check is stricter than the old one, and reflects the
situation that whether followers are healthy in the view of the leader. One
example is that if the follower is receiving the snapshot, it will turns
out to be unhealthy because it doesn't move forward.

`etcdctl cluster-health` will reflect the healthy view in the raft level,
while connectivity checks reflects the healthy view in transport level.
2015-08-21 15:22:05 -07:00
Xiang Li
a845f82d4f etcdctl: health use etcd/client
Conflicts:
	etcdctl/command/cluster_health.go
2015-08-21 15:21:44 -07:00
Xiang Li
c1c23626cb raft: downgrade the logging around snapshot to debugf
Snapshot related logging is spamming when leader trying to
sync a failed peer.

Conflicts:
	raft/raft.go
2015-08-21 15:11:47 -07:00
Xiang Li
ac67aa9f63 etcdhttp:write etcderror for all errors in keyhandler 2015-08-21 15:10:01 -07:00
Xiang Li
52c5203370 *: key handler should write auth error as etcd error 2015-08-21 15:09:53 -07:00
Yicheng Qin
27bfb3fcb2 etcdserver: improve error message when timeout due to leader fail 2015-08-21 15:09:46 -07:00
Yicheng Qin
084936a920 etcdserver: specify timeout caused by leader election
Before this PR, the timeout caused by leader election returns:

```
14:45:37 etcd2 | 2015-08-12 14:45:37.786349 E | etcdhttp: got unexpected
response error (etcdserver: request timed out)
```

After this PR:

```
15:52:54 etcd1 | 2015-08-12 15:52:54.389523 E | etcdhttp: etcdserver:
request timed out, possibly due to leader down
```

Conflicts:
	etcdserver/raft.go
2015-08-21 15:09:32 -07:00
Brandon Philips
d2ecd9cecf test: race detector doesn't work on armv7l
Test fails without this fix on armv7l:

    go test: -race is only supported on linux/amd64, freebsd/amd64, darwin/amd64 and windows/amd64
2015-08-21 14:59:07 -07:00
Brandon Philips
07b82832f0 etcdserver: move atomics to make etcd work on arm64
Follow the simple rule in the atomic package:

"On both ARM and x86-32, it is the caller's responsibility to arrange
for 64-bit alignment of 64-bit words accessed atomically. The first word
in a global variable or in an allocated struct or slice can be relied
upon to be 64-bit aligned."

Tested on a system with /proc/cpuinfo reporting:

processor       : 0
model name      : ARMv7 Processor rev 1 (v7l)
Features        : swp half thumb fastmult vfp edsp thumbee neon vfpv3
tls vfpv4 idiva idivt vfpd32 lpae evtstrm
CPU implementer : 0x41
CPU architecture: 7
CPU variant     : 0x0
CPU part        : 0xc0d
CPU revision    : 1
2015-08-21 14:58:59 -07:00
Yicheng Qin
61f4e74652 etcdmain: reject unreasonably high values of -election-timeout
This helps users to detect setting problem early.
2015-08-21 14:58:49 -07:00
Yicheng Qin
331ecdf8c8 client: return correct error for 50x response
etcd always returns 500/503 response when it may have no leader.
So we should log the other 50x response in a normal way.

This helps to log correctly when discovery meets 504 error. Before this
PR, it logs like this:

```
18:31:58 etcd2 | 2015/08/4 18:31:58 discovery: error #0: client: etcd
member https://discovery.etcd.io has no leader
18:31:58 etcd2 | 2015/08/4 18:31:58 discovery: waiting for other nodes:
error connecting to https://discovery.etcd.io, retrying in 4s
```

After this PR:

```
22:20:25 etcd2 | 2015/08/4 22:20:25 discovery: error #0: client: etcd
member https://discovery.etcd.io returns server error [Gateway Timeout]
22:20:25 etcd2 | 2015/08/4 22:20:25 discovery: waiting for other nodes:
error connecting to https://discovery.etcd.io, retrying in 4s
```

Conflicts:
	client/client.go
2015-08-21 14:55:35 -07:00
Xiang Li
3a346eac25 discovery: print out detailed cluster error
Conflicts:
	discovery/discovery.go
2015-08-21 14:54:36 -07:00
Xiang Li
97605046c1 client: return cluster error if the etcd cluster is not avaliable
Add a new ClusterError type. It contians all encountered errors and
return ClusterNotAvailable as the error string.

Conflicts:
	client/client.go
	discovery/discovery.go
2015-08-21 14:51:41 -07:00
Guohua Ouyang
41ecf7f722 etcdmain: Don't print flags when flag parse error
At present it prints the whole usage and flags, which cause the exact
error message is hidden two screens above.

Fixes #3141

Signed-off-by: Guohua Ouyang <gouyang@redhat.com>
2015-08-21 14:32:49 -07:00
Xiang Li
fcd564efb8 etcdmian: fix initialization confilct
Fix #3142

Ignore flags if etcd is already initialized.
2015-08-21 14:32:41 -07:00
Yicheng Qin
0876c5e1ef etcdmain: warn when listening on HTTP if TLS is set
If the user sets TLS info, this implies that he wants to listen on TLS.
If etcd finds that urls to listen is still HTTP schema, it prints out
warning to notify user about possible wrong setting.
2015-08-21 14:32:31 -07:00
Yicheng Qin
ef80bb5cbf pkg/transport: fix HTTPS downgrade bug for keepalive listener
If TLS config is empty, etcd downgrades keepalive listener from HTTPS to
HTTP without warning. This results in HTTPS downgrade bug for client urls.
The commit returns error if it cannot listen on TLS.
2015-08-21 14:32:18 -07:00
1118 changed files with 37277 additions and 195202 deletions

View File

@@ -2,7 +2,6 @@ language: go
sudo: false
go:
- 1.4
- 1.5
install:
- go get github.com/barakmich/go-nyet

View File

@@ -12,14 +12,6 @@ etcd is Apache 2.0 licensed and accepts contributions via GitHub pull requests.
- Fork the repository on GitHub
- Read the README.md for build instructions
## Reporting Bugs and Creating Issues
Reporting bugs is one of the best ways to contribute. However, a good bug report
has some very specific qualities, so please read over our short document on
[reporting bugs](https://github.com/coreos/etcd/blob/master/Documentation/reporting_bugs.md)
before you submit your bug report. This document might contain links known
issues, another good reason to take a look there, before reporting your bug.
## Contribution flow
This is a rough outline of what a contributor's workflow looks like:

View File

@@ -1,31 +0,0 @@
## Snapshot Migration
You can migrate a snapshot of your data from a v0.4.9+ cluster into a new etcd 2.2 cluster using a snapshot migration. After snapshot migration, the etcd indexes of your data will change. Many etcd applications rely on these indexes to behave correctly. This operation should only be done while all etcd applications are stopped.
To get started get the newest data snapshot from the 0.4.9+ cluster:
```
curl http://cluster.example.com:4001/v2/migration/snapshot > backup.snap
```
Now, import the snapshot into your new cluster:
```
etcdctl --endpoint new_cluster.example.com import --snap backup.snap
```
If you have a large amount of data, you can specify more concurrent works to copy data in parallel by using `-c` flag.
If you have hidden keys to copy, you can use `--hidden` flag to specify.
And the data will quickly copy into the new cluster:
```
entering dir: /
entering dir: /foo
entering dir: /foo/bar
copying key: /foo/bar/1 1
entering dir: /
entering dir: /foo2
entering dir: /foo2/bar2
copying key: /foo2/bar2/2 2
```

View File

@@ -8,17 +8,14 @@ When first started, etcd stores its configuration into a data directory specifie
Configuration is stored in the write ahead log and includes: the local member ID, cluster ID, and initial cluster configuration.
The write ahead log and snapshot files are used during member operation and to recover after a restart.
Having a dedicated disk to store wal files can improve the throughput and stabilize the cluster.
It is highly recommended to dedicate a wal disk and set `--wal-dir` to point to a directory on that device for a production cluster deployment.
If a members data directory is ever lost or corrupted then the user should [remove][remove-a-member] the etcd member from the cluster using `etcdctl` tool.
If a members data directory is ever lost or corrupted then the user should remove the etcd member from the cluster via the [members API][members-api].
A user should avoid restarting an etcd member with a data directory from an out-of-date backup.
Using an out-of-date data directory can lead to inconsistency as the member had agreed to store information via raft then re-joins saying it needs that information again.
For maximum safety, if an etcd member suffers any sort of data corruption or loss, it must be removed from the cluster.
Once removed the member can be re-added with an empty data directory.
[remove-a-member]: runtime-configuration.md#remove-a-member
[members-api]: other_apis.md#members-api
#### Contents
@@ -27,8 +24,6 @@ The data directory has two sub-directories in it:
1. wal: write ahead log files are stored here. For details see the [wal package documentation][wal-pkg]
2. snap: log snapshots are stored here. For details see the [snap package documentation][snap-pkg]
If `--wal-dir` flag is set, etcd will write the write ahead log files to the specified directory instead of data directory.
[wal-pkg]: http://godoc.org/github.com/coreos/etcd/wal
[snap-pkg]: http://godoc.org/github.com/coreos/etcd/snap
@@ -39,74 +34,6 @@ If `--wal-dir` flag is set, etcd will write the write ahead log files to the spe
If you are spinning up multiple clusters for testing it is recommended that you specify a unique initial-cluster-token for the different clusters.
This can protect you from cluster corruption in case of mis-configuration because two members started with different cluster tokens will refuse members from each other.
#### Monitoring
It is important to monitor your production etcd cluster for healthy information and runtime metrics.
##### Health Monitoring
At lowest level, etcd exposes health information via HTTP at `/health` in JSON format. If it returns `{"health": "true"}`, then the cluster is healthy. Please note the `/health` endpoint is still an experimental one as in etcd 2.2.
```
$ curl -L http://127.0.0.1:2379/health
{"health": "true"}
```
You can also use etcdctl to check the cluster-wide health information. It will contact all the members of the cluster and collect the health information for you.
```
$./etcdctl cluster-health
member 8211f1d0f64f3269 is healthy: got healthy result from http://127.0.0.1:12379
member 91bc3c398fb3c146 is healthy: got healthy result from http://127.0.0.1:22379
member fd422379fda50e48 is healthy: got healthy result from http://127.0.0.1:32379
cluster is healthy
```
##### Runtime Metrics
etcd uses [Prometheus](http://prometheus.io/) for metrics reporting in the server. You can read more through the runtime metrics [doc](metrics.md).
#### Debugging
Debugging a distributed system can be difficult. etcd provides several ways to make debug
easier.
##### Enabling Debug Logging
When you want to debug etcd without stopping it, you can enable debug logging at runtime.
etcd exposes logging configuration at `/config/local/log`.
```
$ curl http://127.0.0.1:2379/config/local/log -XPUT -d '{"Level":"DEBUG"}'
$ # debug logging enabled
$
$ curl http://127.0.0.1:2379/config/local/log -XPUT -d '{"Level":"INFO"}'
$ # debug logging disabled
```
##### Debugging Variables
Debug variables are exposed for real-time debugging purposes. Developers who are familiar with etcd can utilize these variables to debug unexpected behavior. etcd exposes debug variables via HTTP at `/debug/vars` in JSON format. The debug variables contains
`cmdline`, `file_descriptor_limit`, `memstats` and `raft.status`.
`cmdline` is the command line arguments passed into etcd.
`file_descriptor_limit` is the max number of file descriptors etcd can utilize.
`memstats` is well explained [here](http://golang.org/pkg/runtime/#MemStats).
`raft.status` is useful when you want to debug low level raft issues if you are familiar with raft internals. In most cases, you do not need to check `raft.status`.
```json
{
"cmdline": ["./etcd"],
"file_descriptor_limit": 0,
"memstats": {"Alloc":4105744,"TotalAlloc":42337320,"Sys":12560632,"...":"..."},
"raft.status": {"id":"ce2a822cea30bfca","term":5,"vote":"ce2a822cea30bfca","commit":23509,"lead":"ce2a822cea30bfca","raftState":"StateLeader","progress":{"ce2a822cea30bfca":{"match":23509,"next":23510,"state":"ProgressStateProbe"}}}
}
```
#### Optimal Cluster Size
The recommended etcd cluster size is 3, 5 or 7, which is decided by the fault tolerance requirement. A 7-member cluster can provide enough fault tolerance in most cases. While larger cluster provides better fault tolerance the write performance reduces since data needs to be replicated to more machines.
@@ -130,7 +57,7 @@ As you can see, adding another member to bring the size of cluster up to an odd
#### Changing Cluster Size
After your cluster is up and running, adding or removing members is done via [runtime reconfiguration](runtime-configuration.md#cluster-reconfiguration-operations), which allows the cluster to be modified without downtime. The `etcdctl` tool has a `member list`, `member add` and `member remove` commands to complete this process.
After your cluster is up and running, adding or removing members is done via [runtime reconfiguration](runtime-configuration.md), which allows the cluster to be modified without downtime. The `etcdctl` tool has a `member list`, `member add` and `member remove` commands to complete this process.
### Member Migration
@@ -140,7 +67,7 @@ The data directory contains all the data to recover a member to its point-in-tim
* Stop the member process
* Copy the data directory of the now-idle member to the new machine
* Update the peer URLs for that member to reflect the new machine according to the [runtime configuration] [change peer url]
* Update the peer URLs for that member to reflect the new machine according to the [member api] [change peer url]
* Start etcd on the new machine, using the same configuration and the copy of the data directory
This example will walk you through the process of migrating the infra1 member to a new machine:
@@ -151,11 +78,11 @@ This example will walk you through the process of migrating the infra1 member to
|infra1|10.0.1.11:2380|
|infra2|10.0.1.12:2380|
```sh
```
$ export ETCDCTL_PEERS=http://10.0.1.10:2379,http://10.0.1.11:2379,http://10.0.1.12:2379
```
```sh
```
$ etcdctl member list
84194f7c5edd8b37: name=infra0 peerURLs=http://10.0.1.10:2380 clientURLs=http://127.0.0.1:2379,http://10.0.1.10:2379
b4db3bf5e495e255: name=infra1 peerURLs=http://10.0.1.11:2380 clientURLs=http://127.0.0.1:2379,http://10.0.1.11:2379
@@ -164,59 +91,53 @@ bc1083c870280d44: name=infra2 peerURLs=http://10.0.1.12:2380 clientURLs=http://1
#### Stop the member etcd process
```sh
$ ssh 10.0.1.11
```
$ ssh core@10.0.1.11
```
```sh
$ kill `pgrep etcd`
```
$ sudo systemctl stop etcd
```
#### Copy the data directory of the now-idle member to the new machine
```
$ tar -cvzf infra1.etcd.tar.gz %data_dir%
$ tar -cvzf node1.etcd.tar.gz /var/lib/etcd/node1.etcd
```
```sh
$ scp infra1.etcd.tar.gz 10.0.1.13:~/
```
$ scp node1.etcd.tar.gz core@10.0.1.13:~/
```
#### Update the peer URLs for that member to reflect the new machine
```sh
```
$ curl http://10.0.1.10:2379/v2/members/b4db3bf5e495e255 -XPUT \
-H "Content-Type: application/json" -d '{"peerURLs":["http://10.0.1.13:2380"]}'
```
Or use `etcdctl member update` command
```sh
$ etcdctl member update b4db3bf5e495e255 http://10.0.1.13:2380
```
#### Start etcd on the new machine, using the same configuration and the copy of the data directory
```sh
$ ssh 10.0.1.13
```
```sh
$ tar -xzvf infra1.etcd.tar.gz -C %data_dir%
$ ssh core@10.0.1.13
```
```
etcd -name infra1 \
$ tar -xzvf node1.etcd.tar.gz -C /var/lib/etcd
```
```
etcd -name node1 \
-listen-peer-urls http://10.0.1.13:2380 \
-listen-client-urls http://10.0.1.13:2379,http://127.0.0.1:2379 \
-advertise-client-urls http://10.0.1.13:2379,http://127.0.0.1:2379
```
[change peer url]: runtime-configuration.md#update-a-member
[change peer url]: other_apis.md#change-the-peer-urls-of-a-member
### Disaster Recovery
etcd is designed to be resilient to machine failures. An etcd cluster can automatically recover from any number of temporary failures (for example, machine reboots), and a cluster of N members can tolerate up to _(N-1)/2_ permanent failures (where a member can no longer access the cluster, due to hardware failure or disk corruption). However, in extreme circumstances, a cluster might permanently lose enough members such that quorum is irrevocably lost. For example, if a three-node cluster suffered two simultaneous and unrecoverable machine failures, it would be normally impossible for the cluster to restore quorum and continue functioning.
etcd is designed to be resilient to machine failures. An etcd cluster can automatically recover from any number of temporary failures (for example, machine reboots), and a cluster of N members can tolerate up to _(N/2)-1_ permanent failures (where a member can no longer access the cluster, due to hardware failure or disk corruption). However, in extreme circumstances, a cluster might permanently lose enough members such that quorum is irrevocably lost. For example, if a three-node cluster suffered two simultaneous and unrecoverable machine failures, it would be normally impossible for the cluster to restore quorum and continue functioning.
To recover from such scenarios, etcd provides functionality to backup and restore the datastore and recreate the cluster without data loss.
@@ -228,8 +149,8 @@ The first step of the recovery is to backup the data directory on a functioning
```sh
etcdctl backup \
--data-dir %data_dir% \
--backup-dir %backup_data_dir%
--data-dir /var/lib/etcd \
--backup-dir /tmp/etcd_backup
```
This command will rewrite some of the metadata contained in the backup (specifically, the node ID and cluster ID), which means that the node will lose its former identity. In order to recreate a cluster from the backup, you will need to start a new, single-node cluster. The metadata is rewritten to prevent the new node from inadvertently being joined onto an existing cluster.
@@ -240,7 +161,7 @@ To restore a backup using the procedure created above, start etcd with the `-for
```sh
etcd \
-data-dir=%backup_data_dir% \
-data-dir=/tmp/etcd_backup \
-force-new-cluster \
...
```
@@ -251,18 +172,18 @@ Once you have verified that etcd has started successfully, shut it down and move
```sh
pkill etcd
rm -fr %data_dir%
mv %backup_data_dir% %data_dir%
rm -fr /var/lib/etcd
mv /tmp/etcd_backup /var/lib/etcd
etcd \
-data-dir=%data_dir% \
-data-dir=/var/lib/etcd \
...
```
#### Restoring the cluster
Now that if the node is running successfully, you should [change its advertised peer URLs](runtime-configuration.md#update-a-member), as the `--force-new-cluster` has set the peer URL to the default (listening on localhost).
Now that the node is running successfully, you should [change its advertised peer URLs](other_apis.md#change-the-peer-urls-of-a-member), as the `--force-new-cluster` has set the peer URL to the default (listening on localhost).
You can then add more nodes to the cluster and restore resiliency. See the [add a new member](runtime-configuration.md#add-a-new-member) guide for more details. **NB:** If you are trying to restore your cluster using old failed etcd nodes, please make sure you have stopped old etcd instances and removed their old data directories specified by the data-dir configuration parameter.
You can then add more nodes to the cluster and restore resiliency. See the [runtime configuration](runtime-configuration.md) guide for more details.
### Client Request Timeout

View File

@@ -82,7 +82,7 @@ X-Raft-Term: 1
- `X-Raft-Index` is similar to the etcd index but is for the underlying raft protocol
- `X-Raft-Term` is an integer that will increase whenever an etcd master election happens in the cluster. If this number is increasing rapidly, you may need to tune the election timeout. See the [tuning][tuning] section for details.
[tuning]: tuning.md
[tuning]: #tuning
### Get the value of a key
@@ -356,13 +356,6 @@ So the first watch after the get should be:
curl 'http://127.0.0.1:2379/v2/keys/foo?wait=true&waitIndex=2008'
```
#### Connection being closed prematurely
The server may close a long polling connection before emitting any events.
This can happend due to a timeout or the server being shutdown.
Since the HTTP header is sent immediately upon accepting the connection, the response will be seen as empty: `200 OK` and empty body.
The clients should be prepared to deal with this scenario and retry the watch.
### Atomically Creating In-Order Keys
Using `POST` on a directory, you can create keys with key names that are created in-order.
@@ -380,7 +373,7 @@ curl http://127.0.0.1:2379/v2/keys/queue -XPOST -d value=Job1
"action": "create",
"node": {
"createdIndex": 6,
"key": "/queue/00000000000000000006",
"key": "/queue/6",
"modifiedIndex": 6,
"value": "Job1"
}
@@ -399,7 +392,7 @@ curl http://127.0.0.1:2379/v2/keys/queue -XPOST -d value=Job2
"action": "create",
"node": {
"createdIndex": 29,
"key": "/queue/00000000000000000029",
"key": "/queue/29",
"modifiedIndex": 29,
"value": "Job2"
}
@@ -423,13 +416,13 @@ curl -s 'http://127.0.0.1:2379/v2/keys/queue?recursive=true&sorted=true'
"nodes": [
{
"createdIndex": 2,
"key": "/queue/00000000000000000002",
"key": "/queue/2",
"modifiedIndex": 2,
"value": "Job1"
},
{
"createdIndex": 3,
"key": "/queue/00000000000000000003",
"key": "/queue/3",
"modifiedIndex": 3,
"value": "Job2"
}
@@ -472,7 +465,7 @@ curl http://127.0.0.1:2379/v2/keys/dir -XPUT -d ttl=30 -d dir=true -d prevExist=
Keys that are under this directory work as usual, but when the directory expires, a watcher on a key under the directory will get an expire event:
```sh
curl 'http://127.0.0.1:2379/v2/keys/dir?wait=true'
curl 'http://127.0.0.1:2379/v2/keys/dir/asdf?wait=true'
```
```json

View File

@@ -24,6 +24,49 @@ https://github.com/coreos/etcd/blob/master/Documentation/configuration.md.
The default data dir location has changed from {$hostname}.etcd to {name}.etcd.
## Data Directory Migration
The disk format within the data directory changed with etcd 2.0.
If you run etcd 2.0 on an etcd 0.4 data directory it will automatically migrate the data and start.
You will want to coordinate this upgrade by walking through each of your machines in the cluster, stopping etcd 0.4 and then starting etcd 2.0.
If you would rather manually do the migration, to test it out first in another environment, you can use the [migration tool doc][migrationtooldoc].
[migrationtooldoc]: https://github.com/coreos/etcd/blob/master/tools/etcd-migrate/README.md
## Snapshot Migration
If you are only interested in the data in etcd you can migrate a snapshot of your data from a v0.4.9+ cluster into a new etcd 2.0 cluster using a snapshot migration.
The advantage of this method is that you are directly dumping only the etcd data so you can run your old and new cluster side-by-side, snapshot the data, import it and then point your applications at this cluster.
The disadvantage is that the etcd indexes of your data will change which may confuse applications that use etcd.
To get started get the newest data snapshot from the 0.4.9+ cluster:
```
curl http://cluster.example.com:4001/v2/migration/snapshot > backup.snap
```
Now, import the snapshot into your new cluster:
```
etcdctl -C new_cluster.example.com import --snap backup.snap
```
If you have a large amount of data, you can specify more concurrent works to copy data in parallel by using `-c` flag.
If you have hidden keys to copy, you can use `--hidden` flag to specify.
And the data will quickly copy into the new cluster:
```
entering dir: /
entering dir: /foo
entering dir: /foo/bar
copying key: /foo/bar/1 1
entering dir: /
entering dir: /foo2
entering dir: /foo2/bar2
copying key: /foo2/bar2/2 2
```
## Key-Value API
### Read consistency flag

View File

@@ -2,12 +2,4 @@
etcd benchmarks will be published regularly and tracked for each release below:
- [etcd v2.1.0-alpha](./etcd-2-1-0-alpha-benchmarks.md)
- [etcd v2.2.0-rc](./etcd-2-2-0-rc-benchmarks.md)
- [etcd v3 demo](./etcd-3-demo-benchmarks.md)
# Memory Usage Benchmarks
It records expected memory usage in different scenarios.
- [etcd v2.2.0-rc](./etcd-2-2-0-rc-memory-benchmarks.md)
- [etcd v2.1.0](etcd-2-1-0-benchmarks.md)

View File

@@ -6,7 +6,7 @@ GCE n1-highcpu-2 machine type
- 1x dedicated slow disk for the OS
- 1.8 GB memory
- 2x CPUs
- etcd version 2.1.0 alpha
- etcd version 2.1.0
## etcd Cluster

View File

@@ -1,67 +0,0 @@
## Physical machines
GCE n1-highcpu-2 machine type
- 1x dedicated local SSD mounted under /var/lib/etcd
- 1x dedicated slow disk for the OS
- 1.8 GB memory
- 2x CPUs
## etcd Cluster
3 etcd 2.2.0-rc members, each runs on a single machine.
Detailed versions:
```
etcd Version: 2.2.0-alpha.1+git
Git SHA: 59a5a7e
Go Version: go1.4.2
Go OS/Arch: linux/amd64
```
Also, we use 3 etcd 2.1.0 alpha-stage members to form cluster to get base performance. etcd's commit head is at [c7146bd5](https://github.com/coreos/etcd/commits/c7146bd5f2c73716091262edc638401bb8229144), which is the same as the one that we use in [etcd 2.1 benchmark](./etcd-2-1-0-benchmarks.md).
## Testing
Bootstrap another machine and use benchmark tool [boom](https://github.com/rakyll/boom) to send requests to each etcd member. Check [here](../../hack/benchmark/) for instructions.
## Performance
### reading one single key
| key size in bytes | number of clients | target etcd server | read QPS | 90th Percentile Latency (ms) |
|-------------------|-------------------|--------------------|----------|---------------|
| 64 | 1 | leader only | 2804 (-5%) | 0.4 (+0%) |
| 64 | 64 | leader only | 17816 (+0%) | 5.7 (-6%) |
| 64 | 256 | leader only | 18667 (-6%) | 20.4 (+2%) |
| 256 | 1 | leader only | 2181 (-15%) | 0.5 (+25%) |
| 256 | 64 | leader only | 17435 (-7%) | 6.0 (+9%) |
| 256 | 256 | leader only | 18180 (-8%) | 21.3 (+3%) |
| 64 | 64 | all servers | 46965 (-4%) | 2.1 (+0%) |
| 64 | 256 | all servers | 55286 (-6%) | 7.4 (+6%) |
| 256 | 64 | all servers | 46603 (-6%) | 2.1 (+5%) |
| 256 | 256 | all servers | 55291 (-6%) | 7.3 (+4%) |
### writing one single key
| key size in bytes | number of clients | target etcd server | write QPS | 90th Percentile Latency (ms) |
|-------------------|-------------------|--------------------|-----------|---------------|
| 64 | 1 | leader only | 76 (+22%) | 19.4 (-15%) |
| 64 | 64 | leader only | 2461 (+45%) | 31.8 (-32%) |
| 64 | 256 | leader only | 4275 (+1%) | 69.6 (-10%) |
| 256 | 1 | leader only | 64 (+20%) | 16.7 (-30%) |
| 256 | 64 | leader only | 2385 (+30%) | 31.5 (-19%) |
| 256 | 256 | leader only | 4353 (-3%) | 74.0 (+9%) |
| 64 | 64 | all servers | 2005 (+81%) | 49.8 (-55%) |
| 64 | 256 | all servers | 4868 (+35%) | 81.5 (-40%) |
| 256 | 64 | all servers | 1925 (+72%) | 47.7 (-59%) |
| 256 | 256 | all servers | 4975 (+36%) | 70.3 (-36%) |
### performance changes explanation
- read QPS in most scenarios is decreased by 5~8%. The reason is that etcd records store metrics for each store operation. The metrics is important for monitoring and debugging, so this is acceptable.
- write QPS to leader is increased by 20~30%. This is because we decouple raft main loop and entry apply loop, which avoids them blocking each other.
- write QPS to all servers is increased by 30~80% because follower could receive latest commit index earlier and commit proposals faster.

View File

@@ -1,47 +0,0 @@
## Physical machine
GCE n1-standard-2 machine type
- 1x dedicated local SSD mounted under /var/lib/etcd
- 1x dedicated slow disk for the OS
- 7.5 GB memory
- 2x CPUs
## etcd
```
etcd Version: 2.2.0-rc.0+git
Git SHA: 103cb5c
Go Version: go1.5
Go OS/Arch: linux/amd64
```
## Testing
Start 3-member etcd cluster, each of which uses 2 cores.
The length of key name is always 64 bytes, which is a reasonable length of average key bytes.
## Memory Maximal Usage
- etcd may use maximal memory if one follower is dead and the leader keeps sending snapshots.
- `max RSS` is the maximal memory usage recorded in 3 runs.
| value bytes | key number | data size(MB) | max RSS(MB) | max RSS/data rate on leader |
|-------------|-------------|---------------|-------------|-----------------------------|
| 128 | 50000 | 6 | 433 | 72x |
| 128 | 100000 | 12 | 659 | 54x |
| 128 | 200000 | 24 | 1466 | 61x |
| 1024 | 50000 | 48 | 1253 | 26x |
| 1024 | 100000 | 96 | 2344 | 24x |
| 1024 | 200000 | 192 | 4361 | 22x |
## Data Size Threshold
- When etcd reaches data size threshold, it may trigger leader election easily and drop part of proposals.
- At most cases, etcd cluster should work smoothly if it doesn't hit the threshold. If it doesn't work well due to insufficient resources, you need to decrease its data size.
| value bytes | key number limitation | suggested data size threshold(MB) | consumed RSS(MB) |
|-------------|-----------------------|-----------------------------------|------------------|
| 128 | 400K | 48 | 2400 |
| 1024 | 300K | 292 | 6500 |

View File

@@ -1,40 +0,0 @@
## Physical machines
GCE n1-highcpu-2 machine type
- 1x dedicated local SSD mounted under /var/lib/etcd
- 1x dedicated slow disk for the OS
- 1.8 GB memory
- 2x CPUs
- etcd version 2.2.0
## etcd Cluster
1 etcd member running in v3 demo mode
## Testing
Use [etcd v3 benchmark tool](../../hack/v3benchmark/).
## Performance
### reading one single key
| key size in bytes | number of clients | read QPS | 90th Percentile Latency (ms) |
|-------------------|-------------------|----------|---------------|
| 256 | 1 | 2716 | 0.4 |
| 256 | 64 | 16623 | 6.1 |
| 256 | 256 | 16622 | 21.7 |
The performance is nearly the same as the one with empty server handler.
### reading one single key after putting
| key size in bytes | number of clients | read QPS | 90th Percentile Latency (ms) |
|-------------------|-------------------|----------|---------------|
| 256 | 1 | 2269 | 0.5 |
| 256 | 64 | 13582 | 8.6 |
| 256 | 256 | 13262 | 47.5 |
The performance with empty server handler is not affected by one put. So the
performance downgrade should be caused by storage package.

View File

@@ -4,7 +4,7 @@
Starting an etcd cluster statically requires that each member knows another in the cluster. In a number of cases, you might not know the IPs of your cluster members ahead of time. In these cases, you can bootstrap an etcd cluster with the help of a discovery service.
Once an etcd cluster is up and running, adding or removing members is done via [runtime reconfiguration](runtime-configuration.md). To better understand the design behind runtime reconfiguration, we suggest you read [this](runtime-reconf-design.md).
Once an etcd cluster is up and running, adding or removing members is done via [runtime reconfiguration](runtime-configuration.md).
This guide will cover the following mechanisms for bootstrapping an etcd cluster:
@@ -38,8 +38,6 @@ Note that the URLs specified in `initial-cluster` are the _advertised peer URLs_
If you are spinning up multiple clusters (or creating and destroying a single cluster) with same configuration for testing purpose, it is highly recommended that you specify a unique `initial-cluster-token` for the different clusters. By doing this, etcd can generate unique cluster IDs and member IDs for the clusters even if they otherwise have the exact same configuration. This can protect you from cross-cluster-interaction, which might corrupt your clusters.
etcd listens on [`listen-client-urls`](configuration.md#-listen-client-urls) to accept client traffic. etcd member advertises the URLs specified in [`advertise-client-urls`](configuration.md#-advertise-client-urls) to other members, proxies, clients. Please make sure the `advertise-client-urls` are reachable from intended clients. A common mistake is setting `advertise-client-urls` to localhost or leave it as default when you want the remote clients to reach etcd.
On each machine you would start etcd with these flags:
```
@@ -124,8 +122,6 @@ There two methods that can be used for discovery:
### etcd Discovery
To better understand the design about discovery service protocol, we suggest you read [this](./discovery_protocol.md).
#### Lifetime of a Discovery URL
A discovery URL identifies a unique etcd cluster. Instead of reusing a discovery URL, you should always create discovery URLs for new clusters.
@@ -148,8 +144,6 @@ If you bootstrap an etcd cluster using discovery service with more than the expe
The URL you will use in this case will be `https://myetcd.local/v2/keys/discovery/6c007a14875d53d9bf0ef5a6fc0257c817f0fb83` and the etcd members will use the `https://myetcd.local/v2/keys/discovery/6c007a14875d53d9bf0ef5a6fc0257c817f0fb83` directory for registration as they start.
Each member must have a different name flag specified. Or discovery will fail due to duplicated name.
Now we start etcd with those relevant flags for each member:
```
@@ -200,8 +194,6 @@ ETCD_DISCOVERY=https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573d
-discovery https://discovery.etcd.io/3e86b59982e49066c5d813af1c2e2579cbf573de
```
Each member must have a different name flag specified. Or discovery will fail due to duplicated name.
Now we start etcd with those relevant flags for each member:
```
@@ -304,8 +296,6 @@ infra2.example.com. 300 IN A 10.0.1.12
etcd cluster members can listen on domain names or IP address, the bootstrap process will resolve DNS A records.
The resolved address in `-initial-advertise-peer-urls` *must match* one of the resolved addresses in the SRV targets. The etcd member reads the resolved address to find out if it belongs to the cluster defined in the SRV records.
```
$ etcd -name infra0 \
-discovery-srv example.com \
@@ -382,10 +372,6 @@ DNS SRV records can also be used to configure the list of peers for an etcd serv
$ etcd --proxy on -discovery-srv example.com
```
#### Error Cases
You might see the an error like `cannot find local etcd $name from SRV records.`. That means the etcd member fails to find itself from the cluster defined in SRV records. The resolved address in `-initial-advertise-peer-urls` *must match* one of the resolved addresses in the SRV targets.
# 0.4 to 2.0+ Migration Guide
In etcd 2.0 we introduced the ability to listen on more than one address and to advertise multiple addresses. This makes using etcd easier when you have complex networking, such as private and public networks on various cloud providers.

View File

@@ -13,64 +13,45 @@ To start etcd automatically using custom settings at startup in Linux, using a [
##### -name
+ Human-readable name for this member.
+ default: "default"
+ env variable: ETCD_NAME
+ This value is referenced as this node's own entries listed in the `-initial-cluster` flag (Ex: `default=http://localhost:2380` or `default=http://localhost:2380,default=http://localhost:7001`). This needs to match the key used in the flag if you're using [static boostrapping](clustering.md#static).
##### -data-dir
+ Path to the data directory.
+ default: "${name}.etcd"
+ env variable: ETCD_DATA_DIR
##### -wal-dir
+ Path to the dedicated wal directory. If this flag is set, etcd will write the WAL files to the walDir rather than the dataDir. This allows a dedicated disk to be used, and helps avoid io competition between logging and other IO operations.
+ default: ""
+ env variable: ETCD_WAL_DIR
##### -snapshot-count
+ Number of committed transactions to trigger a snapshot to disk.
+ default: "10000"
+ env variable: ETCD_SNAPSHOT_COUNT
##### -heartbeat-interval
+ Time (in milliseconds) of a heartbeat interval.
+ default: "100"
+ env variable: ETCD_HEARTBEAT_INTERVAL
##### -election-timeout
+ Time (in milliseconds) for an election to timeout. See [Documentation/tuning.md](tuning.md#time-parameters) for details.
+ default: "1000"
+ env variable: ETCD_ELECTION_TIMEOUT
##### -listen-peer-urls
+ List of URLs to listen on for peer traffic. This flag tells the etcd to accept incoming requests from its peers on the specified scheme://IP:port combinations. Scheme can be either http or https.If 0.0.0.0 is specified as the IP, etcd listens to the given port on all interfaces. If an IP address is given as well as a port, etcd will listen on the given port and interface. Multiple URLs may be used to specify a number of addresses and ports to listen on. The etcd will respond to requests from any of the listed addresses and ports.
+ List of URLs to listen on for peer traffic.
+ default: "http://localhost:2380,http://localhost:7001"
+ env variable: ETCD_LISTEN_PEER_URLS
+ example: "http://10.0.0.1:2380"
+ invalid example: "http://example.com:2380" (domain name is invalid for binding)
##### -listen-client-urls
+ List of URLs to listen on for client traffic. This flag tells the etcd to accept incoming requests from the clients on the specified scheme://IP:port combinations. Scheme can be either http or https. If 0.0.0.0 is specified as the IP, etcd listens to the given port on all interfaces. If an IP address is given as well as a port, etcd will listen on the given port and interface. Multiple URLs may be used to specify a number of addresses and ports to listen on. The etcd will respond to requests from any of the listed addresses and ports.
+ List of URLs to listen on for client traffic.
+ default: "http://localhost:2379,http://localhost:4001"
+ env variable: ETCD_LISTEN_CLIENT_URLS
+ example: "http://10.0.0.1:2379"
+ invalid example: "http://example.com:2379" (domain name is invalid for binding)
##### -max-snapshots
+ Maximum number of snapshot files to retain (0 is unlimited)
+ default: 5
+ env variable: ETCD_MAX_SNAPSHOTS
+ The default for users on Windows is unlimited, and manual purging down to 5 (or your preference for safety) is recommended.
##### -max-wals
+ Maximum number of wal files to retain (0 is unlimited)
+ default: 5
+ env variable: ETCD_MAX_WALS
+ The default for users on Windows is unlimited, and manual purging down to 5 (or your preference for safety) is recommended.
##### -cors
+ Comma-separated white list of origins for CORS (cross-origin resource sharing).
+ default: none
+ env variable: ETCD_CORS
### Clustering Flags
@@ -80,55 +61,43 @@ To start etcd automatically using custom settings at startup in Linux, using a [
##### -initial-advertise-peer-urls
+ List of this member's peer URLs to advertise to the rest of the cluster. These addresses are used for communicating etcd data around the cluster. At least one must be routable to all cluster members. These URLs can contain domain names.
+ List of this member's peer URLs to advertise to the rest of the cluster. These addresses are used for communicating etcd data around the cluster. At least one must be routable to all cluster members.
+ default: "http://localhost:2380,http://localhost:7001"
+ env variable: ETCD_INITIAL_ADVERTISE_PEER_URLS
+ example: "http://example.com:2380, http://10.0.0.1:2380"
##### -initial-cluster
+ Initial cluster configuration for bootstrapping.
+ default: "default=http://localhost:2380,default=http://localhost:7001"
+ env variable: ETCD_INITIAL_CLUSTER
+ The key is the value of the `-name` flag for each node provided. The default uses `default` for the key because this is the default for the `-name` flag.
##### -initial-cluster-state
+ Initial cluster state ("new" or "existing"). Set to `new` for all members present during initial static or DNS bootstrapping. If this option is set to `existing`, etcd will attempt to join the existing cluster. If the wrong value is set, etcd will attempt to start but fail safely.
+ default: "new"
+ env variable: ETCD_INITIAL_CLUSTER_STATE
[static bootstrap]: clustering.md#static
##### -initial-cluster-token
+ Initial cluster token for the etcd cluster during bootstrap.
+ default: "etcd-cluster"
+ env variable: ETCD_INITIAL_CLUSTER_TOKEN
##### -advertise-client-urls
+ List of this member's client URLs to advertise to the rest of the cluster. These URLs can contain domain names.
+ List of this member's client URLs to advertise to the rest of the cluster.
+ default: "http://localhost:2379,http://localhost:4001"
+ env variable: ETCD_ADVERTISE_CLIENT_URLS
+ example: "http://example.com:2379, http://10.0.0.1:2379"
+ Be careful if you are advertising URLs such as http://localhost:2379 from a cluster member and are using the proxy feature of etcd. This will cause loops, because the proxy will be forwarding requests to itself until its resources (memory, file descriptors) are eventually depleted.
##### -discovery
+ Discovery URL used to bootstrap the cluster.
+ default: none
+ env variable: ETCD_DISCOVERY
##### -discovery-srv
+ DNS srv domain used to bootstrap the cluster.
+ default: none
+ env variable: ETCD_DISCOVERY_SRV
##### -discovery-fallback
+ Expected behavior ("exit" or "proxy") when discovery services fails.
+ default: "proxy"
+ env variable: ETCD_DISCOVERY_FALLBACK
##### -discovery-proxy
+ HTTP proxy to use for traffic to discovery service.
+ default: none
+ env variable: ETCD_DISCOVERY_PROXY
### Proxy Flags
@@ -137,99 +106,81 @@ To start etcd automatically using custom settings at startup in Linux, using a [
##### -proxy
+ Proxy mode setting ("off", "readonly" or "on").
+ default: "off"
+ env variable: ETCD_PROXY
##### -proxy-failure-wait
+ Time (in milliseconds) an endpoint will be held in a failed state before being reconsidered for proxied requests.
+ default: 5000
+ env variable: ETCD_PROXY_FAILURE_WAIT
##### -proxy-refresh-interval
+ Time (in milliseconds) of the endpoints refresh interval.
+ default: 30000
+ env variable: ETCD_PROXY_REFRESH_INTERVAL
##### -proxy-dial-timeout
+ Time (in milliseconds) for a dial to timeout or 0 to disable the timeout
+ default: 1000
+ env variable: ETCD_PROXY_DIAL_TIMEOUT
##### -proxy-write-timeout
+ Time (in milliseconds) for a write to timeout or 0 to disable the timeout.
+ default: 5000
+ env variable: ETCD_PROXY_WRITE_TIMEOUT
##### -proxy-read-timeout
+ Time (in milliseconds) for a read to timeout or 0 to disable the timeout.
+ Don't change this value if you use watches because they are using long polling requests.
+ default: 0
+ env variable: ETCD_PROXY_READ_TIMEOUT
### Security Flags
The security flags help to [build a secure etcd cluster][security].
##### -ca-file [DEPRECATED]
+ Path to the client server TLS CA file. `-ca-file ca.crt` could be replaced by `-trusted-ca-file ca.crt -client-cert-auth` and etcd will perform the same.
+ Path to the client server TLS CA file.
+ default: none
+ env variable: ETCD_CA_FILE
##### -cert-file
+ Path to the client server TLS cert file.
+ default: none
+ env variable: ETCD_CERT_FILE
##### -key-file
+ Path to the client server TLS key file.
+ default: none
+ env variable: ETCD_KEY_FILE
##### -client-cert-auth
+ Enable client cert authentication.
+ default: false
+ env variable: ETCD_CLIENT_CERT_AUTH
##### -trusted-ca-file
+ Path to the client server TLS trusted CA key file.
+ default: none
+ env variable: ETCD_TRUSTED_CA_FILE
##### -peer-ca-file [DEPRECATED]
+ Path to the peer server TLS CA file. `-peer-ca-file ca.crt` could be replaced by `-peer-trusted-ca-file ca.crt -peer-client-cert-auth` and etcd will perform the same.
+ Path to the peer server TLS CA file.
+ default: none
+ env variable: ETCD_PEER_CA_FILE
##### -peer-cert-file
+ Path to the peer server TLS cert file.
+ default: none
+ env variable: ETCD_PEER_CERT_FILE
##### -peer-key-file
+ Path to the peer server TLS key file.
+ default: none
+ env variable: ETCD_PEER_KEY_FILE
##### -peer-client-cert-auth
+ Enable peer client cert authentication.
+ default: false
+ env variable: ETCD_PEER_CLIENT_CERT_AUTH
##### -peer-trusted-ca-file
+ Path to the peer server TLS trusted CA file.
+ default: none
+ env variable: ETCD_PEER_TRUSTED_CA_FILE
### Logging Flags
##### -debug
+ Drop the default log level to DEBUG for all subpackages.
+ default: false (INFO for all packages)
+ env variable: ETCD_DEBUG
##### -log-package-levels
+ Set individual etcd subpackages to specific log levels. An example being `etcdserver=WARNING,security=DEBUG`
+ default: none (INFO for all packages)
+ env variable: ETCD_LOG_PACKAGE_LEVELS
### Unsafe Flags
@@ -241,14 +192,6 @@ Follow the instructions when using these flags.
##### -force-new-cluster
+ Force to create a new one-member cluster. It commits configuration changes in force to remove all existing members in the cluster and add itself. It needs to be set to [restore a backup][restore].
+ default: false
+ env variable: ETCD_FORCE_NEW_CLUSTER
### Experimental Flags
##### -experimental-v3demo
+ Enable experimental [v3 demo API](rfc/v3api.proto).
+ default: false
+ env variable: ETCD_EXPERIMENTAL_V3DEMO
### Miscellaneous Flags

View File

@@ -1,109 +0,0 @@
# etcd release guide
The guide talks about how to release a new version of etcd.
The procedure includes some manual steps for sanity checking but it can probably be further scripted. Please keep this document up-to-date if you want to make changes to the release process.
## Prepare Release
Set desired version as environment variable for following steps. Here is an example to release 2.1.3:
```
export VERSION=v2.1.3
export PREV_VERSION=v2.1.2
```
All releases version numbers follow the format of [semantic versioning 2.0.0](http://semver.org/).
### Major, Minor Version Release, or its Pre-release
- Ensure the relevant milestone on GitHub is complete. All referenced issues should be closed, or moved elsewhere.
- Remove this release from [roadmap](https://github.com/coreos/etcd/blob/master/ROADMAP.md), if necessary.
- Ensure the latest upgrade documentation is available.
- Bump [hardcoded MinClusterVerion in the repository](https://github.com/coreos/etcd/blob/master/version/version.go#L29), if necessary.
- Add feature capability maps for the new version, if necessary.
### Patch Version Release
- Discuss about commits that are backported to the patch release. The commits should not include merge commits.
- Cherry-pick these commits starting from the oldest one into stable branch.
## Write Release Note
- Write introduction for the new release. For example, what major bug we fix, what new features we introduce or what performance improvement we make.
- Write changelog for the last release. ChangeLog should be straightforward and easy to understand for the end-user.
- Put `[GH XXXX]` at the head of change line to reference Pull Request that introduces the change. Moreover, add a link on it to jump to the Pull Request.
## Tag Version
- Bump [hardcoded Version in the repository](https://github.com/coreos/etcd/blob/master/version/version.go#L30) to the latest version `${VERSION}`.
- Ensure all tests on CI system are passed.
- Manually check etcd is buildable in Linux, Darwin and Windows.
- Manually check upgrade etcd cluster of previous minor version works well.
- Manually check new features work well.
- Add a signed tag through `git tag -s ${VERSION}`.
- Sanity check tag correctness through `git show tags/$VERSION`.
- Push the tag to GitHub through `git push origin tags/$VERSION`. This assumes `origin` corresponds to "https://github.com/coreos/etcd".
## Build Release Binaries and Images
- Ensure `actool` is available, or installing it through `go get github.com/appc/spec/actool`.
- Ensure `docker` is available.
Run release script in root directory:
```
./scripts/release.sh ${VERSION}
```
It generates all release binaries and images under directory ./release.
## Sign Binaries and Images
Choose appropriate private key to sign the generated binaries and images.
The following commands are used for public release sign:
```
cd release
# personal GPG is okay for now
for i in etcd-*{.zip,.tar.gz}; do gpg --sign ${i}; done
# use `CoreOS ACI Builder <release@coreos.com>` secret key
gpg -u 88182190 -a --output etcd-${VERSION}-linux-amd64.aci.asc --detach-sig etcd-${VERSION}-linux-amd64.aci
```
## Publish Release Page in GitHub
- Set release title as the version name.
- Follow the format of previous release pages.
- Attach the generated binaries, aci image and signatures.
- Select whether it is a pre-release.
- Publish the release!
## Publish Docker Image in Quay.io
- Push docker image:
```
docker login quay.io
docker push quay.io/coreos/etcd:${VERSION}
```
- Add `latest` tag to the new image on [quay.io](https://quay.io/repository/coreos/etcd?tag=latest&tab=tags) if this is a stable release.
## Announce to etcd-dev Googlegroup
- Follow the format of [previous release emails](https://groups.google.com/forum/#!forum/etcd-dev).
- Make sure to include a list of authors that contributed since the previous release - something like the following might be handy:
```
git log ...${PREV_VERSION} --pretty=format:"%an" | sort | uniq | tr '\n' ',' | sed -e 's#,#, #g' -e 's#, $##'
```
- Send email to etcd-dev@googlegroups.com
## Post Release
- Create new stable branch through `git push origin ${VERSION_MAJOR}.${VERSION_MINOR}` if this is a major stable release. This assumes `origin` corresponds to "https://github.com/coreos/etcd".
- Bump [hardcoded Version in the repository](https://github.com/coreos/etcd/blob/master/version/version.go#L30) to the version `${VERSION}+git`.

View File

@@ -1,109 +0,0 @@
# Discovery Service Protocol
Discovery service protocol helps new etcd member to discover all other members in cluster bootstrap phase using a shared discovery URL.
Discovery service protocol is _only_ used in cluster bootstrap phase, and cannot be used for runtime reconfiguration or cluster monitoring.
The protocol uses a new discovery token to bootstrap one _unique_ etcd cluster. Remember that one discovery token can represent only one etcd cluster. As long as discovery protocol on this token starts, even if fails halfway, it must not be used to bootstrap another etcd cluster.
The rest of this article will walk through the discovery process with examples that correspond to a self-hosted discovery cluster. The public discovery service, discovery.etcd.io, functions the same way, but with a layer of polish to abstract away ugly URLs, generate UUIDs automatically, and provide some protections against excessive requests. At its core, the public discovery service still uses an etcd cluster as the data store as described in this document.
## The Protocol Workflow
The idea of discovery protocol is to use an internal etcd cluster to coordinate bootstrap of a new cluster. First, all new members interact with discovery service and help to generate the expected member list. Then each new member bootstraps its server using this list, which performs the same functionality as -initial-cluster flag.
In the following example workflow, we will list each step of protocol in curl format for ease of understanding.
By convention the etcd discovery protocol uses the key prefix `_etcd/registry`. If `http://example.com` hosts a etcd cluster for discovery service, a full URL to discovery keyspace will be `http://example.com/v2/keys/_etcd/registry`. We will use this as the URL prefix in the example.
### Creating a New Discovery Token
Generate a unique token that will identify the new cluster. This will be used as a unique prefix in discovery keyspace in the following steps. An easy way to do this is to use `uuidgen`:
```
UUID=$(uuidgen)
```
### Specifying the Expected Cluster Size
You need to specify the expected cluster size for this discovery token. The size is used by the discovery service to know when it has found all members that will initially form the cluster.
```
curl -X PUT http://example.com/v2/keys/_etcd/registry/${UUID}/_config/size -d value=${cluster_size}
```
Usually the cluster size is 3, 5 or 7. Check [optimal cluster size](admin_guide.md#optimal-cluster-size) for more details.
### Bringing up etcd Processes
Now that you have your discovery URL, you can use it as `-discovery` flag and bring up etcd processes. Every etcd process will follow this next few steps internally if given a `-discovery` flag.
### Registering itself
The first thing for etcd process is to register itself into the discovery URL as a member. This is done by creating member ID as a key in the discovery URL.
```
curl -X PUT http://example.com/v2/keys/_etcd/registry/${UUID}/${member_id}?prevExist=false -d value="${member_name}=${member_peer_url_1}&${member_name}=${member_peer_url_2}"
```
### Checking the Status
It checks the expected cluster size and registration status in discovery URL, and decides what the next action is.
```
curl -X GET http://example.com/v2/keys/_etcd/registry/${UUID}/_config/size
curl -X GET http://example.com/v2/keys/_etcd/registry/${UUID}
```
If registered members are still not enough, it will wait for left members to appear.
If the number of registered members is bigger than the expected size N, it treats the first N registered members as the member list for the cluster. If the member itself is in the member list, the discovery procedure succeeds and it fetches all peers through the member list. If it is not in the member list, the discovery procedure finishes with the failure that the cluster has been full.
In etcd implementation, the member may check the cluster status even before registering itself. So it could fail quickly if the cluster has been full.
### Waiting for All Members
The wait process is described in details [here](https://github.com/coreos/etcd/blob/master/Documentation/api.md#waiting-for-a-change).
```
curl -X GET http://example.com/v2/keys/_etcd/registry/${UUID}?wait=true&waitIndex=${current_etcd_index}
```
It keeps waiting until finding all members.
## Public Discovery Service
CoreOS Inc. hosts a public discovery service at https://discovery.etcd.io/ , which provides some nice features for ease of use.
### Mask Key Prefix
Public discovery service will redirect `https://discovery.etcd.io/${UUID}` to etcd cluster behind for the key at `/v2/keys/_etcd/registry`. It masks register key prefix for short and readable discovery url.
### Get new token
```
GET /new
Sent query:
size=${cluster_size}
Possible status codes:
200 OK
400 Bad Request
200 Body:
generated discovery url
```
The generation process in the service follows the step from [Creating a New Discovery Token](#creating-a-new-discovery-token) to [Specifying the Expected Cluster Size](#specifying-the-expected-cluster-size).
### Check Discovery Status
```
GET /${UUID}
```
You can check the status for this discovery token, including the machines that have been registered, by requesting the value of the UUID.
### Open-source repository
The repository is located at https://github.com/coreos/discovery.etcd.io. You could use it to build your own public discovery service.

View File

@@ -1,80 +0,0 @@
# FAQ
## 1) How come I can read an old version of the data when a majority of the members are down?
In situations where a client connects to a minority, etcd
favors by default availability over consistency. This means that even though
data might be “out of date”, it is still better to return something versus
nothing.
In order to confirm that a read is up to date with a majority of the cluster,
the client can use the `quorum=true` parameter on reads of keys. This means
that a majority of the cluster is checked on reads before returning the data,
otherwise the read will timeout and fail.
## 2) With quorum=false, doesnt this mean that if my client switched the member it was connected to, that it could experience a logical ordering where the cluster goes backwards in time?
Yes, but this could be handled at the etcd client implementation via
remembering the last seen index. The “index” is the cluster's single
irrevocable sequence of the entire modification history. The client could
remember the last seen index, and determine via comparing the index returned on
the GET whether or not the state of the key-value pair is before or after its
last seen state.
## 3) What happens if a watch is registered on a minority member?
The watch will stay untriggered, even as modifications are occurring in the
majority quorum. This is an open issue, and is being addressed in v3. There are
multiple ways to work around the watch trigger not firing.
1) build a signaling mechanism independent of etcd. This could be as simple as
a “pulse” to the client to reissue a GET with quorum=true for the most recent
version of the data.
2) poll on the `/v2/keys` endpoint and check that the raft-index is increasing every
timeout.
## 4) What is a proxy used for?
A proxy is a redirection server to the etcd cluster. The proxy handles the
redirection of a client to the current configuration of the etcd cluster. A
typical usecase is to start a proxy on a machine, and on first boot up of the
proxy specify both the `--proxy` flag and the `--initial-cluster` flag.
From there, any etcdctl client that starts up automatically speaks to the local
proxy and the proxy redirects operations to the current configuration of the
cluster it was originally paired with.
In the v2 spec of etcd, proxies cannot be promoted to members of the cluster.
They also cannot be promoted to followers or at any point become part of the
replication of the etcd cluster itself.
## 5) How is cluster membership and health handled in etcd v2?
The design goal of etcd is that reconfiguration is simply an API, and health
monitoring and addition/removal of members is up to the individual application
and their integration with the reconfiguration API.
Thus, a member that is down, even infinitely, will never be automatically
removed from the etcd cluster member list.
This makes sense because its usually an application level / administrative
action to determine whether a reconfiguration should happen based on health.
For more information, refer to [Documentation/runtime-reconfiguration.md].
## 6) how does --peers work with etcdctl?
The `--peers` flag can specify any number of etcd cluster members in a comma
separated list. This list might be a subset, equal to, or more than the actual
etcd cluster member list itself.
If only one peer is specified via the `--peers` flag, the etcdctl discovers the
rest of the cluster via the member list of that one peer, and then it randomly
chooses a member to use. Again, the client can use the `quorum=true` flag on
reads, which will always fail when using a member in the minority.
If peers from multiple clusters are specified via the `--peers` flag, etcdctl
will randomly choose a peer, and the request will simply get routed to one of
the clusters. This is probably not what you want.

View File

@@ -45,7 +45,6 @@
**C libraries**
- [jdarcy/etcd-api](https://github.com/jdarcy/etcd-api) - Supports v2
- [shafreeck/cetcd](https://github.com/shafreeck/cetcd) - Supports v2
**C++ libraries**
- [edwardcapriolo/etcdcpp](https://github.com/edwardcapriolo/etcdcpp) - Supports v2

View File

@@ -1,12 +1,12 @@
## Proxy
etcd can now run as a transparent proxy. Running etcd as a proxy allows for easily discovery of etcd within your infrastructure, since it can run on each machine as a local service. In this mode, etcd acts as a reverse proxy and forwards client requests to an active etcd cluster. The etcd proxy does not participate in the consensus replication of the etcd cluster, thus it neither increases the resilience nor decreases the write performance of the etcd cluster.
etcd can now run as a transparent proxy. Running etcd as a proxy allows for easily discovery of etcd within your infrastructure, since it can run on each machine as a local service. In this mode, etcd acts as a reverse proxy and forwards client requests to an active etcd cluster. The etcd proxy does not participant in the consensus replication of the etcd cluster, thus it neither increases the resilience nor decreases the write performance of the etcd cluster.
etcd currently supports two proxy modes: `readwrite` and `readonly`. The default mode is `readwrite`, which forwards both read and write requests to the etcd cluster. A `readonly` etcd proxy only forwards read requests to the etcd cluster, and returns `HTTP 501` to all write requests.
The proxy will shuffle the list of cluster members periodically to avoid sending all connections to a single member.
The member list used by proxy consists of all client URLs advertised within the cluster, as specified in each members' `-advertise-client-urls` flag. If this flag is set incorrectly, requests sent to the proxy are forwarded to wrong addresses and then fail. Including URLs in the `-advertise-client-urls` flag that point to the proxy itself, e.g. http://localhost:2379, is even more problematic as it will cause loops, because the proxy keeps trying to forward requests to itself until its resources (memory, file descriptors) are eventually depleted. The fix for this problem is to restart etcd member with correct `-advertise-client-urls` flag. After client URLs list in proxy is recalculated, which happens every 30 seconds, requests will be forwarded correctly.
The member list used by proxy consists of all client URLs advertised within the cluster, as specified in each members' `-advertise-client-urls` flag. If this flag is set incorrectly, requests sent to the proxy are forwarded to wrong addresses and then fail. The fix for this problem is to restart etcd member with correct `-advertise-client-urls` flag. After client URLs list in proxy is recalculated, which happens every 30 seconds, requests will be forwarded correctly.
### Using an etcd proxy
To start etcd in proxy mode, you need to provide three flags: `proxy`, `listen-client-urls`, and `initial-cluster` (or `discovery`).
@@ -17,7 +17,6 @@ The proxy will be listening on `listen-client-urls` and forward requests to the
#### Start an etcd proxy with a static configuration
To start a proxy that will connect to a statically defined etcd cluster, specify the `initial-cluster` flag:
```
etcd -proxy on -listen-client-urls http://127.0.0.1:8080 -initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380
```

View File

@@ -1,43 +0,0 @@
## Reporting Bugs
If you find bugs or documentation mistakes in etcd project, please let us know by [opening an issue](https://github.com/coreos/etcd/issues/new). We treat bugs and mistakes very seriously and believe no issue is too small. Before creating a bug report, please check there that one does not already exist.
To make your bug report accurate and easy to understand, please try to create bug reports that are:
- Specific. Include as much details as possible: which version, what environment, what configuration, etc. You can also attach etcd log (the starting log with etcd configuration is especially important).
- Reproducible. Include the steps to reproduce the problem. We understand some issues might be hard to reproduce, please includes the steps that might lead to the problem. You can also attach the affected etcd data dir and stack strace to the bug report.
- Isolated. Please try to isolate and reproduce the bug with minimum dependencies. It would significantly slow down the speed to fix a bug if too many dependencies are involved in a bug report. Debugging external systems that rely on etcd is out of scope, but we are happy to point you in the right direction or help you interact with etcd in the correct manner.
- Unique. Do not duplicate existing bug report.
- Scoped. One bug per report. Do not follow up with another bug inside one report.
You might also want to read [Elika Etemads article on filing good bug reports](http://fantasai.inkedblade.net/style/talks/filing-good-bugs/) before creating a bug report.
We might ask you for further information to locate a bug. A duplicated bug report will be closed.
## Frequently Asked Questions
### How to get stack trace
``` bash
$ kill -QUIT $PID
```
### How to get etcd version
``` bash
$ etcd --version
```
### How to get etcd configuration and log when it runs as systemd service etcd2.service
``` bash
$ sudo systemctl cat etcd2
$ sudo journalctl -u etcd2
```
Due to an upstream systemd bug, journald may miss the last few log lines when its process exit. If journalctl tells you that etcd stops without fatal or panic message, you could try `sudo journalctl -f -t etcd2` to get full log.

View File

@@ -14,14 +14,14 @@
- more efficient/ low cost keep alive
- a logical group of TTL keys
5. Replace CAS/CAD with multi-object Txn
5. Replace CAS/CAD with multi-object Tnx
- MUCH MORE powerful and flexible
6. Support efficient watching with multiple ranges
7. RPC API supports the completed set of APIs.
- more efficient than JSON/HTTP
- additional txn/lease support
- additional tnx/lease support
8. HTTP API supports a subset of APIs.
- easy for people to try out etcd
@@ -42,7 +42,7 @@ Put( PutRequest { key = foo, value = bar } )
PutResponse {
cluster_id = 0x1000,
member_id = 0x1,
revision = 1,
index = 1,
raft_term = 0x1,
}
```
@@ -54,14 +54,14 @@ Get ( RangeRequest { key = foo } )
RangeResponse {
cluster_id = 0x1000,
member_id = 0x1,
revision = 1,
index = 1,
raft_term = 0x1,
kvs = {
{
key = foo,
value = bar,
create_revision = 1,
mod_revision = 1,
create_index = 1,
mod_index = 1,
version = 1;
},
},
@@ -75,35 +75,35 @@ Range ( RangeRequest { key = foo, end_key = foo80, limit = 30 } )
RangeResponse {
cluster_id = 0x1000,
member_id = 0x1,
revision = 100,
index = 100,
raft_term = 0x1,
kvs = {
{
key = foo0,
value = bar0,
create_revision = 1,
mod_revision = 1,
create_index = 1,
mod_index = 1,
version = 1;
},
...,
{
key = foo30,
value = bar30,
create_revision = 30,
mod_revision = 30,
create_index = 30,
mod_index = 30,
version = 1;
},
},
}
```
#### Finish a txn (assume we have foo0=bar0, foo1=bar1)
#### Finish a tnx (assume we have foo0=bar0, foo1=bar1)
```
Txn(TxnRequest {
// mod_revision of foo0 is equal to 1, mod_revision of foo1 is greater than 1
Tnx(TnxRequest {
// mod_index of foo0 is equal to 1, mod_index of foo1 is greater than 1
compare = {
{compareType = equal, key = foo0, mod_revision = 1},
{compareType = greater, key = foo1, mod_revision = 1}}
{compareType = equal, key = foo0, mod_index = 1},
{compareType = greater, key = foo1, mod_index = 1}}
},
// if the comparison succeeds, put foo2 = bar2
success = {PutRequest { key = foo2, value = success }},
@@ -111,10 +111,10 @@ Txn(TxnRequest {
failure = {PutRequest { key = foo2, value = failure }},
)
TxnResponse {
TnxResponse {
cluster_id = 0x1000,
member_id = 0x1,
revision = 3,
index = 3,
raft_term = 0x1,
succeeded = true,
responses = {
@@ -122,7 +122,7 @@ TxnResponse {
{
cluster_id = 0x1000,
member_id = 0x1,
revision = 3,
index = 3,
raft_term = 0x1,
}
}
@@ -135,8 +135,8 @@ TxnResponse {
Watch( WatchRequest{
key = foo,
end_key = fop, // prefix foo
start_revision = 20,
end_revision = 10000,
start_index = 20,
end_index = 10000,
// server decided notification frequency
progress_notification = true,
}
@@ -147,14 +147,14 @@ Watch( WatchRequest{
WatchResponse {
cluster_id = 0x1000,
member_id = 0x1,
revision = 3,
index = 3,
raft_term = 0x1,
event_type = put,
kv = {
key = foo0,
value = bar0,
create_revision = 1,
mod_revision = 1,
create_index = 1,
mod_index = 1,
version = 1;
},
}
@@ -164,7 +164,7 @@ WatchResponse {
WatchResponse {
cluster_id = 0x1000,
member_id = 0x1,
revision = 2000,
index = 2000,
raft_term = 0x1,
// nil event as notification
}
@@ -175,14 +175,14 @@ WatchResponse {
WatchResponse {
cluster_id = 0x1000,
member_id = 0x1,
revision = 3000,
index = 3000,
raft_term = 0x1,
event_type = put,
kv = {
key = foo0,
value = bar3000,
create_revision = 1,
mod_revision = 3000,
create_index = 1,
mod_index = 3000,
version = 2;
},
}

View File

@@ -6,19 +6,19 @@ service etcd {
rpc Range(RangeRequest) returns (RangeResponse) {}
// Put puts the given key into the store.
// A put request increases the revision of the store,
// A put request increases the index of the store,
// and generates one event in the event history.
rpc Put(PutRequest) returns (PutResponse) {}
// Delete deletes the given range from the store.
// A delete request increase the revision of the store,
// A delete request increase the index of the store,
// and generates one event in the event history.
rpc DeleteRange(DeleteRangeRequest) returns (DeleteRangeResponse) {}
// Txn processes all the requests in one transaction.
// A txn request increases the revision of the store,
// and generates events with the same revision in the event history.
rpc Txn(TxnRequest) returns (TxnResponse) {}
// Tnx processes all the requests in one transaction.
// A tnx request increases the index of the store,
// and generates events with the same index in the event history.
rpc Tnx(TnxRequest) returns (TnxResponse) {}
// Watch watches the events happening or happened in etcd. Both input and output
// are stream. One watch rpc can watch for multiple ranges and get a stream of
@@ -41,10 +41,10 @@ service etcd {
// LeaseAttach attaches keys with a lease.
rpc LeaseAttach(LeaseAttachRequest) returns (LeaseAttachResponse) {}
// LeaseTxn likes Txn. It has two addition success and failure LeaseAttachRequest list.
// If the Txn is successful, then the success list will be executed. Or the failure list
// LeaseTnx likes Tnx. It has two addition success and failure LeaseAttachRequest list.
// If the Tnx is successful, then the success list will be executed. Or the failure list
// will be executed.
rpc LeaseTxn(LeaseTxnRequest) returns (LeaseTxnResponse) {}
rpc LeaseTnx(LeaseTnxRequest) returns (LeaseTnxResponse) {}
// KeepAlive keeps the lease alive.
rpc LeaseKeepAlive(stream LeaseKeepAliveRequest) returns (stream LeaseKeepAliveResponse) {}
@@ -52,54 +52,51 @@ service etcd {
message ResponseHeader {
// an error type message?
string error = 1;
uint64 cluster_id = 2;
uint64 member_id = 3;
// revision of the store when the request was applied.
int64 revision = 4;
optional string error = 1;
optional uint64 cluster_id = 2;
optional uint64 member_id = 3;
// index of the store when the request was applied.
optional int64 index = 4;
// term of raft when the request was applied.
uint64 raft_term = 5;
optional uint64 raft_term = 5;
}
message RangeRequest {
// if the range_end is not given, the request returns the key.
bytes key = 1;
optional bytes key = 1;
// if the range_end is given, it gets the keys in range [key, range_end).
bytes range_end = 2;
optional bytes range_end = 2;
// limit the number of keys returned.
int64 limit = 3;
// range over the store at the given revision.
// if revision is less or equal to zero, range over the newest store.
// if the revision has been compacted, ErrCompaction will be returned in
// response.
int64 revision = 4;
optional int64 limit = 3;
// the response will be consistent with previous request with same token if the token is
// given and is vaild.
optional bytes consistent_token = 4;
}
message RangeResponse {
ResponseHeader header = 1;
repeated storagepb.KeyValue kvs = 2;
// more indicates if there are more keys to return in the requested range.
bool more = 3;
optional ResponseHeader header = 1;
repeated KeyValue kvs = 2;
optional bytes consistent_token = 3;
}
message PutRequest {
bytes key = 1;
bytes value = 2;
optional bytes key = 1;
optional bytes value = 2;
}
message PutResponse {
ResponseHeader header = 1;
optional ResponseHeader header = 1;
}
message DeleteRangeRequest {
// if the range_end is not given, the request deletes the key.
bytes key = 1;
optional bytes key = 1;
// if the range_end is given, it deletes the keys in range [key, range_end).
bytes range_end = 2;
optional bytes range_end = 2;
}
message DeleteRangeResponse {
ResponseHeader header = 1;
optional ResponseHeader header = 1;
}
message RequestUnion {
@@ -112,44 +109,38 @@ message RequestUnion {
message ResponseUnion {
oneof response {
RangeResponse response_range = 1;
RangeResponse reponse_range = 1;
PutResponse response_put = 2;
DeleteRangeResponse response_delete_range = 3;
}
}
message Compare {
enum CompareResult {
enum CompareType {
EQUAL = 0;
GREATER = 1;
LESS = 2;
}
enum CompareTarget {
VERSION = 0;
CREATE = 1;
MOD = 2;
VALUE= 3;
}
CompareResult result = 1;
CompareTarget target = 2;
optional CompareType type = 1;
// key path
bytes key = 3;
oneof target_union {
optional bytes key = 2;
oneof target {
// version of the given key
int64 version = 4;
// create revision of the given key
int64 create_revision = 5;
// last modified revision of the given key
int64 mod_revision = 6;
int64 version = 3;
// create index of the given key
int64 create_index = 4;
// last modified index of the given key
int64 mod_index = 5;
// value of the given key
bytes value = 7;
bytes value = 6;
}
}
// If the comparisons succeed, then the success requests will be processed in order,
// and the response will contain their respective responses in order.
// If the comparisons fail, then the failure requests will be processed in order,
// and the response will contain their respective responses in order.
// First all the compare requests are processed.
// If all the compare succeed, all the success
// requests will be processed.
// Or all the failure requests will be processed and
// all the errors in the comparison will be returned.
// From google paxosdb paper:
// Our implementation hinges around a powerful primitive which we call MultiOp. All other database
@@ -166,44 +157,44 @@ message Compare {
// if guard evaluates to
// true.
// 3. A list of database operations called f op. Like t op, but executed if guard evaluates to false.
message TxnRequest {
message TnxRequest {
repeated Compare compare = 1;
repeated RequestUnion success = 2;
repeated RequestUnion failure = 3;
}
message TxnResponse {
ResponseHeader header = 1;
bool succeeded = 2;
message TnxResponse {
optional ResponseHeader header = 1;
optional bool succeeded = 2;
repeated ResponseUnion responses = 3;
}
message KeyValue {
bytes key = 1;
int64 create_revision = 2;
// mod_revision is the last modified revision of the key.
int64 mod_revision = 3;
optional bytes key = 1;
// mod_index is the last modified index of the key.
optional int64 create_index = 2;
optional int64 mod_index = 3;
// version is the version of the key. A deletion resets
// the version to zero and any modification of the key
// increases its version.
int64 version = 4;
bytes value = 5;
optional int64 version = 4;
optional bytes value = 5;
}
message WatchRangeRequest {
// if the range_end is not given, the request returns the key.
bytes key = 1;
optional bytes key = 1;
// if the range_end is given, it gets the keys in range [key, range_end).
bytes range_end = 2;
// start_revision is an optional revision (including) to watch from. No start_revision is "now".
int64 start_revision = 3;
// end_revision is an optional revision (excluding) to end watch. No end_revision is "forever".
int64 end_revision = 4;
bool progress_notification = 5;
optional bytes range_end = 2;
// start_index is an optional index (including) to watch from. No start_index is "now".
optional int64 start_index = 3;
// end_index is an optional index (excluding) to end watch. No end_index is "forever".
optional int64 end_index = 4;
optional bool progress_notification = 5;
}
message WatchRangeResponse {
ResponseHeader header = 1;
optional ResponseHeader header = 1;
repeated Event events = 2;
}
@@ -213,73 +204,69 @@ message Event {
DELETE = 1;
EXPIRE = 2;
}
EventType event_type = 1;
optional EventType event_type = 1;
// a put event contains the current key-value
// a delete/expire event contains the previous
// key-value
KeyValue kv = 2;
optional KeyValue kv = 2;
}
// Compaction compacts the kv store upto the given revision (including).
// It removes the old versions of a key. It keeps the newest version of
// the key even if its latest modification revision is smaller than the given
// revision.
message CompactionRequest {
int64 revision = 1;
optional int64 index = 1;
}
message CompactionResponse {
ResponseHeader header = 1;
optional ResponseHeader header = 1;
}
message LeaseCreateRequest {
// advisory ttl in seconds
int64 ttl = 1;
optional int64 ttl = 1;
}
message LeaseCreateResponse {
ResponseHeader header = 1;
int64 lease_id = 2;
optional ResponseHeader header = 1;
optional int64 lease_id = 2;
// server decided ttl in second
int64 ttl = 3;
string error = 4;
optional int64 ttl = 3;
optional string error = 4;
}
message LeaseRevokeRequest {
int64 lease_id = 1;
optional int64 lease_id = 1;
}
message LeaseRevokeResponse {
ResponseHeader header = 1;
optional ResponseHeader header = 1;
}
message LeaseTxnRequest {
TxnRequest request = 1;
message LeaseTnxRequest {
optional TnxRequest request = 1;
repeated LeaseAttachRequest success = 2;
repeated LeaseAttachRequest failure = 3;
}
message LeaseTxnResponse {
ResponseHeader header = 1;
TxnResponse response = 2;
message LeaseTnxResponse {
optional ResponseHeader header = 1;
optional TnxResponse response = 2;
repeated LeaseAttachResponse attach_responses = 3;
}
message LeaseAttachRequest {
int64 lease_id = 1;
bytes key = 2;
optional int64 lease_id = 1;
optional bytes key = 2;
}
message LeaseAttachResponse {
ResponseHeader header = 1;
optional ResponseHeader header = 1;
}
message LeaseKeepAliveRequest {
int64 lease_id = 1;
optional int64 lease_id = 1;
}
message LeaseKeepAliveResponse {
ResponseHeader header = 1;
int64 lease_id = 2;
int64 ttl = 3;
optional ResponseHeader header = 1;
optional int64 lease_id = 2;
optional int64 ttl = 3;
}

View File

@@ -4,8 +4,6 @@ etcd comes with support for incremental runtime reconfiguration, which allows us
Reconfiguration requests can only be processed when the the majority of the cluster members are functioning. It is **highly recommended** to always have a cluster size greater than two in production. It is unsafe to remove a member from a two member cluster. The majority of a two member cluster is also two. If there is a failure during the removal process, the cluster might not able to make progress and need to [restart from majority failure][majority failure].
To better understand the design behind runtime reconfiguration, we suggest you read [this](runtime-reconf-design.md).
[majority failure]: #restart-cluster-from-majority-failure
## Reconfiguration Use Cases
@@ -39,7 +37,7 @@ To replace the machine, follow the instructions for [removing the member][remove
### Restart Cluster from Majority Failure
If the majority of your cluster is lost or all of your nodes have changed IP addresses, then you need to take manual action in order to recover safely.
If the majority of your cluster is lost, then you need to take manual action in order to recover safely.
The basic steps in the recovery process include [creating a new cluster using the old data][disaster recovery], forcing a single member to act as the leader, and finally using runtime configuration to [add new members][add member] to this new cluster one at a time.
[add member]: #add-a-new-member
@@ -54,38 +52,28 @@ This is essentially the same requirement as for any other write to etcd.
All changes to the cluster are done one at a time:
* To update a single member peerURLs you will make an update operation
* To replace a single member you will make an add then a remove operation
* To increase from 3 to 5 members you will make two add operations
* To decrease from 5 to 3 you will make two remove operations
To replace a single member you will make an add then a remove operation
To increase from 3 to 5 members you will make two add operations
To decrease from 5 to 3 you will make two remove operations
All of these examples will use the `etcdctl` command line tool that ships with etcd.
If you want to use the member API directly you can find the documentation [here](other_apis.md).
### Update a Member
If you would like to update a member IP address (peerURLs), first, we need to find the target member's ID. You can list all members with `etcdctl`:
```sh
$ etcdctl member list
6e3bd23ae5f1eae0: name=node2 peerURLs=http://localhost:23802 clientURLs=http://127.0.0.1:23792
924e2e83e93f2560: name=node3 peerURLs=http://localhost:23803 clientURLs=http://127.0.0.1:23793
a8266ecf031671f3: name=node1 peerURLs=http://localhost:23801 clientURLs=http://127.0.0.1:23791
```
In this example let's `update` a8266ecf031671f3 member ID and change its peerURLs value to http://10.0.1.10:2380
```sh
$ etcdctl member update a8266ecf031671f3 http://10.0.1.10:2380
Updated member with ID a8266ecf031671f3 in cluster
```
### Remove a Member
First, we need to find the target member's ID. You can list all members with `etcdctl`:
```
$ etcdctl member list
6e3bd23ae5f1eae0: name=node2 peerURLs=http://localhost:7002 clientURLs=http://127.0.0.1:4002
924e2e83e93f2560: name=node3 peerURLs=http://localhost:7003 clientURLs=http://127.0.0.1:4003
a8266ecf031671f3: name=node1 peerURLs=http://localhost:7001 clientURLs=http://127.0.0.1:4001
```
Let us say the member ID we want to remove is a8266ecf031671f3.
We then use the `remove` command to perform the removal:
```sh
```
$ etcdctl member remove a8266ecf031671f3
Removed member a8266ecf031671f3 from cluster
```
@@ -107,7 +95,7 @@ Adding a member is a two step process:
Using `etcdctl` let's add the new member to the cluster by specifying its [name](configuration.md#-name) and [advertised peer URLs](configuration.md#-initial-advertise-peer-urls):
```sh
```
$ etcdctl member add infra3 http://10.0.1.13:2380
added member 9bf1b35fc7761a23 to cluster
@@ -119,11 +107,11 @@ ETCD_INITIAL_CLUSTER_STATE=existing
`etcdctl` has informed the cluster about the new member and printed out the environment variables needed to successfully start it.
Now start the new etcd process with the relevant flags for the new member:
```sh
```
$ export ETCD_NAME="infra3"
$ export ETCD_INITIAL_CLUSTER="infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380,infra3=http://10.0.1.13:2380"
$ export ETCD_INITIAL_CLUSTER_STATE=existing
$ etcd -listen-client-urls http://10.0.1.13:2379 -advertise-client-urls http://10.0.1.13:2379 -listen-peer-urls http://10.0.1.13:2380 -initial-advertise-peer-urls http://10.0.1.13:2380 -data-dir %data_dir%
$ etcd -listen-client-urls http://10.0.1.13:2379 -advertise-client-urls http://10.0.1.13:2379 -listen-peer-urls http://10.0.1.13:2380 -initial-advertise-peer-urls http://10.0.1.13:2380
```
The new member will run as a part of the cluster and immediately begin catching up with the rest of the cluster.
@@ -136,7 +124,7 @@ If you add a new member to a 1-node cluster, the cluster cannot make progress be
In the following case we have not included our new host in the list of enumerated nodes.
If this is a new cluster, the node must be added to the list of initial cluster members.
```sh
```
$ etcd -name infra3 \
-initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380 \
-initial-cluster-state existing
@@ -146,7 +134,7 @@ exit 1
In this case we give a different address (10.0.1.14:2380) to the one that we used to join the cluster (10.0.1.13:2380).
```sh
```
$ etcd -name infra4 \
-initial-cluster infra0=http://10.0.1.10:2380,infra1=http://10.0.1.11:2380,infra2=http://10.0.1.12:2380,infra4=http://10.0.1.14:2380 \
-initial-cluster-state existing
@@ -156,7 +144,7 @@ exit 1
When we start etcd using the data directory of a removed member, etcd will exit automatically if it connects to any alive member in the cluster:
```sh
```
$ etcd
etcd: this member has been permanently removed from the cluster. Exiting.
exit 1

View File

@@ -1,47 +0,0 @@
### Design of Runtime Reconfiguration
Runtime reconfiguration is one of the hardest and most error prone features in a distributed system, especially in a consensus based system like etcd.
Read on to learn about the design of etcd's runtime reconfiguration commands and how we tackled these problems.
### Two Phase Config Changes Keep you Safe
In etcd, every runtime reconfiguration has to go through [two phases](Documentation/runtime-configuration.md#add-a-new-member) for safety reasons. For example, to add a member you need to first inform cluster of new configuration and then start the new member.
Phase 1 - Inform cluster of new configuration
To add a member into etcd cluster, you need to make an API call to request a new member to be added to the cluster. And this is only way that you can add a new member into an existing cluster. The API call returns when the cluster agrees on the configuration change.
Phase 2 - Start new member
To join the etcd member into the existing cluster, you need to specify the correct `initial-cluster` and set `initial-cluster-state` to `existing`. When the member starts, it will contact the existing cluster first and verify the current cluster configuration matches the expected one specified in `initial-cluster`. When the new member successfully starts, you know your cluster reached the expected configuration.
By splitting the process into two discrete phases users are forced to be explicit regarding cluster membership changes. This actually gives users more flexibility and makes things easier to reason about. For example, if there is an attempt to add a new member with the same ID as an existing member in an etcd cluster, the action will fail immediately during phase one without impacting the running cluster. Similar protection is provided to prevent adding new members by mistake. If a new etcd member attempts to join the cluster before the cluster has accepted the configuration change,, it will not be accepted by the cluster.
Without the explicit workflow around cluster membership etcd would be vulnerable to unexpected cluster membership changes. For example, if etcd is running under an init system such as systemd, etcd would be restarted after being removed via the membership API, and attempt to rejoin the cluster on startup. This cycle would continue every time a member is removed via the API and systemd is set to restart etcd after failing, which is unexpected.
We think runtime reconfiguration should be a low frequent operation. We made the decision to keep it explicit and user-driven to ensure configuration safety and keep your cluster always running smoothly under your control.
### Permanent Loss of Quorum Requires New Cluster
If a cluster permanently loses a majority of its members, a new cluster will need to be started from an old data directory to recover the previous state.
It is entirely possible to force removing the failed members from the existing cluster to recover. However, we decided not to support this method since it bypasses the normal consensus committing phase, which is unsafe. If the member to remove is not actually dead or you force to remove different members through different members in the same cluster, you will end up with diverged cluster with same clusterID. This is very dangerous and hard to debug/fix afterwards.
If you have a correct deployment, the possibility of permanent majority lose is very low. But it is a severe enough problem that worth special care. We strongly suggest you to read the [disaster recovery documentation](admin_guide.md#disaster-recovery) and prepare for permanent majority lose before you put etcd into production.
### Do Not Use Public Discovery Service For Runtime Reconfiguration
The public discovery service should only be used for bootstrapping a cluster. To join member into an existing cluster, you should use runtime reconfiguration API.
Discovery service is designed for bootstrapping an etcd cluster in the cloud environment, when you do not know the IP addresses of all the members beforehand. After you successfully bootstrap a cluster, the IP addresses of all the members are known. Technically, you should not need the discovery service any more.
It seems that using public discovery service is a convenient way to do runtime reconfiguration, after all discovery service already has all the cluster configuration information. However relying on public discovery service brings troubles:
1. it introduces a external dependencies for the entire life-cycle of your cluster, not just bootstrap time. If there is a network issue between your cluster and public discover service, your cluster will suffer from it.
2. public discovery service must reflect correct runtime configuration of your cluster during it life-cycle. It has to provide security mechanism to avoid bad actions, and it is hard.
3. public discovery service has to keep tens of thousands of cluster configurations. Our public discovery service backend is not ready for that workload.
If you want to have a discovery service that supports runtime reconfiguration, the best choice is to build your private one.

View File

@@ -4,7 +4,7 @@ etcd supports SSL/TLS as well as authentication through client certificates, bot
To get up and running you first need to have a CA certificate and a signed key pair for one member. It is recommended to create and sign a new key pair for every member in a cluster.
For convenience the [cfssl](https://github.com/cloudflare/cfssl) tool provides an easy interface to certificate generation, and we provide a full example using the tool at [here](../hack/tls-setup). Alternatively this site provides a good reference on how to generate self-signed key pairs:
For convenience the [etcd-ca](https://github.com/coreos/etcd-ca) tool provides an easy interface to certificate generation, alternatively this site provides a good reference on how to generate self-signed key pairs:
http://www.g-loaded.eu/2005/11/10/be-your-own-ca/

View File

@@ -10,7 +10,7 @@ The network isn't the only source of latency. Each request and response may be i
The underlying distributed consensus protocol relies on two separate time parameters to ensure that nodes can handoff leadership if one stalls or goes offline.
The first parameter is called the *Heartbeat Interval*.
This is the frequency with which the leader will notify followers that it is still the leader.
For best pratices, the parameter should be set around round-trip time between members.
etcd batches commands together for higher throughput so this heartbeat interval is also a delay for how long it takes for commands to be committed.
By default, etcd uses a `100ms` heartbeat interval.
The second parameter is the *Election Timeout*.
@@ -18,22 +18,18 @@ This timeout is how long a follower node will go without hearing a heartbeat bef
By default, etcd uses a `1000ms` election timeout.
Adjusting these values is a trade off.
The value of heartbeat interval is recommended to be around the maximum of average round-trip time (RTT) between members, normally around 0.5-1.5x the round-trip time.
If heartbeat interval is too low, etcd will send unnecessary messages that increase the usage of CPU and network resources.
On the other side, a too high heartbeat interval leads to high election timeout. Higher election timeout takes longer time to detect a leader failure.
The easiest way to measure round-trip time (RTT) is to use [PING utility](https://en.wikipedia.org/wiki/Ping_(networking_utility)).
Lowering the heartbeat interval will cause individual commands to be committed faster but it will lower the overall throughput of etcd.
If your etcd instances have low utilization then lowering the heartbeat interval can improve your command response time.
The election timeout should be set based on the heartbeat interval and average round-trip time between members.
Election timeouts must be at least 10 times the round-trip time so it can account for variance in your network.
For example, if the round-trip time between your members is 10ms then you should have at least a 100ms election timeout.
The election timeout should be set based on the heartbeat interval and your network ping time between nodes.
Election timeouts should be at least 10 times your ping time so it can account for variance in your network.
For example, if the ping time between your nodes is 10ms then you should have at least a 100ms election timeout.
The upper limit of election timeout is 50000ms, which should only be used when deploying global etcd cluster. First, 5s is the upper limit of average global round-trip time. A reasonable round-trip time for the continental united states is 130ms, and the time between US and japan is around 350-400ms. Because package gets delayed a lot, and network situation may be terrible, 5s is a safe value for it. Then, because election timeout should be an order of magnitude bigger than broadcast time, 50s becomes its maximum.
You should also set your election timeout to at least 5 to 10 times your heartbeat interval to account for variance in leader replication.
For a heartbeat interval of 50ms you should set your election timeout to at least 250ms - 500ms.
The heartbeat interval and election timeout value should be the same for all members in one cluster. Setting different values for etcd members may disrupt cluster stability.
You can override the default values on the command line:
```sh

View File

@@ -1,128 +0,0 @@
## Upgrade etcd from 2.1 to 2.2
In the general case, upgrading from etcd 2.1 to 2.2 can be a zero-downtime, rolling upgrade:
- one by one, stop the etcd v2.1 processes and replace them with etcd v2.2 processes
- after you are running all v2.2 processes, new features in v2.2 are available to the cluster
Before [starting an upgrade](#upgrade-procedure), read through the rest of this guide to prepare.
### Upgrade Checklists
#### Upgrade Requirement
To upgrade an existing etcd deployment to 2.2, you must be running 2.1. If youre running a version of etcd before 2.1, you must upgrade to [2.1](https://github.com/coreos/etcd/releases/tag/v2.1.2) before upgrading to 2.2.
Also, to ensure a smooth rolling upgrade, your running cluster must be healthy. You can check the health of the cluster by using `etcdctl cluster-health` command.
#### Preparedness
Before upgrading etcd, always test the services relying on etcd in a staging environment before deploying the upgrade to the production environment.
You might also want to [backup your data directory](admin_guide.md#backing-up-the-datastore) for a potential [downgrade](#downgrade).
#### Mixed Versions
While upgrading, an etcd cluster supports mixed versions of etcd members. The cluster is only considered upgraded once all its members are upgraded to 2.2.
Internally, etcd members negotiate with each other to determine the overall etcd cluster version, which controls the reported cluster version and the supported features.
#### Limitations
If you have a data size larger than 100MB you should contact us before upgrading, so we can make sure the upgrades work smoothly.
Every etcd 2.2 member will do health checking across the cluster periodically. etcd 2.1 member does not support health checking. During the upgrade, etcd 2.2 member will log warning about the unhealthy state of etcd 2.1 member. You can ignore the warning.
#### Downgrade
If all members have been upgraded to v2.2, the cluster will be upgraded to v2.2, and downgrade is **not possible**. If any member is still v2.1, the cluster will remain in v2.1, and you can go back to use v2.1 binary.
Please [backup your data directory](admin_guide.md#backing-up-the-datastore) of all etcd members if you want to downgrade the cluster, even if it is upgraded.
### Upgrade Procedure
In the example, we upgrade a three member v2.1 cluster running on local machine.
#### 1. Check upgrade requirements.
```
$ etcdctl cluster-health
member 6e3bd23ae5f1eae0 is healthy: got healthy result from http://localhost:22379
member 924e2e83e93f2560 is healthy: got healthy result from http://localhost:32379
member a8266ecf031671f3 is healthy: got healthy result from http://localhost:12379
cluster is healthy
$ curl http://localhost:4001/version
{"etcdserver":"2.1.x","etcdcluster":"2.1.0"}
```
#### 2. Stop the existing etcd process
You will see similar error logging from other etcd processes in your cluster. This is normal, since you just shut down a member and the connection is broken.
```
2015/09/2 09:48:35 etcdserver: failed to reach the peerURL(http://localhost:12380) of member a8266ecf031671f3 (Get http://localhost:12380/version: dial tcp [::1]:12380: getsockopt: connection refused)
2015/09/2 09:48:35 etcdserver: cannot get the version of member a8266ecf031671f3 (Get http://localhost:12380/version: dial tcp [::1]:12380: getsockopt: connection refused)
2015/09/2 09:48:35 rafthttp: failed to write a8266ecf031671f3 on stream Message (write tcp 127.0.0.1:32380->127.0.0.1:64394: write: broken pipe)
2015/09/2 09:48:35 rafthttp: failed to write a8266ecf031671f3 on pipeline (dial tcp [::1]:12380: getsockopt: connection refused)
2015/09/2 09:48:40 etcdserver: failed to reach the peerURL(http://localhost:7001) of member a8266ecf031671f3 (Get http://localhost:7001/version: dial tcp [::1]:12380: getsockopt: connection refused)
2015/09/2 09:48:40 etcdserver: cannot get the version of member a8266ecf031671f3 (Get http://localhost:12380/version: dial tcp [::1]:12380: getsockopt: connection refused)
2015/09/2 09:48:40 rafthttp: failed to heartbeat a8266ecf031671f3 on stream MsgApp v2 (write tcp 127.0.0.1:32380->127.0.0.1:64393: write: broken pipe)
```
You will see logging output like this from ungraded member due to a mixed version cluster. You can ignore this while upgrading.
```
2015/09/2 09:48:45 etcdserver: the etcd version 2.1.2+git is not up-to-date
2015/09/2 09:48:45 etcdserver: member a8266ecf031671f3 has a higher version &{2.2.0-rc.0+git 2.1.0}
```
You will also see logging output like this from the newly upgraded member, since etcd 2.1 member does not support health checking. You can ignore this while upgrading.
```
2015-09-02 09:55:42.691384 W | rafthttp: the connection to peer 6e3bd23ae5f1eae0 is unhealthy
2015-09-02 09:55:42.705626 W | rafthttp: the connection to peer 924e2e83e93f2560 is unhealthy
```
You could [backup your data directory](https://github.com/coreos/etcd/blob/7f7e2cc79d9c5c342a6eb1e48c386b0223cf934e/Documentation/admin_guide.md#backing-up-the-datastore) for data safety.
```
$ etcdctl backup \
--data-dir /var/lib/etcd \
--backup-dir /tmp/etcd_backup
```
#### 3. Drop-in etcd v2.2 binary and start the new etcd process
Now, you can start the etcd v2.2 binary with the previous configuration.
You will see the etcd start and publish its information to the cluster.
```
2015-09-02 09:56:46.117609 I | etcdserver: published {Name:infra2 ClientURLs:[http://localhost:22380]} to cluster e9c7614f68f35fb2
```
You could verify the cluster becomes healthy.
```
$ etcdctl cluster-health
member 6e3bd23ae5f1eae0 is healthy: got healthy result from http://localhost:22379
member 924e2e83e93f2560 is healthy: got healthy result from http://localhost:32379
member a8266ecf031671f3 is healthy: got healthy result from http://localhost:12379
cluster is healthy
```
#### 4. Repeat step 2 to step 3 for all other members
#### 5. Finish
When all members are upgraded, you will see the cluster is upgraded to 2.2 successfully:
```
2015-09-02 09:56:54.896848 N | etcdserver: updated the cluster version from 2.1 to 2.2
```
```
$ curl http://127.0.0.1:4001/version
{"etcdserver":"2.2.x","etcdcluster":"2.2.0"}
```

136
Godeps/Godeps.json generated
View File

@@ -1,6 +1,6 @@
{
"ImportPath": "github.com/coreos/etcd",
"GoVersion": "go1.5.1",
"GoVersion": "go1.4.1",
"Packages": [
"./..."
],
@@ -10,68 +10,44 @@
"Comment": "null-5",
"Rev": "75cd24fc2f2c2a2088577d12123ddee5f54e0675"
},
{
"ImportPath": "github.com/akrennmair/gopcap",
"Rev": "00e11033259acb75598ba416495bb708d864a010"
},
{
"ImportPath": "github.com/beorn7/perks/quantile",
"Rev": "b965b613227fddccbfffe13eae360ed3fa822f8d"
},
{
"ImportPath": "github.com/bgentry/speakeasy",
"Rev": "36e9cfdd690967f4f690c6edcc9ffacd006014a0"
"Rev": "5dfe43257d1f86b96484e760f2f0c4e2559089c7"
},
{
"ImportPath": "github.com/boltdb/bolt",
"Comment": "v1.1.0-19-g0b00eff",
"Rev": "0b00effdd7a8270ebd91c24297e51643e370dd52"
"Comment": "v1.0-71-g71f28ea",
"Rev": "71f28eaecbebd00604d87bb1de0dae8fcfa54bbd"
},
{
"ImportPath": "github.com/cheggaaa/pb",
"Rev": "da1f27ad1d9509b16f65f52fd9d8138b0f2dc7b2"
"ImportPath": "github.com/bradfitz/http2",
"Rev": "3e36af6d3af0e56fa3da71099f864933dea3d9fb"
},
{
"ImportPath": "github.com/codegangsta/cli",
"Comment": "1.2.0-183-gb5232bb",
"Rev": "b5232bb2934f606f9f27a1305f1eea224e8e8b88"
"Comment": "1.2.0-26-gf7ebb76",
"Rev": "f7ebb761e83e21225d1d8954fde853bf8edd46c4"
},
{
"ImportPath": "github.com/coreos/gexpect",
"Rev": "5173270e159f5aa8fbc999dc7e3dcb50f4098a69"
"ImportPath": "github.com/coreos/go-etcd/etcd",
"Comment": "v2.0.0-13-g4cceaf7",
"Rev": "4cceaf7283b76f27c4a732b20730dcdb61053bf5"
},
{
"ImportPath": "github.com/coreos/go-semver/semver",
"Rev": "568e959cd89871e61434c1143528d9162da89ef2"
},
{
"ImportPath": "github.com/coreos/go-systemd/daemon",
"Comment": "v3-6-gcea488b",
"Rev": "cea488b4e6855fee89b6c22a811e3c5baca861b6"
},
{
"ImportPath": "github.com/coreos/go-systemd/journal",
"Comment": "v3-6-gcea488b",
"Rev": "cea488b4e6855fee89b6c22a811e3c5baca861b6"
},
{
"ImportPath": "github.com/coreos/go-systemd/util",
"Comment": "v3-6-gcea488b",
"Rev": "cea488b4e6855fee89b6c22a811e3c5baca861b6"
},
{
"ImportPath": "github.com/coreos/pkg/capnslog",
"Rev": "2c77715c4df99b5420ffcae14ead08f52104065d"
},
{
"ImportPath": "github.com/cpuguy83/go-md2man/md2man",
"Comment": "v1.0.4",
"Rev": "71acacd42f85e5e82f70a55327789582a5200a90"
"Rev": "99f6e6b8f8ea30b0f82769c1411691c44a66d015"
},
{
"ImportPath": "github.com/gogo/protobuf/proto",
"Comment": "v0.1-118-ge8904f5",
"Rev": "e8904f58e872a473a5b91bc9bf3377d223555263"
"Rev": "64f27bf06efee53589314a6e5a4af34cdd85adf6"
},
{
"ImportPath": "github.com/golang/glog",
@@ -79,79 +55,43 @@
},
{
"ImportPath": "github.com/golang/protobuf/proto",
"Rev": "6aaa8d47701fa6cf07e914ec01fde3d4a1fe79c3"
"Rev": "5677a0e3d5e89854c9974e1256839ee23f8233ca"
},
{
"ImportPath": "github.com/google/btree",
"Rev": "cc6329d4279e3f025a53a83c397d2339b5705c45"
},
{
"ImportPath": "github.com/inconshreveable/mousetrap",
"Rev": "76626ae9c91c4f2a10f34cad8ce83ea42c93bb75"
},
{
"ImportPath": "github.com/jonboulle/clockwork",
"Rev": "72f9bd7c4e0c2a40055ab3d0f09654f730cce982"
},
{
"ImportPath": "github.com/kballard/go-shellquote",
"Rev": "d8ec1a69a250a17bb0e419c386eac1f3711dc142"
},
{
"ImportPath": "github.com/kr/pty",
"Comment": "release.r56-29-gf7ee69f",
"Rev": "f7ee69f31298ecbe5d2b349c711e2547a617d398"
},
{
"ImportPath": "github.com/matttproud/golang_protobuf_extensions/pbutil",
"Rev": "fc2b8d3a73c4867e51861bbdd5ae3c1f0869dd6a"
},
{
"ImportPath": "github.com/olekukonko/ts",
"Rev": "ecf753e7c962639ab5a1fb46f7da627d4c0a04b8"
"ImportPath": "github.com/prometheus/client_golang/model",
"Comment": "0.5.0-10-ga842dc1",
"Rev": "a842dc11e0621c34a71cab634d1d0190a59802a8"
},
{
"ImportPath": "github.com/prometheus/client_golang/prometheus",
"Comment": "0.7.0-52-ge51041b",
"Rev": "e51041b3fa41cece0dca035740ba6411905be473"
"Comment": "0.5.0-10-ga842dc1",
"Rev": "a842dc11e0621c34a71cab634d1d0190a59802a8"
},
{
"ImportPath": "github.com/prometheus/client_golang/text",
"Comment": "0.5.0-10-ga842dc1",
"Rev": "a842dc11e0621c34a71cab634d1d0190a59802a8"
},
{
"ImportPath": "github.com/prometheus/client_model/go",
"Comment": "model-0.0.2-12-gfa8ad6f",
"Rev": "fa8ad6fec33561be4280a8f0514318c79d7f6cb6"
},
{
"ImportPath": "github.com/prometheus/common/expfmt",
"Rev": "ffe929a3f4c4faeaa10f2b9535c2b1be3ad15650"
},
{
"ImportPath": "github.com/prometheus/common/model",
"Rev": "ffe929a3f4c4faeaa10f2b9535c2b1be3ad15650"
},
{
"ImportPath": "github.com/prometheus/procfs",
"Rev": "454a56f35412459b5e684fd5ec0f9211b94f002a"
},
{
"ImportPath": "github.com/russross/blackfriday",
"Comment": "v1.4-2-g300106c",
"Rev": "300106c228d52c8941d4b3de6054a6062a86dda3"
},
{
"ImportPath": "github.com/shurcooL/sanitized_anchor_name",
"Rev": "10ef21a441db47d8b13ebcc5fd2310f636973c77"
},
{
"ImportPath": "github.com/spacejam/loghisto",
"Rev": "323309774dec8b7430187e46cd0793974ccca04a"
},
{
"ImportPath": "github.com/spf13/cobra",
"Rev": "1c44ec8d3f1552cac48999f9306da23c4d8a288b"
},
{
"ImportPath": "github.com/spf13/pflag",
"Rev": "08b1a584251b5b62f458943640fc8ebd4d50aaa5"
"Rev": "ee2372b58cee877abe07cde670d04d3b3bac5ee6"
},
{
"ImportPath": "github.com/stretchr/testify/assert",
@@ -159,11 +99,7 @@
},
{
"ImportPath": "github.com/ugorji/go/codec",
"Rev": "f1f1a805ed361a0e078bb537e4ea78cd37dcf065"
},
{
"ImportPath": "github.com/xiang90/probing",
"Rev": "6a0cc1ae81b4cc11db5e491e030e4b98fba79c19"
"Rev": "821cda7e48749cacf7cad2c6ed01e96457ca7e9d"
},
{
"ImportPath": "golang.org/x/crypto/bcrypt",
@@ -175,27 +111,23 @@
},
{
"ImportPath": "golang.org/x/net/context",
"Rev": "04b9de9b512f58addf28c9853d50ebef61c3953e"
"Rev": "7dbad50ab5b31073856416cdcfeb2796d682f844"
},
{
"ImportPath": "golang.org/x/net/http2",
"Rev": "04b9de9b512f58addf28c9853d50ebef61c3953e"
"ImportPath": "golang.org/x/oauth2",
"Rev": "3046bc76d6dfd7d3707f6640f85e42d9c4050f50"
},
{
"ImportPath": "golang.org/x/net/internal/timeseries",
"Rev": "04b9de9b512f58addf28c9853d50ebef61c3953e"
"ImportPath": "google.golang.org/cloud/compute/metadata",
"Rev": "f20d6dcccb44ed49de45ae3703312cb46e627db1"
},
{
"ImportPath": "golang.org/x/net/trace",
"Rev": "04b9de9b512f58addf28c9853d50ebef61c3953e"
},
{
"ImportPath": "golang.org/x/sys/unix",
"Rev": "9c60d1c508f5134d1ca726b4641db998f2523357"
"ImportPath": "google.golang.org/cloud/internal",
"Rev": "f20d6dcccb44ed49de45ae3703312cb46e627db1"
},
{
"ImportPath": "google.golang.org/grpc",
"Rev": "e29d659177655e589850ba7d3d83f7ce12ef23dd"
"Rev": "f5ebd86be717593ab029545492c93ddf8914832b"
}
]
}

View File

@@ -1,5 +0,0 @@
#*
*~
/tools/pass/pass
/tools/pcaptest/pcaptest
/tools/tcpdump/tcpdump

View File

@@ -1,27 +0,0 @@
Copyright (c) 2009-2011 Andreas Krennmair. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Andreas Krennmair nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -1,11 +0,0 @@
# PCAP
This is a simple wrapper around libpcap for Go. Originally written by Andreas
Krennmair <ak@synflood.at> and only minorly touched up by Mark Smith <mark@qq.is>.
Please see the included pcaptest.go and tcpdump.go programs for instructions on
how to use this library.
Miek Gieben <miek@miek.nl> has created a more Go-like package and replaced functionality
with standard functions from the standard library. The package has also been renamed to
pcap.

View File

@@ -1,527 +0,0 @@
package pcap
import (
"encoding/binary"
"fmt"
"net"
"reflect"
"strings"
)
const (
TYPE_IP = 0x0800
TYPE_ARP = 0x0806
TYPE_IP6 = 0x86DD
TYPE_VLAN = 0x8100
IP_ICMP = 1
IP_INIP = 4
IP_TCP = 6
IP_UDP = 17
)
const (
ERRBUF_SIZE = 256
// According to pcap-linktype(7).
LINKTYPE_NULL = 0
LINKTYPE_ETHERNET = 1
LINKTYPE_TOKEN_RING = 6
LINKTYPE_ARCNET = 7
LINKTYPE_SLIP = 8
LINKTYPE_PPP = 9
LINKTYPE_FDDI = 10
LINKTYPE_ATM_RFC1483 = 100
LINKTYPE_RAW = 101
LINKTYPE_PPP_HDLC = 50
LINKTYPE_PPP_ETHER = 51
LINKTYPE_C_HDLC = 104
LINKTYPE_IEEE802_11 = 105
LINKTYPE_FRELAY = 107
LINKTYPE_LOOP = 108
LINKTYPE_LINUX_SLL = 113
LINKTYPE_LTALK = 104
LINKTYPE_PFLOG = 117
LINKTYPE_PRISM_HEADER = 119
LINKTYPE_IP_OVER_FC = 122
LINKTYPE_SUNATM = 123
LINKTYPE_IEEE802_11_RADIO = 127
LINKTYPE_ARCNET_LINUX = 129
LINKTYPE_LINUX_IRDA = 144
LINKTYPE_LINUX_LAPD = 177
)
type addrHdr interface {
SrcAddr() string
DestAddr() string
Len() int
}
type addrStringer interface {
String(addr addrHdr) string
}
func decodemac(pkt []byte) uint64 {
mac := uint64(0)
for i := uint(0); i < 6; i++ {
mac = (mac << 8) + uint64(pkt[i])
}
return mac
}
// Decode decodes the headers of a Packet.
func (p *Packet) Decode() {
if len(p.Data) <= 14 {
return
}
p.Type = int(binary.BigEndian.Uint16(p.Data[12:14]))
p.DestMac = decodemac(p.Data[0:6])
p.SrcMac = decodemac(p.Data[6:12])
if len(p.Data) >= 15 {
p.Payload = p.Data[14:]
}
switch p.Type {
case TYPE_IP:
p.decodeIp()
case TYPE_IP6:
p.decodeIp6()
case TYPE_ARP:
p.decodeArp()
case TYPE_VLAN:
p.decodeVlan()
}
}
func (p *Packet) headerString(headers []interface{}) string {
// If there's just one header, return that.
if len(headers) == 1 {
if hdr, ok := headers[0].(fmt.Stringer); ok {
return hdr.String()
}
}
// If there are two headers (IPv4/IPv6 -> TCP/UDP/IP..)
if len(headers) == 2 {
// Commonly the first header is an address.
if addr, ok := p.Headers[0].(addrHdr); ok {
if hdr, ok := p.Headers[1].(addrStringer); ok {
return fmt.Sprintf("%s %s", p.Time, hdr.String(addr))
}
}
}
// For IP in IP, we do a recursive call.
if len(headers) >= 2 {
if addr, ok := headers[0].(addrHdr); ok {
if _, ok := headers[1].(addrHdr); ok {
return fmt.Sprintf("%s > %s IP in IP: ",
addr.SrcAddr(), addr.DestAddr(), p.headerString(headers[1:]))
}
}
}
var typeNames []string
for _, hdr := range headers {
typeNames = append(typeNames, reflect.TypeOf(hdr).String())
}
return fmt.Sprintf("unknown [%s]", strings.Join(typeNames, ","))
}
// String prints a one-line representation of the packet header.
// The output is suitable for use in a tcpdump program.
func (p *Packet) String() string {
// If there are no headers, print "unsupported protocol".
if len(p.Headers) == 0 {
return fmt.Sprintf("%s unsupported protocol %d", p.Time, int(p.Type))
}
return fmt.Sprintf("%s %s", p.Time, p.headerString(p.Headers))
}
// Arphdr is a ARP packet header.
type Arphdr struct {
Addrtype uint16
Protocol uint16
HwAddressSize uint8
ProtAddressSize uint8
Operation uint16
SourceHwAddress []byte
SourceProtAddress []byte
DestHwAddress []byte
DestProtAddress []byte
}
func (arp *Arphdr) String() (s string) {
switch arp.Operation {
case 1:
s = "ARP request"
case 2:
s = "ARP Reply"
}
if arp.Addrtype == LINKTYPE_ETHERNET && arp.Protocol == TYPE_IP {
s = fmt.Sprintf("%012x (%s) > %012x (%s)",
decodemac(arp.SourceHwAddress), arp.SourceProtAddress,
decodemac(arp.DestHwAddress), arp.DestProtAddress)
} else {
s = fmt.Sprintf("addrtype = %d protocol = %d", arp.Addrtype, arp.Protocol)
}
return
}
func (p *Packet) decodeArp() {
if len(p.Payload) < 8 {
return
}
pkt := p.Payload
arp := new(Arphdr)
arp.Addrtype = binary.BigEndian.Uint16(pkt[0:2])
arp.Protocol = binary.BigEndian.Uint16(pkt[2:4])
arp.HwAddressSize = pkt[4]
arp.ProtAddressSize = pkt[5]
arp.Operation = binary.BigEndian.Uint16(pkt[6:8])
if len(pkt) < int(8+2*arp.HwAddressSize+2*arp.ProtAddressSize) {
return
}
arp.SourceHwAddress = pkt[8 : 8+arp.HwAddressSize]
arp.SourceProtAddress = pkt[8+arp.HwAddressSize : 8+arp.HwAddressSize+arp.ProtAddressSize]
arp.DestHwAddress = pkt[8+arp.HwAddressSize+arp.ProtAddressSize : 8+2*arp.HwAddressSize+arp.ProtAddressSize]
arp.DestProtAddress = pkt[8+2*arp.HwAddressSize+arp.ProtAddressSize : 8+2*arp.HwAddressSize+2*arp.ProtAddressSize]
p.Headers = append(p.Headers, arp)
if len(pkt) >= int(8+2*arp.HwAddressSize+2*arp.ProtAddressSize) {
p.Payload = p.Payload[8+2*arp.HwAddressSize+2*arp.ProtAddressSize:]
}
}
// IPadr is the header of an IP packet.
type Iphdr struct {
Version uint8
Ihl uint8
Tos uint8
Length uint16
Id uint16
Flags uint8
FragOffset uint16
Ttl uint8
Protocol uint8
Checksum uint16
SrcIp []byte
DestIp []byte
}
func (p *Packet) decodeIp() {
if len(p.Payload) < 20 {
return
}
pkt := p.Payload
ip := new(Iphdr)
ip.Version = uint8(pkt[0]) >> 4
ip.Ihl = uint8(pkt[0]) & 0x0F
ip.Tos = pkt[1]
ip.Length = binary.BigEndian.Uint16(pkt[2:4])
ip.Id = binary.BigEndian.Uint16(pkt[4:6])
flagsfrags := binary.BigEndian.Uint16(pkt[6:8])
ip.Flags = uint8(flagsfrags >> 13)
ip.FragOffset = flagsfrags & 0x1FFF
ip.Ttl = pkt[8]
ip.Protocol = pkt[9]
ip.Checksum = binary.BigEndian.Uint16(pkt[10:12])
ip.SrcIp = pkt[12:16]
ip.DestIp = pkt[16:20]
pEnd := int(ip.Length)
if pEnd > len(pkt) {
pEnd = len(pkt)
}
if len(pkt) >= pEnd && int(ip.Ihl*4) < pEnd {
p.Payload = pkt[ip.Ihl*4 : pEnd]
} else {
p.Payload = []byte{}
}
p.Headers = append(p.Headers, ip)
p.IP = ip
switch ip.Protocol {
case IP_TCP:
p.decodeTcp()
case IP_UDP:
p.decodeUdp()
case IP_ICMP:
p.decodeIcmp()
case IP_INIP:
p.decodeIp()
}
}
func (ip *Iphdr) SrcAddr() string { return net.IP(ip.SrcIp).String() }
func (ip *Iphdr) DestAddr() string { return net.IP(ip.DestIp).String() }
func (ip *Iphdr) Len() int { return int(ip.Length) }
type Vlanhdr struct {
Priority byte
DropEligible bool
VlanIdentifier int
Type int // Not actually part of the vlan header, but the type of the actual packet
}
func (v *Vlanhdr) String() {
fmt.Sprintf("VLAN Priority:%d Drop:%v Tag:%d", v.Priority, v.DropEligible, v.VlanIdentifier)
}
func (p *Packet) decodeVlan() {
pkt := p.Payload
vlan := new(Vlanhdr)
if len(pkt) < 4 {
return
}
vlan.Priority = (pkt[2] & 0xE0) >> 13
vlan.DropEligible = pkt[2]&0x10 != 0
vlan.VlanIdentifier = int(binary.BigEndian.Uint16(pkt[:2])) & 0x0FFF
vlan.Type = int(binary.BigEndian.Uint16(p.Payload[2:4]))
p.Headers = append(p.Headers, vlan)
if len(pkt) >= 5 {
p.Payload = p.Payload[4:]
}
switch vlan.Type {
case TYPE_IP:
p.decodeIp()
case TYPE_IP6:
p.decodeIp6()
case TYPE_ARP:
p.decodeArp()
}
}
type Tcphdr struct {
SrcPort uint16
DestPort uint16
Seq uint32
Ack uint32
DataOffset uint8
Flags uint16
Window uint16
Checksum uint16
Urgent uint16
Data []byte
}
const (
TCP_FIN = 1 << iota
TCP_SYN
TCP_RST
TCP_PSH
TCP_ACK
TCP_URG
TCP_ECE
TCP_CWR
TCP_NS
)
func (p *Packet) decodeTcp() {
if len(p.Payload) < 20 {
return
}
pkt := p.Payload
tcp := new(Tcphdr)
tcp.SrcPort = binary.BigEndian.Uint16(pkt[0:2])
tcp.DestPort = binary.BigEndian.Uint16(pkt[2:4])
tcp.Seq = binary.BigEndian.Uint32(pkt[4:8])
tcp.Ack = binary.BigEndian.Uint32(pkt[8:12])
tcp.DataOffset = (pkt[12] & 0xF0) >> 4
tcp.Flags = binary.BigEndian.Uint16(pkt[12:14]) & 0x1FF
tcp.Window = binary.BigEndian.Uint16(pkt[14:16])
tcp.Checksum = binary.BigEndian.Uint16(pkt[16:18])
tcp.Urgent = binary.BigEndian.Uint16(pkt[18:20])
if len(pkt) >= int(tcp.DataOffset*4) {
p.Payload = pkt[tcp.DataOffset*4:]
}
p.Headers = append(p.Headers, tcp)
p.TCP = tcp
}
func (tcp *Tcphdr) String(hdr addrHdr) string {
return fmt.Sprintf("TCP %s:%d > %s:%d %s SEQ=%d ACK=%d LEN=%d",
hdr.SrcAddr(), int(tcp.SrcPort), hdr.DestAddr(), int(tcp.DestPort),
tcp.FlagsString(), int64(tcp.Seq), int64(tcp.Ack), hdr.Len())
}
func (tcp *Tcphdr) FlagsString() string {
var sflags []string
if 0 != (tcp.Flags & TCP_SYN) {
sflags = append(sflags, "syn")
}
if 0 != (tcp.Flags & TCP_FIN) {
sflags = append(sflags, "fin")
}
if 0 != (tcp.Flags & TCP_ACK) {
sflags = append(sflags, "ack")
}
if 0 != (tcp.Flags & TCP_PSH) {
sflags = append(sflags, "psh")
}
if 0 != (tcp.Flags & TCP_RST) {
sflags = append(sflags, "rst")
}
if 0 != (tcp.Flags & TCP_URG) {
sflags = append(sflags, "urg")
}
if 0 != (tcp.Flags & TCP_NS) {
sflags = append(sflags, "ns")
}
if 0 != (tcp.Flags & TCP_CWR) {
sflags = append(sflags, "cwr")
}
if 0 != (tcp.Flags & TCP_ECE) {
sflags = append(sflags, "ece")
}
return fmt.Sprintf("[%s]", strings.Join(sflags, " "))
}
type Udphdr struct {
SrcPort uint16
DestPort uint16
Length uint16
Checksum uint16
}
func (p *Packet) decodeUdp() {
if len(p.Payload) < 8 {
return
}
pkt := p.Payload
udp := new(Udphdr)
udp.SrcPort = binary.BigEndian.Uint16(pkt[0:2])
udp.DestPort = binary.BigEndian.Uint16(pkt[2:4])
udp.Length = binary.BigEndian.Uint16(pkt[4:6])
udp.Checksum = binary.BigEndian.Uint16(pkt[6:8])
p.Headers = append(p.Headers, udp)
p.UDP = udp
if len(p.Payload) >= 8 {
p.Payload = pkt[8:]
}
}
func (udp *Udphdr) String(hdr addrHdr) string {
return fmt.Sprintf("UDP %s:%d > %s:%d LEN=%d CHKSUM=%d",
hdr.SrcAddr(), int(udp.SrcPort), hdr.DestAddr(), int(udp.DestPort),
int(udp.Length), int(udp.Checksum))
}
type Icmphdr struct {
Type uint8
Code uint8
Checksum uint16
Id uint16
Seq uint16
Data []byte
}
func (p *Packet) decodeIcmp() *Icmphdr {
if len(p.Payload) < 8 {
return nil
}
pkt := p.Payload
icmp := new(Icmphdr)
icmp.Type = pkt[0]
icmp.Code = pkt[1]
icmp.Checksum = binary.BigEndian.Uint16(pkt[2:4])
icmp.Id = binary.BigEndian.Uint16(pkt[4:6])
icmp.Seq = binary.BigEndian.Uint16(pkt[6:8])
p.Payload = pkt[8:]
p.Headers = append(p.Headers, icmp)
return icmp
}
func (icmp *Icmphdr) String(hdr addrHdr) string {
return fmt.Sprintf("ICMP %s > %s Type = %d Code = %d ",
hdr.SrcAddr(), hdr.DestAddr(), icmp.Type, icmp.Code)
}
func (icmp *Icmphdr) TypeString() (result string) {
switch icmp.Type {
case 0:
result = fmt.Sprintf("Echo reply seq=%d", icmp.Seq)
case 3:
switch icmp.Code {
case 0:
result = "Network unreachable"
case 1:
result = "Host unreachable"
case 2:
result = "Protocol unreachable"
case 3:
result = "Port unreachable"
default:
result = "Destination unreachable"
}
case 8:
result = fmt.Sprintf("Echo request seq=%d", icmp.Seq)
case 30:
result = "Traceroute"
}
return
}
type Ip6hdr struct {
// http://www.networksorcery.com/enp/protocol/ipv6.htm
Version uint8 // 4 bits
TrafficClass uint8 // 8 bits
FlowLabel uint32 // 20 bits
Length uint16 // 16 bits
NextHeader uint8 // 8 bits, same as Protocol in Iphdr
HopLimit uint8 // 8 bits
SrcIp []byte // 16 bytes
DestIp []byte // 16 bytes
}
func (p *Packet) decodeIp6() {
if len(p.Payload) < 40 {
return
}
pkt := p.Payload
ip6 := new(Ip6hdr)
ip6.Version = uint8(pkt[0]) >> 4
ip6.TrafficClass = uint8((binary.BigEndian.Uint16(pkt[0:2]) >> 4) & 0x00FF)
ip6.FlowLabel = binary.BigEndian.Uint32(pkt[0:4]) & 0x000FFFFF
ip6.Length = binary.BigEndian.Uint16(pkt[4:6])
ip6.NextHeader = pkt[6]
ip6.HopLimit = pkt[7]
ip6.SrcIp = pkt[8:24]
ip6.DestIp = pkt[24:40]
if len(p.Payload) >= 40 {
p.Payload = pkt[40:]
}
p.Headers = append(p.Headers, ip6)
switch ip6.NextHeader {
case IP_TCP:
p.decodeTcp()
case IP_UDP:
p.decodeUdp()
case IP_ICMP:
p.decodeIcmp()
case IP_INIP:
p.decodeIp()
}
}
func (ip6 *Ip6hdr) SrcAddr() string { return net.IP(ip6.SrcIp).String() }
func (ip6 *Ip6hdr) DestAddr() string { return net.IP(ip6.DestIp).String() }
func (ip6 *Ip6hdr) Len() int { return int(ip6.Length) }

View File

@@ -1,247 +0,0 @@
package pcap
import (
"bytes"
"testing"
"time"
)
var testSimpleTcpPacket *Packet = &Packet{
Data: []byte{
0x00, 0x00, 0x0c, 0x9f, 0xf0, 0x20, 0xbc, 0x30, 0x5b, 0xe8, 0xd3, 0x49,
0x08, 0x00, 0x45, 0x00, 0x01, 0xa4, 0x39, 0xdf, 0x40, 0x00, 0x40, 0x06,
0x55, 0x5a, 0xac, 0x11, 0x51, 0x49, 0xad, 0xde, 0xfe, 0xe1, 0xc5, 0xf7,
0x00, 0x50, 0xc5, 0x7e, 0x0e, 0x48, 0x49, 0x07, 0x42, 0x32, 0x80, 0x18,
0x00, 0x73, 0xab, 0xb1, 0x00, 0x00, 0x01, 0x01, 0x08, 0x0a, 0x03, 0x77,
0x37, 0x9c, 0x42, 0x77, 0x5e, 0x3a, 0x47, 0x45, 0x54, 0x20, 0x2f, 0x20,
0x48, 0x54, 0x54, 0x50, 0x2f, 0x31, 0x2e, 0x31, 0x0d, 0x0a, 0x48, 0x6f,
0x73, 0x74, 0x3a, 0x20, 0x77, 0x77, 0x77, 0x2e, 0x66, 0x69, 0x73, 0x68,
0x2e, 0x63, 0x6f, 0x6d, 0x0d, 0x0a, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63,
0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x6b, 0x65, 0x65, 0x70, 0x2d, 0x61,
0x6c, 0x69, 0x76, 0x65, 0x0d, 0x0a, 0x55, 0x73, 0x65, 0x72, 0x2d, 0x41,
0x67, 0x65, 0x6e, 0x74, 0x3a, 0x20, 0x4d, 0x6f, 0x7a, 0x69, 0x6c, 0x6c,
0x61, 0x2f, 0x35, 0x2e, 0x30, 0x20, 0x28, 0x58, 0x31, 0x31, 0x3b, 0x20,
0x4c, 0x69, 0x6e, 0x75, 0x78, 0x20, 0x78, 0x38, 0x36, 0x5f, 0x36, 0x34,
0x29, 0x20, 0x41, 0x70, 0x70, 0x6c, 0x65, 0x57, 0x65, 0x62, 0x4b, 0x69,
0x74, 0x2f, 0x35, 0x33, 0x35, 0x2e, 0x32, 0x20, 0x28, 0x4b, 0x48, 0x54,
0x4d, 0x4c, 0x2c, 0x20, 0x6c, 0x69, 0x6b, 0x65, 0x20, 0x47, 0x65, 0x63,
0x6b, 0x6f, 0x29, 0x20, 0x43, 0x68, 0x72, 0x6f, 0x6d, 0x65, 0x2f, 0x31,
0x35, 0x2e, 0x30, 0x2e, 0x38, 0x37, 0x34, 0x2e, 0x31, 0x32, 0x31, 0x20,
0x53, 0x61, 0x66, 0x61, 0x72, 0x69, 0x2f, 0x35, 0x33, 0x35, 0x2e, 0x32,
0x0d, 0x0a, 0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x3a, 0x20, 0x74, 0x65,
0x78, 0x74, 0x2f, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x61, 0x70, 0x70, 0x6c,
0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2f, 0x78, 0x68, 0x74, 0x6d,
0x6c, 0x2b, 0x78, 0x6d, 0x6c, 0x2c, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63,
0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2f, 0x78, 0x6d, 0x6c, 0x3b, 0x71, 0x3d,
0x30, 0x2e, 0x39, 0x2c, 0x2a, 0x2f, 0x2a, 0x3b, 0x71, 0x3d, 0x30, 0x2e,
0x38, 0x0d, 0x0a, 0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x2d, 0x45, 0x6e,
0x63, 0x6f, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x67, 0x7a, 0x69, 0x70,
0x2c, 0x64, 0x65, 0x66, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x73, 0x64, 0x63,
0x68, 0x0d, 0x0a, 0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x2d, 0x4c, 0x61,
0x6e, 0x67, 0x75, 0x61, 0x67, 0x65, 0x3a, 0x20, 0x65, 0x6e, 0x2d, 0x55,
0x53, 0x2c, 0x65, 0x6e, 0x3b, 0x71, 0x3d, 0x30, 0x2e, 0x38, 0x0d, 0x0a,
0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x2d, 0x43, 0x68, 0x61, 0x72, 0x73,
0x65, 0x74, 0x3a, 0x20, 0x49, 0x53, 0x4f, 0x2d, 0x38, 0x38, 0x35, 0x39,
0x2d, 0x31, 0x2c, 0x75, 0x74, 0x66, 0x2d, 0x38, 0x3b, 0x71, 0x3d, 0x30,
0x2e, 0x37, 0x2c, 0x2a, 0x3b, 0x71, 0x3d, 0x30, 0x2e, 0x33, 0x0d, 0x0a,
0x0d, 0x0a,
}}
func BenchmarkDecodeSimpleTcpPacket(b *testing.B) {
for i := 0; i < b.N; i++ {
testSimpleTcpPacket.Decode()
}
}
func TestDecodeSimpleTcpPacket(t *testing.T) {
p := testSimpleTcpPacket
p.Decode()
if p.DestMac != 0x00000c9ff020 {
t.Error("Dest mac", p.DestMac)
}
if p.SrcMac != 0xbc305be8d349 {
t.Error("Src mac", p.SrcMac)
}
if len(p.Headers) != 2 {
t.Error("Incorrect number of headers", len(p.Headers))
return
}
if ip, ipOk := p.Headers[0].(*Iphdr); ipOk {
if ip.Version != 4 {
t.Error("ip Version", ip.Version)
}
if ip.Ihl != 5 {
t.Error("ip header length", ip.Ihl)
}
if ip.Tos != 0 {
t.Error("ip TOS", ip.Tos)
}
if ip.Length != 420 {
t.Error("ip Length", ip.Length)
}
if ip.Id != 14815 {
t.Error("ip ID", ip.Id)
}
if ip.Flags != 0x02 {
t.Error("ip Flags", ip.Flags)
}
if ip.FragOffset != 0 {
t.Error("ip Fragoffset", ip.FragOffset)
}
if ip.Ttl != 64 {
t.Error("ip TTL", ip.Ttl)
}
if ip.Protocol != 6 {
t.Error("ip Protocol", ip.Protocol)
}
if ip.Checksum != 0x555A {
t.Error("ip Checksum", ip.Checksum)
}
if !bytes.Equal(ip.SrcIp, []byte{172, 17, 81, 73}) {
t.Error("ip Src", ip.SrcIp)
}
if !bytes.Equal(ip.DestIp, []byte{173, 222, 254, 225}) {
t.Error("ip Dest", ip.DestIp)
}
if tcp, tcpOk := p.Headers[1].(*Tcphdr); tcpOk {
if tcp.SrcPort != 50679 {
t.Error("tcp srcport", tcp.SrcPort)
}
if tcp.DestPort != 80 {
t.Error("tcp destport", tcp.DestPort)
}
if tcp.Seq != 0xc57e0e48 {
t.Error("tcp seq", tcp.Seq)
}
if tcp.Ack != 0x49074232 {
t.Error("tcp ack", tcp.Ack)
}
if tcp.DataOffset != 8 {
t.Error("tcp dataoffset", tcp.DataOffset)
}
if tcp.Flags != 0x18 {
t.Error("tcp flags", tcp.Flags)
}
if tcp.Window != 0x73 {
t.Error("tcp window", tcp.Window)
}
if tcp.Checksum != 0xabb1 {
t.Error("tcp checksum", tcp.Checksum)
}
if tcp.Urgent != 0 {
t.Error("tcp urgent", tcp.Urgent)
}
} else {
t.Error("Second header is not TCP header")
}
} else {
t.Error("First header is not IP header")
}
if string(p.Payload) != "GET / HTTP/1.1\r\nHost: www.fish.com\r\nConnection: keep-alive\r\nUser-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Encoding: gzip,deflate,sdch\r\nAccept-Language: en-US,en;q=0.8\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.3\r\n\r\n" {
t.Error("--- PAYLOAD STRING ---\n", string(p.Payload), "\n--- PAYLOAD BYTES ---\n", p.Payload)
}
}
// Makes sure packet payload doesn't display the 6 trailing null of this packet
// as part of the payload. They're actually the ethernet trailer.
func TestDecodeSmallTcpPacketHasEmptyPayload(t *testing.T) {
p := &Packet{
// This packet is only 54 bits (an empty TCP RST), thus 6 trailing null
// bytes are added by the ethernet layer to make it the minimum packet size.
Data: []byte{
0xbc, 0x30, 0x5b, 0xe8, 0xd3, 0x49, 0xb8, 0xac, 0x6f, 0x92, 0xd5, 0xbf,
0x08, 0x00, 0x45, 0x00, 0x00, 0x28, 0x00, 0x00, 0x40, 0x00, 0x40, 0x06,
0x3f, 0x9f, 0xac, 0x11, 0x51, 0xc5, 0xac, 0x11, 0x51, 0x49, 0x00, 0x63,
0x9a, 0xef, 0x00, 0x00, 0x00, 0x00, 0x2e, 0xc1, 0x27, 0x83, 0x50, 0x14,
0x00, 0x00, 0xc3, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}}
p.Decode()
if p.Payload == nil {
t.Error("Nil payload")
}
if len(p.Payload) != 0 {
t.Error("Non-empty payload:", p.Payload)
}
}
func TestDecodeVlanPacket(t *testing.T) {
p := &Packet{
Data: []byte{
0x00, 0x10, 0xdb, 0xff, 0x10, 0x00, 0x00, 0x15, 0x2c, 0x9d, 0xcc, 0x00, 0x81, 0x00, 0x01, 0xf7,
0x08, 0x00, 0x45, 0x00, 0x00, 0x28, 0x29, 0x8d, 0x40, 0x00, 0x7d, 0x06, 0x83, 0xa0, 0xac, 0x1b,
0xca, 0x8e, 0x45, 0x16, 0x94, 0xe2, 0xd4, 0x0a, 0x00, 0x50, 0xdf, 0xab, 0x9c, 0xc6, 0xcd, 0x1e,
0xe5, 0xd1, 0x50, 0x10, 0x01, 0x00, 0x5a, 0x74, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}}
p.Decode()
if p.Type != TYPE_VLAN {
t.Error("Didn't detect vlan")
}
if len(p.Headers) != 3 {
t.Error("Incorrect number of headers:", len(p.Headers))
for i, h := range p.Headers {
t.Errorf("Header %d: %#v", i, h)
}
t.FailNow()
}
if _, ok := p.Headers[0].(*Vlanhdr); !ok {
t.Errorf("First header isn't vlan: %q", p.Headers[0])
}
if _, ok := p.Headers[1].(*Iphdr); !ok {
t.Errorf("Second header isn't IP: %q", p.Headers[1])
}
if _, ok := p.Headers[2].(*Tcphdr); !ok {
t.Errorf("Third header isn't TCP: %q", p.Headers[2])
}
}
func TestDecodeFuzzFallout(t *testing.T) {
testData := []struct {
Data []byte
}{
{[]byte("000000000000\x81\x000")},
{[]byte("000000000000\x81\x00000")},
{[]byte("000000000000\x86\xdd0")},
{[]byte("000000000000\b\x000")},
{[]byte("000000000000\b\x060")},
{[]byte{}},
{[]byte("000000000000\b\x0600000000")},
{[]byte("000000000000\x86\xdd000000\x01000000000000000000000000000000000")},
{[]byte("000000000000\x81\x0000\b\x0600000000")},
{[]byte("000000000000\b\x00n0000000000000000000")},
{[]byte("000000000000\x86\xdd000000\x0100000000000000000000000000000000000")},
{[]byte("000000000000\x81\x0000\b\x00g0000000000000000000")},
//{[]byte()},
{[]byte("000000000000\b\x00400000000\x110000000000")},
{[]byte("0nMء\xfe\x13\x13\x81\x00gr\b\x00&x\xc9\xe5b'\x1e0\x00\x04\x00\x0020596224")},
{[]byte("000000000000\x81\x0000\b\x00400000000\x110000000000")},
{[]byte("000000000000\b\x00000000000\x0600\xff0000000")},
{[]byte("000000000000\x86\xdd000000\x06000000000000000000000000000000000")},
{[]byte("000000000000\x81\x0000\b\x00000000000\x0600b0000000")},
{[]byte("000000000000\x81\x0000\b\x00400000000\x060000000000")},
{[]byte("000000000000\x86\xdd000000\x11000000000000000000000000000000000")},
{[]byte("000000000000\x86\xdd000000\x0600000000000000000000000000000000000000000000M")},
{[]byte("000000000000\b\x00500000000\x0600000000000")},
{[]byte("0nM\xd80\xfe\x13\x13\x81\x00gr\b\x00&x\xc9\xe5b'\x1e0\x00\x04\x00\x0020596224")},
}
for _, entry := range testData {
pkt := &Packet{
Time: time.Now(),
Caplen: uint32(len(entry.Data)),
Len: uint32(len(entry.Data)),
Data: entry.Data,
}
pkt.Decode()
/*
func() {
defer func() {
if err := recover(); err != nil {
t.Fatalf("%d. %q failed: %v", idx, string(entry.Data), err)
}
}()
pkt.Decode()
}()
*/
}
}

View File

@@ -1,206 +0,0 @@
package pcap
import (
"encoding/binary"
"fmt"
"io"
"time"
)
// FileHeader is the parsed header of a pcap file.
// http://wiki.wireshark.org/Development/LibpcapFileFormat
type FileHeader struct {
MagicNumber uint32
VersionMajor uint16
VersionMinor uint16
TimeZone int32
SigFigs uint32
SnapLen uint32
Network uint32
}
type PacketTime struct {
Sec int32
Usec int32
}
// Convert the PacketTime to a go Time struct.
func (p *PacketTime) Time() time.Time {
return time.Unix(int64(p.Sec), int64(p.Usec)*1000)
}
// Packet is a single packet parsed from a pcap file.
//
// Convenient access to IP, TCP, and UDP headers is provided after Decode()
// is called if the packet is of the appropriate type.
type Packet struct {
Time time.Time // packet send/receive time
Caplen uint32 // bytes stored in the file (caplen <= len)
Len uint32 // bytes sent/received
Data []byte // packet data
Type int // protocol type, see LINKTYPE_*
DestMac uint64
SrcMac uint64
Headers []interface{} // decoded headers, in order
Payload []byte // remaining non-header bytes
IP *Iphdr // IP header (for IP packets, after decoding)
TCP *Tcphdr // TCP header (for TCP packets, after decoding)
UDP *Udphdr // UDP header (for UDP packets after decoding)
}
// Reader parses pcap files.
type Reader struct {
flip bool
buf io.Reader
err error
fourBytes []byte
twoBytes []byte
sixteenBytes []byte
Header FileHeader
}
// NewReader reads pcap data from an io.Reader.
func NewReader(reader io.Reader) (*Reader, error) {
r := &Reader{
buf: reader,
fourBytes: make([]byte, 4),
twoBytes: make([]byte, 2),
sixteenBytes: make([]byte, 16),
}
switch magic := r.readUint32(); magic {
case 0xa1b2c3d4:
r.flip = false
case 0xd4c3b2a1:
r.flip = true
default:
return nil, fmt.Errorf("pcap: bad magic number: %0x", magic)
}
r.Header = FileHeader{
MagicNumber: 0xa1b2c3d4,
VersionMajor: r.readUint16(),
VersionMinor: r.readUint16(),
TimeZone: r.readInt32(),
SigFigs: r.readUint32(),
SnapLen: r.readUint32(),
Network: r.readUint32(),
}
return r, nil
}
// Next returns the next packet or nil if no more packets can be read.
func (r *Reader) Next() *Packet {
d := r.sixteenBytes
r.err = r.read(d)
if r.err != nil {
return nil
}
timeSec := asUint32(d[0:4], r.flip)
timeUsec := asUint32(d[4:8], r.flip)
capLen := asUint32(d[8:12], r.flip)
origLen := asUint32(d[12:16], r.flip)
data := make([]byte, capLen)
if r.err = r.read(data); r.err != nil {
return nil
}
return &Packet{
Time: time.Unix(int64(timeSec), int64(timeUsec)),
Caplen: capLen,
Len: origLen,
Data: data,
}
}
func (r *Reader) read(data []byte) error {
var err error
n, err := r.buf.Read(data)
for err == nil && n != len(data) {
var chunk int
chunk, err = r.buf.Read(data[n:])
n += chunk
}
if len(data) == n {
return nil
}
return err
}
func (r *Reader) readUint32() uint32 {
data := r.fourBytes
if r.err = r.read(data); r.err != nil {
return 0
}
return asUint32(data, r.flip)
}
func (r *Reader) readInt32() int32 {
data := r.fourBytes
if r.err = r.read(data); r.err != nil {
return 0
}
return int32(asUint32(data, r.flip))
}
func (r *Reader) readUint16() uint16 {
data := r.twoBytes
if r.err = r.read(data); r.err != nil {
return 0
}
return asUint16(data, r.flip)
}
// Writer writes a pcap file.
type Writer struct {
writer io.Writer
buf []byte
}
// NewWriter creates a Writer that stores output in an io.Writer.
// The FileHeader is written immediately.
func NewWriter(writer io.Writer, header *FileHeader) (*Writer, error) {
w := &Writer{
writer: writer,
buf: make([]byte, 24),
}
binary.LittleEndian.PutUint32(w.buf, header.MagicNumber)
binary.LittleEndian.PutUint16(w.buf[4:], header.VersionMajor)
binary.LittleEndian.PutUint16(w.buf[6:], header.VersionMinor)
binary.LittleEndian.PutUint32(w.buf[8:], uint32(header.TimeZone))
binary.LittleEndian.PutUint32(w.buf[12:], header.SigFigs)
binary.LittleEndian.PutUint32(w.buf[16:], header.SnapLen)
binary.LittleEndian.PutUint32(w.buf[20:], header.Network)
if _, err := writer.Write(w.buf); err != nil {
return nil, err
}
return w, nil
}
// Writer writes a packet to the underlying writer.
func (w *Writer) Write(pkt *Packet) error {
binary.LittleEndian.PutUint32(w.buf, uint32(pkt.Time.Unix()))
binary.LittleEndian.PutUint32(w.buf[4:], uint32(pkt.Time.Nanosecond()))
binary.LittleEndian.PutUint32(w.buf[8:], uint32(pkt.Time.Unix()))
binary.LittleEndian.PutUint32(w.buf[12:], pkt.Len)
if _, err := w.writer.Write(w.buf[:16]); err != nil {
return err
}
_, err := w.writer.Write(pkt.Data)
return err
}
func asUint32(data []byte, flip bool) uint32 {
if flip {
return binary.BigEndian.Uint32(data)
}
return binary.LittleEndian.Uint32(data)
}
func asUint16(data []byte, flip bool) uint16 {
if flip {
return binary.BigEndian.Uint16(data)
}
return binary.LittleEndian.Uint16(data)
}

View File

@@ -1,266 +0,0 @@
// Interface to both live and offline pcap parsing.
package pcap
/*
#cgo linux LDFLAGS: -lpcap
#cgo freebsd LDFLAGS: -lpcap
#cgo darwin LDFLAGS: -lpcap
#cgo windows CFLAGS: -I C:/WpdPack/Include
#cgo windows,386 LDFLAGS: -L C:/WpdPack/Lib -lwpcap
#cgo windows,amd64 LDFLAGS: -L C:/WpdPack/Lib/x64 -lwpcap
#include <stdlib.h>
#include <pcap.h>
// Workaround for not knowing how to cast to const u_char**
int hack_pcap_next_ex(pcap_t *p, struct pcap_pkthdr **pkt_header,
u_char **pkt_data) {
return pcap_next_ex(p, pkt_header, (const u_char **)pkt_data);
}
*/
import "C"
import (
"errors"
"net"
"syscall"
"time"
"unsafe"
)
type Pcap struct {
cptr *C.pcap_t
}
type Stat struct {
PacketsReceived uint32
PacketsDropped uint32
PacketsIfDropped uint32
}
type Interface struct {
Name string
Description string
Addresses []IFAddress
// TODO: add more elements
}
type IFAddress struct {
IP net.IP
Netmask net.IPMask
// TODO: add broadcast + PtP dst ?
}
func (p *Pcap) Next() (pkt *Packet) {
rv, _ := p.NextEx()
return rv
}
// Openlive opens a device and returns a *Pcap handler
func Openlive(device string, snaplen int32, promisc bool, timeout_ms int32) (handle *Pcap, err error) {
var buf *C.char
buf = (*C.char)(C.calloc(ERRBUF_SIZE, 1))
h := new(Pcap)
var pro int32
if promisc {
pro = 1
}
dev := C.CString(device)
defer C.free(unsafe.Pointer(dev))
h.cptr = C.pcap_open_live(dev, C.int(snaplen), C.int(pro), C.int(timeout_ms), buf)
if nil == h.cptr {
handle = nil
err = errors.New(C.GoString(buf))
} else {
handle = h
}
C.free(unsafe.Pointer(buf))
return
}
func Openoffline(file string) (handle *Pcap, err error) {
var buf *C.char
buf = (*C.char)(C.calloc(ERRBUF_SIZE, 1))
h := new(Pcap)
cf := C.CString(file)
defer C.free(unsafe.Pointer(cf))
h.cptr = C.pcap_open_offline(cf, buf)
if nil == h.cptr {
handle = nil
err = errors.New(C.GoString(buf))
} else {
handle = h
}
C.free(unsafe.Pointer(buf))
return
}
func (p *Pcap) NextEx() (pkt *Packet, result int32) {
var pkthdr *C.struct_pcap_pkthdr
var buf_ptr *C.u_char
var buf unsafe.Pointer
result = int32(C.hack_pcap_next_ex(p.cptr, &pkthdr, &buf_ptr))
buf = unsafe.Pointer(buf_ptr)
if nil == buf {
return
}
pkt = new(Packet)
pkt.Time = time.Unix(int64(pkthdr.ts.tv_sec), int64(pkthdr.ts.tv_usec)*1000)
pkt.Caplen = uint32(pkthdr.caplen)
pkt.Len = uint32(pkthdr.len)
pkt.Data = C.GoBytes(buf, C.int(pkthdr.caplen))
return
}
func (p *Pcap) Close() {
C.pcap_close(p.cptr)
}
func (p *Pcap) Geterror() error {
return errors.New(C.GoString(C.pcap_geterr(p.cptr)))
}
func (p *Pcap) Getstats() (stat *Stat, err error) {
var cstats _Ctype_struct_pcap_stat
if -1 == C.pcap_stats(p.cptr, &cstats) {
return nil, p.Geterror()
}
stats := new(Stat)
stats.PacketsReceived = uint32(cstats.ps_recv)
stats.PacketsDropped = uint32(cstats.ps_drop)
stats.PacketsIfDropped = uint32(cstats.ps_ifdrop)
return stats, nil
}
func (p *Pcap) Setfilter(expr string) (err error) {
var bpf _Ctype_struct_bpf_program
cexpr := C.CString(expr)
defer C.free(unsafe.Pointer(cexpr))
if -1 == C.pcap_compile(p.cptr, &bpf, cexpr, 1, 0) {
return p.Geterror()
}
if -1 == C.pcap_setfilter(p.cptr, &bpf) {
C.pcap_freecode(&bpf)
return p.Geterror()
}
C.pcap_freecode(&bpf)
return nil
}
func Version() string {
return C.GoString(C.pcap_lib_version())
}
func (p *Pcap) Datalink() int {
return int(C.pcap_datalink(p.cptr))
}
func (p *Pcap) Setdatalink(dlt int) error {
if -1 == C.pcap_set_datalink(p.cptr, C.int(dlt)) {
return p.Geterror()
}
return nil
}
func DatalinkValueToName(dlt int) string {
if name := C.pcap_datalink_val_to_name(C.int(dlt)); name != nil {
return C.GoString(name)
}
return ""
}
func DatalinkValueToDescription(dlt int) string {
if desc := C.pcap_datalink_val_to_description(C.int(dlt)); desc != nil {
return C.GoString(desc)
}
return ""
}
func Findalldevs() (ifs []Interface, err error) {
var buf *C.char
buf = (*C.char)(C.calloc(ERRBUF_SIZE, 1))
defer C.free(unsafe.Pointer(buf))
var alldevsp *C.pcap_if_t
if -1 == C.pcap_findalldevs((**C.pcap_if_t)(&alldevsp), buf) {
return nil, errors.New(C.GoString(buf))
}
defer C.pcap_freealldevs((*C.pcap_if_t)(alldevsp))
dev := alldevsp
var i uint32
for i = 0; dev != nil; dev = (*C.pcap_if_t)(dev.next) {
i++
}
ifs = make([]Interface, i)
dev = alldevsp
for j := uint32(0); dev != nil; dev = (*C.pcap_if_t)(dev.next) {
var iface Interface
iface.Name = C.GoString(dev.name)
iface.Description = C.GoString(dev.description)
iface.Addresses = findalladdresses(dev.addresses)
// TODO: add more elements
ifs[j] = iface
j++
}
return
}
func findalladdresses(addresses *_Ctype_struct_pcap_addr) (retval []IFAddress) {
// TODO - make it support more than IPv4 and IPv6?
retval = make([]IFAddress, 0, 1)
for curaddr := addresses; curaddr != nil; curaddr = (*_Ctype_struct_pcap_addr)(curaddr.next) {
var a IFAddress
var err error
if a.IP, err = sockaddr_to_IP((*syscall.RawSockaddr)(unsafe.Pointer(curaddr.addr))); err != nil {
continue
}
if a.Netmask, err = sockaddr_to_IP((*syscall.RawSockaddr)(unsafe.Pointer(curaddr.addr))); err != nil {
continue
}
retval = append(retval, a)
}
return
}
func sockaddr_to_IP(rsa *syscall.RawSockaddr) (IP []byte, err error) {
switch rsa.Family {
case syscall.AF_INET:
pp := (*syscall.RawSockaddrInet4)(unsafe.Pointer(rsa))
IP = make([]byte, 4)
for i := 0; i < len(IP); i++ {
IP[i] = pp.Addr[i]
}
return
case syscall.AF_INET6:
pp := (*syscall.RawSockaddrInet6)(unsafe.Pointer(rsa))
IP = make([]byte, 16)
for i := 0; i < len(IP); i++ {
IP[i] = pp.Addr[i]
}
return
}
err = errors.New("Unsupported address type")
return
}
func (p *Pcap) Inject(data []byte) (err error) {
buf := (*C.char)(C.malloc((C.size_t)(len(data))))
for i := 0; i < len(data); i++ {
*(*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(buf)) + uintptr(i))) = data[i]
}
if -1 == C.pcap_sendpacket(p.cptr, (*C.u_char)(unsafe.Pointer(buf)), (C.int)(len(data))) {
err = p.Geterror()
}
C.free(unsafe.Pointer(buf))
return
}

View File

@@ -1,49 +0,0 @@
package main
import (
"flag"
"fmt"
"os"
"runtime/pprof"
"time"
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/akrennmair/gopcap"
)
func main() {
var filename *string = flag.String("file", "", "filename")
var decode *bool = flag.Bool("d", false, "If true, decode each packet")
var cpuprofile *string = flag.String("cpuprofile", "", "filename")
flag.Parse()
h, err := pcap.Openoffline(*filename)
if err != nil {
fmt.Printf("Couldn't create pcap reader: %v", err)
}
if *cpuprofile != "" {
if out, err := os.Create(*cpuprofile); err == nil {
pprof.StartCPUProfile(out)
defer func() {
pprof.StopCPUProfile()
out.Close()
}()
} else {
panic(err)
}
}
i, nilPackets := 0, 0
start := time.Now()
for pkt, code := h.NextEx(); code != -2; pkt, code = h.NextEx() {
if pkt == nil {
nilPackets++
} else if *decode {
pkt.Decode()
}
i++
}
duration := time.Since(start)
fmt.Printf("Took %v to process %v packets, %v per packet, %d nil packets\n", duration, i, duration/time.Duration(i), nilPackets)
}

View File

@@ -1,96 +0,0 @@
package main
// Parses a pcap file, writes it back to disk, then verifies the files
// are the same.
import (
"bufio"
"flag"
"fmt"
"io"
"os"
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/akrennmair/gopcap"
)
var input *string = flag.String("input", "", "input file")
var output *string = flag.String("output", "", "output file")
var decode *bool = flag.Bool("decode", false, "print decoded packets")
func copyPcap(dest, src string) {
f, err := os.Open(src)
if err != nil {
fmt.Printf("couldn't open %q: %v\n", src, err)
return
}
defer f.Close()
reader, err := pcap.NewReader(bufio.NewReader(f))
if err != nil {
fmt.Printf("couldn't create reader: %v\n", err)
return
}
w, err := os.Create(dest)
if err != nil {
fmt.Printf("couldn't open %q: %v\n", dest, err)
return
}
defer w.Close()
buf := bufio.NewWriter(w)
writer, err := pcap.NewWriter(buf, &reader.Header)
if err != nil {
fmt.Printf("couldn't create writer: %v\n", err)
return
}
for {
pkt := reader.Next()
if pkt == nil {
break
}
if *decode {
pkt.Decode()
fmt.Println(pkt.String())
}
writer.Write(pkt)
}
buf.Flush()
}
func check(dest, src string) {
f, err := os.Open(src)
if err != nil {
fmt.Printf("couldn't open %q: %v\n", src, err)
return
}
defer f.Close()
freader := bufio.NewReader(f)
g, err := os.Open(dest)
if err != nil {
fmt.Printf("couldn't open %q: %v\n", src, err)
return
}
defer g.Close()
greader := bufio.NewReader(g)
for {
fb, ferr := freader.ReadByte()
gb, gerr := greader.ReadByte()
if ferr == io.EOF && gerr == io.EOF {
break
}
if fb == gb {
continue
}
fmt.Println("FAIL")
return
}
fmt.Println("PASS")
}
func main() {
flag.Parse()
copyPcap(*output, *input)
check(*output, *input)
}

View File

@@ -1,82 +0,0 @@
package main
import (
"flag"
"fmt"
"time"
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/akrennmair/gopcap"
)
func min(x uint32, y uint32) uint32 {
if x < y {
return x
}
return y
}
func main() {
var device *string = flag.String("d", "", "device")
var file *string = flag.String("r", "", "file")
var expr *string = flag.String("e", "", "filter expression")
flag.Parse()
var h *pcap.Pcap
var err error
ifs, err := pcap.Findalldevs()
if len(ifs) == 0 {
fmt.Printf("Warning: no devices found : %s\n", err)
} else {
for i := 0; i < len(ifs); i++ {
fmt.Printf("dev %d: %s (%s)\n", i+1, ifs[i].Name, ifs[i].Description)
}
}
if *device != "" {
h, err = pcap.Openlive(*device, 65535, true, 0)
if h == nil {
fmt.Printf("Openlive(%s) failed: %s\n", *device, err)
return
}
} else if *file != "" {
h, err = pcap.Openoffline(*file)
if h == nil {
fmt.Printf("Openoffline(%s) failed: %s\n", *file, err)
return
}
} else {
fmt.Printf("usage: pcaptest [-d <device> | -r <file>]\n")
return
}
defer h.Close()
fmt.Printf("pcap version: %s\n", pcap.Version())
if *expr != "" {
fmt.Printf("Setting filter: %s\n", *expr)
err := h.Setfilter(*expr)
if err != nil {
fmt.Printf("Warning: setting filter failed: %s\n", err)
}
}
for pkt := h.Next(); pkt != nil; pkt = h.Next() {
fmt.Printf("time: %d.%06d (%s) caplen: %d len: %d\nData:",
int64(pkt.Time.Second()), int64(pkt.Time.Nanosecond()),
time.Unix(int64(pkt.Time.Second()), 0).String(), int64(pkt.Caplen), int64(pkt.Len))
for i := uint32(0); i < pkt.Caplen; i++ {
if i%32 == 0 {
fmt.Printf("\n")
}
if 32 <= pkt.Data[i] && pkt.Data[i] <= 126 {
fmt.Printf("%c", pkt.Data[i])
} else {
fmt.Printf(".")
}
}
fmt.Printf("\n\n")
}
}

View File

@@ -1,121 +0,0 @@
package main
import (
"bufio"
"flag"
"fmt"
"os"
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/akrennmair/gopcap"
)
const (
TYPE_IP = 0x0800
TYPE_ARP = 0x0806
TYPE_IP6 = 0x86DD
IP_ICMP = 1
IP_INIP = 4
IP_TCP = 6
IP_UDP = 17
)
var out *bufio.Writer
var errout *bufio.Writer
func main() {
var device *string = flag.String("i", "", "interface")
var snaplen *int = flag.Int("s", 65535, "snaplen")
var hexdump *bool = flag.Bool("X", false, "hexdump")
expr := ""
out = bufio.NewWriter(os.Stdout)
errout = bufio.NewWriter(os.Stderr)
flag.Usage = func() {
fmt.Fprintf(errout, "usage: %s [ -i interface ] [ -s snaplen ] [ -X ] [ expression ]\n", os.Args[0])
errout.Flush()
os.Exit(1)
}
flag.Parse()
if len(flag.Args()) > 0 {
expr = flag.Arg(0)
}
if *device == "" {
devs, err := pcap.Findalldevs()
if err != nil {
fmt.Fprintf(errout, "tcpdump: couldn't find any devices: %s\n", err)
}
if 0 == len(devs) {
flag.Usage()
}
*device = devs[0].Name
}
h, err := pcap.Openlive(*device, int32(*snaplen), true, 0)
if h == nil {
fmt.Fprintf(errout, "tcpdump: %s\n", err)
errout.Flush()
return
}
defer h.Close()
if expr != "" {
ferr := h.Setfilter(expr)
if ferr != nil {
fmt.Fprintf(out, "tcpdump: %s\n", ferr)
out.Flush()
}
}
for pkt := h.Next(); pkt != nil; pkt = h.Next() {
pkt.Decode()
fmt.Fprintf(out, "%s\n", pkt.String())
if *hexdump {
Hexdump(pkt)
}
out.Flush()
}
}
func min(a, b int) int {
if a < b {
return a
}
return b
}
func Hexdump(pkt *pcap.Packet) {
for i := 0; i < len(pkt.Data); i += 16 {
Dumpline(uint32(i), pkt.Data[i:min(i+16, len(pkt.Data))])
}
}
func Dumpline(addr uint32, line []byte) {
fmt.Fprintf(out, "\t0x%04x: ", int32(addr))
var i uint16
for i = 0; i < 16 && i < uint16(len(line)); i++ {
if i%2 == 0 {
out.WriteString(" ")
}
fmt.Fprintf(out, "%02x", line[i])
}
for j := i; j <= 16; j++ {
if j%2 == 0 {
out.WriteString(" ")
}
out.WriteString(" ")
}
out.WriteString(" ")
for i = 0; i < 16 && i < uint16(len(line)); i++ {
if line[i] >= 32 && line[i] <= 126 {
fmt.Fprintf(out, "%c", line[i])
} else {
out.WriteString(".")
}
}
out.WriteString("\n")
}

View File

@@ -4,7 +4,7 @@
// Original code is based on code by RogerV in the golang-nuts thread:
// https://groups.google.com/group/golang-nuts/browse_thread/thread/40cc41e9d9fc9247
// +build darwin freebsd linux netbsd openbsd solaris
// +build darwin freebsd linux netbsd openbsd
package speakeasy
@@ -19,8 +19,9 @@ import (
const sttyArg0 = "/bin/stty"
var (
sttyArgvEOff = []string{"stty", "-echo"}
sttyArgvEOn = []string{"stty", "echo"}
sttyArgvEOff []string = []string{"stty", "-echo"}
sttyArgvEOn []string = []string{"stty", "echo"}
ws syscall.WaitStatus = 0
)
// getPassword gets input hidden from the terminal from a user. This is
@@ -46,11 +47,10 @@ func getPassword() (password string, err error) {
}
// Turn on the terminal echo and stop listening for signals.
defer signal.Stop(sig)
defer close(brk)
defer echoOn(fd)
syscall.Wait4(pid, nil, 0, nil)
syscall.Wait4(pid, &ws, 0, nil)
line, err := readline()
if err == nil {
@@ -76,7 +76,7 @@ func echoOn(fd []uintptr) {
// Turn on the terminal echo.
pid, e := syscall.ForkExec(sttyArg0, sttyArgvEOn, &syscall.ProcAttr{Dir: "", Files: fd})
if e == nil {
syscall.Wait4(pid, nil, 0, nil)
syscall.Wait4(pid, &ws, 0, nil)
}
}

View File

@@ -1,4 +1,3 @@
*.prof
*.test
*.swp
/bin/

View File

@@ -1,8 +1,8 @@
Bolt [![Build Status](https://drone.io/github.com/boltdb/bolt/status.png)](https://drone.io/github.com/boltdb/bolt/latest) [![Coverage Status](https://coveralls.io/repos/boltdb/bolt/badge.png?branch=master)](https://coveralls.io/r/boltdb/bolt?branch=master) [![GoDoc](https://godoc.org/github.com/boltdb/bolt?status.png)](https://godoc.org/github.com/boltdb/bolt) ![Version](http://img.shields.io/badge/version-1.0-green.png)
====
Bolt is a pure Go key/value store inspired by [Howard Chu's][hyc_symas]
[LMDB project][lmdb]. The goal of the project is to provide a simple,
Bolt is a pure Go key/value store inspired by [Howard Chu's][hyc_symas] and
the [LMDB project][lmdb]. The goal of the project is to provide a simple,
fast, and reliable database for projects that don't require a full database
server such as Postgres or MySQL.
@@ -87,11 +87,6 @@ are not thread safe. To work with data in multiple goroutines you must start
a transaction for each one or use locking to ensure only one goroutine accesses
a transaction at a time. Creating transaction from the `DB` is thread safe.
Read-only transactions and read-write transactions should not depend on one
another and generally shouldn't be opened simultaneously in the same goroutine.
This can cause a deadlock as the read-write transaction needs to periodically
re-map the data file but it cannot do so while a read-only transaction is open.
#### Read-write transactions
@@ -180,8 +175,8 @@ and then safely close your transaction if an error is returned. This is the
recommended way to use Bolt transactions.
However, sometimes you may want to manually start and end your transactions.
You can use the `Tx.Begin()` function directly but **please** be sure to close
the transaction.
You can use the `Tx.Begin()` function directly but _please_ be sure to close the
transaction.
```go
// Start a writable transaction.
@@ -256,7 +251,7 @@ db.View(func(tx *bolt.Tx) error {
```
The `Get()` function does not return an error because its operation is
guaranteed to work (unless there is some kind of system failure). If the key
guarenteed to work (unless there is some kind of system failure). If the key
exists then it will return its byte slice value. If it doesn't exist then it
will return `nil`. It's important to note that you can have a zero-length value
set to a key which is different than the key not existing.
@@ -268,50 +263,6 @@ transaction is open. If you need to use a value outside of the transaction
then you must use `copy()` to copy it to another byte slice.
### Autoincrementing integer for the bucket
By using the NextSequence() function, you can let Bolt determine a sequence
which can be used as the unique identifier for your key/value pairs. See the
example below.
```go
// CreateUser saves u to the store. The new user ID is set on u once the data is persisted.
func (s *Store) CreateUser(u *User) error {
return s.db.Update(func(tx *bolt.Tx) error {
// Retrieve the users bucket.
// This should be created when the DB is first opened.
b := tx.Bucket([]byte("users"))
// Generate ID for the user.
// This returns an error only if the Tx is closed or not writeable.
// That can't happen in an Update() call so I ignore the error check.
id, _ = b.NextSequence()
u.ID = int(id)
// Marshal user data into bytes.
buf, err := json.Marshal(u)
if err != nil {
return err
}
// Persist bytes to users bucket.
return b.Put(itob(u.ID), buf)
})
}
// itob returns an 8-byte big endian representation of v.
func itob(v int) []byte {
b := make([]byte, 8)
binary.BigEndian.PutUint64(b, uint64(v))
return b
}
type User struct {
ID int
...
}
```
### Iterating over keys
Bolt stores its keys in byte-sorted order within a bucket. This makes sequential
@@ -426,11 +377,8 @@ func (*Bucket) DeleteBucket(key []byte) error
Bolt is a single file so it's easy to backup. You can use the `Tx.WriteTo()`
function to write a consistent view of the database to a writer. If you call
this from a read-only transaction, it will perform a hot backup and not block
your other database reads and writes.
By default, it will use a regular file handle which will utilize the operating
system's page cache. See the [`Tx`](https://godoc.org/github.com/boltdb/bolt#Tx)
documentation for information about optimizing for larger-than-RAM datasets.
your other database reads and writes. It will also use `O_DIRECT` when available
to prevent page cache trashing.
One common use case is to backup over HTTP so you can use tools like `cURL` to
do database backups:
@@ -498,21 +446,6 @@ It's also useful to pipe these stats to a service such as statsd for monitoring
or to provide an HTTP endpoint that will perform a fixed-length sample.
### Read-Only Mode
Sometimes it is useful to create a shared, read-only Bolt database. To this,
set the `Options.ReadOnly` flag when opening your database. Read-only mode
uses a shared lock to allow multiple processes to read from the database but
it will block any processes from opening the database in read-write mode.
```go
db, err := bolt.Open("my.db", 0666, &bolt.Options{ReadOnly: true})
if err != nil {
log.Fatal(err)
}
```
## Resources
For more information on getting started with Bolt, check out the following articles:
@@ -547,7 +480,7 @@ they are libraries bundled into the application, however, their underlying
structure is a log-structured merge-tree (LSM tree). An LSM tree optimizes
random writes by using a write ahead log and multi-tiered, sorted files called
SSTables. Bolt uses a B+tree internally and only a single file. Both approaches
have trade-offs.
have trade offs.
If you require a high random write throughput (>10,000 w/sec) or you need to use
spinning disks then LevelDB could be a good choice. If your application is
@@ -615,14 +548,7 @@ Here are a few things to note when evaluating and using Bolt:
can in memory and will release memory as needed to other processes. This means
that Bolt can show very high memory usage when working with large databases.
However, this is expected and the OS will release memory as needed. Bolt can
handle databases much larger than the available physical RAM, provided its
memory-map fits in the process virtual address space. It may be problematic
on 32-bits systems.
* The data structures in the Bolt database are memory mapped so the data file
will be endian specific. This means that you cannot copy a Bolt file from a
little endian machine to a big endian machine and have it work. For most
users this is not a concern since most modern CPUs are little endian.
handle databases much larger than the available physical RAM.
* Because of the way pages are laid out on disk, Bolt cannot truncate data files
and return free pages back to the disk. Instead, Bolt maintains a free list
@@ -636,62 +562,12 @@ Here are a few things to note when evaluating and using Bolt:
[page-allocation]: https://github.com/boltdb/bolt/issues/308#issuecomment-74811638
## Reading the Source
Bolt is a relatively small code base (<3KLOC) for an embedded, serializable,
transactional key/value database so it can be a good starting point for people
interested in how databases work.
The best places to start are the main entry points into Bolt:
- `Open()` - Initializes the reference to the database. It's responsible for
creating the database if it doesn't exist, obtaining an exclusive lock on the
file, reading the meta pages, & memory-mapping the file.
- `DB.Begin()` - Starts a read-only or read-write transaction depending on the
value of the `writable` argument. This requires briefly obtaining the "meta"
lock to keep track of open transactions. Only one read-write transaction can
exist at a time so the "rwlock" is acquired during the life of a read-write
transaction.
- `Bucket.Put()` - Writes a key/value pair into a bucket. After validating the
arguments, a cursor is used to traverse the B+tree to the page and position
where they key & value will be written. Once the position is found, the bucket
materializes the underlying page and the page's parent pages into memory as
"nodes". These nodes are where mutations occur during read-write transactions.
These changes get flushed to disk during commit.
- `Bucket.Get()` - Retrieves a key/value pair from a bucket. This uses a cursor
to move to the page & position of a key/value pair. During a read-only
transaction, the key and value data is returned as a direct reference to the
underlying mmap file so there's no allocation overhead. For read-write
transactions, this data may reference the mmap file or one of the in-memory
node values.
- `Cursor` - This object is simply for traversing the B+tree of on-disk pages
or in-memory nodes. It can seek to a specific key, move to the first or last
value, or it can move forward or backward. The cursor handles the movement up
and down the B+tree transparently to the end user.
- `Tx.Commit()` - Converts the in-memory dirty nodes and the list of free pages
into pages to be written to disk. Writing to disk then occurs in two phases.
First, the dirty pages are written to disk and an `fsync()` occurs. Second, a
new meta page with an incremented transaction ID is written and another
`fsync()` occurs. This two phase write ensures that partially written data
pages are ignored in the event of a crash since the meta page pointing to them
is never written. Partially written meta pages are invalidated because they
are written with a checksum.
If you have additional notes that could be helpful for others, please submit
them via pull request.
## Other Projects Using Bolt
Below is a list of public, open source projects that use Bolt:
* [Operation Go: A Routine Mission](http://gocode.io) - An online programming game for Golang using Bolt for user accounts and a leaderboard.
* [Bazil](https://bazil.org/) - A file system that lets your data reside where it is most convenient for it to reside.
* [Bazil](https://github.com/bazillion/bazil) - A file system that lets your data reside where it is most convenient for it to reside.
* [DVID](https://github.com/janelia-flyem/dvid) - Added Bolt as optional storage engine and testing it against Basho-tuned leveldb.
* [Skybox Analytics](https://github.com/skybox/skybox) - A standalone funnel analysis tool for web analytics.
* [Scuttlebutt](https://github.com/benbjohnson/scuttlebutt) - Uses Bolt to store and process all Twitter mentions of GitHub projects.
@@ -711,14 +587,5 @@ Below is a list of public, open source projects that use Bolt:
* [SkyDB](https://github.com/skydb/sky) - Behavioral analytics database.
* [Seaweed File System](https://github.com/chrislusf/weed-fs) - Highly scalable distributed key~file system with O(1) disk read.
* [InfluxDB](http://influxdb.com) - Scalable datastore for metrics, events, and real-time analytics.
* [Freehold](http://tshannon.bitbucket.org/freehold/) - An open, secure, and lightweight platform for your files and data.
* [Prometheus Annotation Server](https://github.com/oliver006/prom_annotation_server) - Annotation server for PromDash & Prometheus service monitoring system.
* [Consul](https://github.com/hashicorp/consul) - Consul is service discovery and configuration made easy. Distributed, highly available, and datacenter-aware.
* [Kala](https://github.com/ajvb/kala) - Kala is a modern job scheduler optimized to run on a single node. It is persistent, JSON over HTTP API, ISO 8601 duration notation, and dependent jobs.
* [drive](https://github.com/odeke-em/drive) - drive is an unofficial Google Drive command line client for \*NIX operating systems.
* [stow](https://github.com/djherbis/stow) - a persistence manager for objects
backed by boltdb.
* [buckets](https://github.com/joyrexus/buckets) - a bolt wrapper streamlining
simple tx and key scans.
If you are using Bolt in a project please send a pull request to add it to the list.

View File

@@ -20,9 +20,6 @@ import (
// take permanent effect only after a successful return is seen in
// caller.
//
// The maximum batch size and delay can be adjusted with DB.MaxBatchSize
// and DB.MaxBatchDelay, respectively.
//
// Batch is only useful when there are multiple goroutines calling it.
func (db *DB) Batch(fn func(*Tx) error) error {
errCh := make(chan error, 1)

View File

@@ -1,9 +0,0 @@
// +build arm64
package bolt
// maxMapSize represents the largest mmap size supported by Bolt.
const maxMapSize = 0xFFFFFFFFFFFF // 256TB
// maxAllocSize is the size used when creating array pointers.
const maxAllocSize = 0x7FFFFFFF

View File

@@ -4,6 +4,8 @@ import (
"syscall"
)
var odirect = syscall.O_DIRECT
// fdatasync flushes written data to a file descriptor.
func fdatasync(db *DB) error {
return syscall.Fdatasync(int(db.file.Fd()))

View File

@@ -11,6 +11,8 @@ const (
msInvalidate // invalidate cached data
)
var odirect int
func msync(db *DB) error {
_, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(db.data)), uintptr(db.datasz), msInvalidate)
if errno != 0 {

View File

@@ -1,9 +0,0 @@
// +build ppc64le
package bolt
// maxMapSize represents the largest mmap size supported by Bolt.
const maxMapSize = 0xFFFFFFFFFFFF // 256TB
// maxAllocSize is the size used when creating array pointers.
const maxAllocSize = 0x7FFFFFFF

View File

@@ -1,9 +0,0 @@
// +build s390x
package bolt
// maxMapSize represents the largest mmap size supported by Bolt.
const maxMapSize = 0xFFFFFFFFFFFF // 256TB
// maxAllocSize is the size used when creating array pointers.
const maxAllocSize = 0x7FFFFFFF

View File

@@ -1,4 +1,4 @@
// +build !windows,!plan9,!solaris
// +build !windows,!plan9
package bolt
@@ -11,7 +11,7 @@ import (
)
// flock acquires an advisory lock on a file descriptor.
func flock(f *os.File, exclusive bool, timeout time.Duration) error {
func flock(f *os.File, timeout time.Duration) error {
var t time.Time
for {
// If we're beyond our timeout then return an error.
@@ -21,13 +21,9 @@ func flock(f *os.File, exclusive bool, timeout time.Duration) error {
} else if timeout > 0 && time.Since(t) > timeout {
return ErrTimeout
}
flag := syscall.LOCK_SH
if exclusive {
flag = syscall.LOCK_EX
}
// Otherwise attempt to obtain an exclusive lock.
err := syscall.Flock(int(f.Fd()), flag|syscall.LOCK_NB)
err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
if err == nil {
return nil
} else if err != syscall.EWOULDBLOCK {
@@ -48,26 +44,19 @@ func funlock(f *os.File) error {
func mmap(db *DB, sz int) error {
// Truncate and fsync to ensure file size metadata is flushed.
// https://github.com/boltdb/bolt/issues/284
if !db.NoGrowSync && !db.readOnly {
if err := db.file.Truncate(int64(sz)); err != nil {
return fmt.Errorf("file resize error: %s", err)
}
if err := db.file.Sync(); err != nil {
return fmt.Errorf("file sync error: %s", err)
}
if err := db.file.Truncate(int64(sz)); err != nil {
return fmt.Errorf("file resize error: %s", err)
}
if err := db.file.Sync(); err != nil {
return fmt.Errorf("file sync error: %s", err)
}
// Map the data file to memory.
b, err := syscall.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags)
b, err := syscall.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED)
if err != nil {
return err
}
// Advise the kernel that the mmap is accessed randomly.
if err := madvise(b, syscall.MADV_RANDOM); err != nil {
return fmt.Errorf("madvise: %s", err)
}
// Save the original byte slice and convert to a byte array pointer.
db.dataref = b
db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0]))
@@ -89,12 +78,3 @@ func munmap(db *DB) error {
db.datasz = 0
return err
}
// NOTE: This function is copied from stdlib because it is not available on darwin.
func madvise(b []byte, advice int) (err error) {
_, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), uintptr(len(b)), uintptr(advice))
if e1 != 0 {
err = e1
}
return
}

View File

@@ -1,101 +0,0 @@
package bolt
import (
"fmt"
"os"
"syscall"
"time"
"unsafe"
"github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/sys/unix"
)
// flock acquires an advisory lock on a file descriptor.
func flock(f *os.File, exclusive bool, timeout time.Duration) error {
var t time.Time
for {
// If we're beyond our timeout then return an error.
// This can only occur after we've attempted a flock once.
if t.IsZero() {
t = time.Now()
} else if timeout > 0 && time.Since(t) > timeout {
return ErrTimeout
}
var lock syscall.Flock_t
lock.Start = 0
lock.Len = 0
lock.Pid = 0
lock.Whence = 0
lock.Pid = 0
if exclusive {
lock.Type = syscall.F_WRLCK
} else {
lock.Type = syscall.F_RDLCK
}
err := syscall.FcntlFlock(f.Fd(), syscall.F_SETLK, &lock)
if err == nil {
return nil
} else if err != syscall.EAGAIN {
return err
}
// Wait for a bit and try again.
time.Sleep(50 * time.Millisecond)
}
}
// funlock releases an advisory lock on a file descriptor.
func funlock(f *os.File) error {
var lock syscall.Flock_t
lock.Start = 0
lock.Len = 0
lock.Type = syscall.F_UNLCK
lock.Whence = 0
return syscall.FcntlFlock(uintptr(f.Fd()), syscall.F_SETLK, &lock)
}
// mmap memory maps a DB's data file.
func mmap(db *DB, sz int) error {
// Truncate and fsync to ensure file size metadata is flushed.
// https://github.com/boltdb/bolt/issues/284
if !db.NoGrowSync && !db.readOnly {
if err := db.file.Truncate(int64(sz)); err != nil {
return fmt.Errorf("file resize error: %s", err)
}
if err := db.file.Sync(); err != nil {
return fmt.Errorf("file sync error: %s", err)
}
}
// Map the data file to memory.
b, err := unix.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags)
if err != nil {
return err
}
// Advise the kernel that the mmap is accessed randomly.
if err := unix.Madvise(b, syscall.MADV_RANDOM); err != nil {
return fmt.Errorf("madvise: %s", err)
}
// Save the original byte slice and convert to a byte array pointer.
db.dataref = b
db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0]))
db.datasz = sz
return nil
}
// munmap unmaps a DB's data file from memory.
func munmap(db *DB) error {
// Ignore the unmap if we have no mapped data.
if db.dataref == nil {
return nil
}
// Unmap using the original byte slice.
err := unix.Munmap(db.dataref)
db.dataref = nil
db.data = nil
db.datasz = 0
return err
}

View File

@@ -8,37 +8,7 @@ import (
"unsafe"
)
// LockFileEx code derived from golang build filemutex_windows.go @ v1.5.1
var (
modkernel32 = syscall.NewLazyDLL("kernel32.dll")
procLockFileEx = modkernel32.NewProc("LockFileEx")
procUnlockFileEx = modkernel32.NewProc("UnlockFileEx")
)
const (
// see https://msdn.microsoft.com/en-us/library/windows/desktop/aa365203(v=vs.85).aspx
flagLockExclusive = 2
flagLockFailImmediately = 1
// see https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382(v=vs.85).aspx
errLockViolation syscall.Errno = 0x21
)
func lockFileEx(h syscall.Handle, flags, reserved, locklow, lockhigh uint32, ol *syscall.Overlapped) (err error) {
r, _, err := procLockFileEx.Call(uintptr(h), uintptr(flags), uintptr(reserved), uintptr(locklow), uintptr(lockhigh), uintptr(unsafe.Pointer(ol)))
if r == 0 {
return err
}
return nil
}
func unlockFileEx(h syscall.Handle, reserved, locklow, lockhigh uint32, ol *syscall.Overlapped) (err error) {
r, _, err := procUnlockFileEx.Call(uintptr(h), uintptr(reserved), uintptr(locklow), uintptr(lockhigh), uintptr(unsafe.Pointer(ol)), 0)
if r == 0 {
return err
}
return nil
}
var odirect int
// fdatasync flushes written data to a file descriptor.
func fdatasync(db *DB) error {
@@ -46,47 +16,21 @@ func fdatasync(db *DB) error {
}
// flock acquires an advisory lock on a file descriptor.
func flock(f *os.File, exclusive bool, timeout time.Duration) error {
var t time.Time
for {
// If we're beyond our timeout then return an error.
// This can only occur after we've attempted a flock once.
if t.IsZero() {
t = time.Now()
} else if timeout > 0 && time.Since(t) > timeout {
return ErrTimeout
}
var flag uint32 = flagLockFailImmediately
if exclusive {
flag |= flagLockExclusive
}
err := lockFileEx(syscall.Handle(f.Fd()), flag, 0, 1, 0, &syscall.Overlapped{})
if err == nil {
return nil
} else if err != errLockViolation {
return err
}
// Wait for a bit and try again.
time.Sleep(50 * time.Millisecond)
}
func flock(f *os.File, _ time.Duration) error {
return nil
}
// funlock releases an advisory lock on a file descriptor.
func funlock(f *os.File) error {
return unlockFileEx(syscall.Handle(f.Fd()), 0, 1, 0, &syscall.Overlapped{})
return nil
}
// mmap memory maps a DB's data file.
// Based on: https://github.com/edsrzf/mmap-go
func mmap(db *DB, sz int) error {
if !db.readOnly {
// Truncate the database to the size of the mmap.
if err := db.file.Truncate(int64(sz)); err != nil {
return fmt.Errorf("truncate: %s", err)
}
// Truncate the database to the size of the mmap.
if err := db.file.Truncate(int64(sz)); err != nil {
return fmt.Errorf("truncate: %s", err)
}
// Open a file mapping handle.

View File

@@ -2,6 +2,8 @@
package bolt
var odirect int
// fdatasync flushes written data to a file descriptor.
func fdatasync(db *DB) error {
return db.file.Sync()

View File

@@ -11,7 +11,7 @@ const (
MaxKeySize = 32768
// MaxValueSize is the maximum length of a value, in bytes.
MaxValueSize = (1 << 31) - 2
MaxValueSize = 4294967295
)
const (
@@ -99,7 +99,6 @@ func (b *Bucket) Cursor() *Cursor {
// Bucket retrieves a nested bucket by name.
// Returns nil if the bucket does not exist.
// The bucket instance is only valid for the lifetime of the transaction.
func (b *Bucket) Bucket(name []byte) *Bucket {
if b.buckets != nil {
if child := b.buckets[string(name)]; child != nil {
@@ -149,7 +148,6 @@ func (b *Bucket) openBucket(value []byte) *Bucket {
// CreateBucket creates a new bucket at the given key and returns the new bucket.
// Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long.
// The bucket instance is only valid for the lifetime of the transaction.
func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
if b.tx.db == nil {
return nil, ErrTxClosed
@@ -194,7 +192,6 @@ func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
// CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it.
// Returns an error if the bucket name is blank, or if the bucket name is too long.
// The bucket instance is only valid for the lifetime of the transaction.
func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
child, err := b.CreateBucket(key)
if err == ErrBucketExists {
@@ -273,7 +270,6 @@ func (b *Bucket) Get(key []byte) []byte {
// Put sets the value for a key in the bucket.
// If the key exist then its previous value will be overwritten.
// Supplied value must remain valid for the life of the transaction.
// Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large.
func (b *Bucket) Put(key []byte, value []byte) error {
if b.tx.db == nil {
@@ -350,8 +346,7 @@ func (b *Bucket) NextSequence() (uint64, error) {
// ForEach executes a function for each key/value pair in a bucket.
// If the provided function returns an error then the iteration is stopped and
// the error is returned to the caller. The provided function must not modify
// the bucket; this will result in undefined behavior.
// the error is returned to the caller.
func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
if b.tx.db == nil {
return ErrTxClosed

View File

@@ -253,7 +253,7 @@ func TestBucket_Delete_FreelistOverflow(t *testing.T) {
b := tx.Bucket([]byte("0"))
c := b.Cursor()
for k, _ := c.First(); k != nil; k, _ = c.Next() {
c.Delete()
b.Delete(k)
}
return nil
})
@@ -640,22 +640,6 @@ func TestBucket_Put_KeyTooLarge(t *testing.T) {
})
}
// Ensure that an error is returned when inserting a value that's too large.
func TestBucket_Put_ValueTooLarge(t *testing.T) {
if os.Getenv("DRONE") == "true" {
t.Skip("not enough RAM for test")
}
db := NewTestDB()
defer db.Close()
db.Update(func(tx *bolt.Tx) error {
tx.CreateBucket([]byte("widgets"))
err := tx.Bucket([]byte("widgets")).Put([]byte("foo"), make([]byte, bolt.MaxValueSize+1))
equals(t, err, bolt.ErrValueTooLarge)
return nil
})
}
// Ensure a bucket can calculate stats.
func TestBucket_Stats(t *testing.T) {
db := NewTestDB()

View File

@@ -344,7 +344,7 @@ func (cmd *DumpCommand) Run(args ...string) error {
for i, pageID := range pageIDs {
// Print a separator.
if i > 0 {
fmt.Fprintln(cmd.Stdout, "===============================================")
fmt.Fprintln(cmd.Stdout, "===============================================\n")
}
// Print page to stdout.
@@ -465,7 +465,7 @@ func (cmd *PageCommand) Run(args ...string) error {
for i, pageID := range pageIDs {
// Print a separator.
if i > 0 {
fmt.Fprintln(cmd.Stdout, "===============================================")
fmt.Fprintln(cmd.Stdout, "===============================================\n")
}
// Retrieve page info and page size.
@@ -917,7 +917,7 @@ func (cmd *BenchCommand) Run(args ...string) error {
// Write to the database.
var results BenchResults
if err := cmd.runWrites(db, options, &results); err != nil {
return fmt.Errorf("write: %v", err)
return fmt.Errorf("write: ", err)
}
// Read from the database.

View File

@@ -34,13 +34,6 @@ func (c *Cursor) First() (key []byte, value []byte) {
p, n := c.bucket.pageNode(c.bucket.root)
c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
c.first()
// If we land on an empty page then move to the next value.
// https://github.com/boltdb/bolt/issues/450
if c.stack[len(c.stack)-1].count() == 0 {
c.next()
}
k, v, flags := c.keyValue()
if (flags & uint32(bucketLeafFlag)) != 0 {
return k, nil
@@ -216,37 +209,28 @@ func (c *Cursor) last() {
// next moves to the next leaf element and returns the key and value.
// If the cursor is at the last leaf element then it stays there and returns nil.
func (c *Cursor) next() (key []byte, value []byte, flags uint32) {
for {
// Attempt to move over one element until we're successful.
// Move up the stack as we hit the end of each page in our stack.
var i int
for i = len(c.stack) - 1; i >= 0; i-- {
elem := &c.stack[i]
if elem.index < elem.count()-1 {
elem.index++
break
}
// Attempt to move over one element until we're successful.
// Move up the stack as we hit the end of each page in our stack.
var i int
for i = len(c.stack) - 1; i >= 0; i-- {
elem := &c.stack[i]
if elem.index < elem.count()-1 {
elem.index++
break
}
// If we've hit the root page then stop and return. This will leave the
// cursor on the last element of the last page.
if i == -1 {
return nil, nil, 0
}
// Otherwise start from where we left off in the stack and find the
// first element of the first leaf page.
c.stack = c.stack[:i+1]
c.first()
// If this is an empty page then restart and move back up the stack.
// https://github.com/boltdb/bolt/issues/450
if c.stack[len(c.stack)-1].count() == 0 {
continue
}
return c.keyValue()
}
// If we've hit the root page then stop and return. This will leave the
// cursor on the last element of the last page.
if i == -1 {
return nil, nil, 0
}
// Otherwise start from where we left off in the stack and find the
// first element of the first leaf page.
c.stack = c.stack[:i+1]
c.first()
return c.keyValue()
}
// search recursively performs a binary search against a given page/node until it finds a given key.

View File

@@ -303,49 +303,6 @@ func TestCursor_Restart(t *testing.T) {
tx.Rollback()
}
// Ensure that a cursor can skip over empty pages that have been deleted.
func TestCursor_First_EmptyPages(t *testing.T) {
db := NewTestDB()
defer db.Close()
// Create 1000 keys in the "widgets" bucket.
db.Update(func(tx *bolt.Tx) error {
b, err := tx.CreateBucket([]byte("widgets"))
if err != nil {
t.Fatal(err)
}
for i := 0; i < 1000; i++ {
if err := b.Put(u64tob(uint64(i)), []byte{}); err != nil {
t.Fatal(err)
}
}
return nil
})
// Delete half the keys and then try to iterate.
db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("widgets"))
for i := 0; i < 600; i++ {
if err := b.Delete(u64tob(uint64(i))); err != nil {
t.Fatal(err)
}
}
c := b.Cursor()
var n int
for k, _ := c.First(); k != nil; k, _ = c.Next() {
n++
}
if n != 400 {
t.Fatalf("unexpected key count: %d", n)
}
return nil
})
}
// Ensure that a Tx can iterate over all elements in a bucket.
func TestCursor_QuickCheck(t *testing.T) {
f := func(items testdata) bool {

View File

@@ -55,18 +55,6 @@ type DB struct {
// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
NoSync bool
// When true, skips the truncate call when growing the database.
// Setting this to true is only safe on non-ext3/ext4 systems.
// Skipping truncation avoids preallocation of hard drive space and
// bypasses a truncate() and fsync() syscall on remapping.
//
// https://github.com/boltdb/bolt/issues/284
NoGrowSync bool
// If you want to read the entire database fast, you can set MmapFlag to
// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
MmapFlags int
// MaxBatchSize is the maximum size of a batch. Default value is
// copied from DefaultMaxBatchSize in Open.
//
@@ -108,10 +96,6 @@ type DB struct {
ops struct {
writeAt func(b []byte, off int64) (n int, err error)
}
// Read only mode.
// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
readOnly bool
}
// Path returns the path to currently open database file.
@@ -139,35 +123,24 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
if options == nil {
options = DefaultOptions
}
db.NoGrowSync = options.NoGrowSync
db.MmapFlags = options.MmapFlags
// Set default values for later DB operations.
db.MaxBatchSize = DefaultMaxBatchSize
db.MaxBatchDelay = DefaultMaxBatchDelay
flag := os.O_RDWR
if options.ReadOnly {
flag = os.O_RDONLY
db.readOnly = true
}
// Open data file and separate sync handler for metadata writes.
db.path = path
var err error
if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil {
if db.file, err = os.OpenFile(db.path, os.O_RDWR|os.O_CREATE, mode); err != nil {
_ = db.close()
return nil, err
}
// Lock file so that other processes using Bolt in read-write mode cannot
// use the database at the same time. This would cause corruption since
// the two processes would write meta pages and free pages separately.
// The database file is locked exclusively (only one process can grab the lock)
// if !options.ReadOnly.
// The database file is locked using the shared lock (more than one process may
// hold a lock at the same time) otherwise (options.ReadOnly is set).
if err := flock(db.file, !db.readOnly, options.Timeout); err != nil {
// Lock file so that other processes using Bolt cannot use the database
// at the same time. This would cause corruption since the two processes
// would write meta pages and free pages separately.
if err := flock(db.file, options.Timeout); err != nil {
_ = db.close()
return nil, err
}
@@ -274,8 +247,8 @@ func (db *DB) munmap() error {
// of the database. The minimum size is 1MB and doubles until it reaches 1GB.
// Returns an error if the new mmap size is greater than the max allowed.
func (db *DB) mmapSize(size int) (int, error) {
// Double the size from 32KB until 1GB.
for i := uint(15); i <= 30; i++ {
// Double the size from 1MB until 1GB.
for i := uint(20); i <= 30; i++ {
if size <= 1<<i {
return 1 << i, nil
}
@@ -356,15 +329,8 @@ func (db *DB) init() error {
// Close releases all database resources.
// All transactions must be closed before closing the database.
func (db *DB) Close() error {
db.rwlock.Lock()
defer db.rwlock.Unlock()
db.metalock.Lock()
defer db.metalock.Unlock()
db.mmaplock.RLock()
defer db.mmaplock.RUnlock()
return db.close()
}
@@ -384,11 +350,8 @@ func (db *DB) close() error {
// Close file handles.
if db.file != nil {
// No need to unlock read-only file.
if !db.readOnly {
// Unlock the file.
_ = funlock(db.file)
}
// Unlock the file.
_ = funlock(db.file)
// Close the file descriptor.
if err := db.file.Close(); err != nil {
@@ -406,11 +369,6 @@ func (db *DB) close() error {
// will cause the calls to block and be serialized until the current write
// transaction finishes.
//
// Transactions should not be depedent on one another. Opening a read
// transaction and a write transaction in the same goroutine can cause the
// writer to deadlock because the database periodically needs to re-mmap itself
// as it grows and it cannot do that while a read transaction is open.
//
// IMPORTANT: You must close read-only transactions after you are finished or
// else the database will not reclaim old pages.
func (db *DB) Begin(writable bool) (*Tx, error) {
@@ -459,11 +417,6 @@ func (db *DB) beginTx() (*Tx, error) {
}
func (db *DB) beginRWTx() (*Tx, error) {
// If the database was opened with Options.ReadOnly, return an error.
if db.readOnly {
return nil, ErrDatabaseReadOnly
}
// Obtain writer lock. This is released by the transaction when it closes.
// This enforces only one writer transaction at a time.
db.rwlock.Lock()
@@ -594,12 +547,6 @@ func (db *DB) View(fn func(*Tx) error) error {
return nil
}
// Sync executes fdatasync() against the database file handle.
//
// This is not necessary under normal operation, however, if you use NoSync
// then it allows you to force the database file to sync against the disk.
func (db *DB) Sync() error { return fdatasync(db) }
// Stats retrieves ongoing performance stats for the database.
// This is only updated when a transaction closes.
func (db *DB) Stats() Stats {
@@ -660,33 +607,18 @@ func (db *DB) allocate(count int) (*page, error) {
return p, nil
}
func (db *DB) IsReadOnly() bool {
return db.readOnly
}
// Options represents the options that can be set when opening a database.
type Options struct {
// Timeout is the amount of time to wait to obtain a file lock.
// When set to zero it will wait indefinitely. This option is only
// available on Darwin and Linux.
Timeout time.Duration
// Sets the DB.NoGrowSync flag before memory mapping the file.
NoGrowSync bool
// Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to
// grab a shared lock (UNIX).
ReadOnly bool
// Sets the DB.MmapFlags flag before memory mapping the file.
MmapFlags int
}
// DefaultOptions represent the options used if nil options are passed into Open().
// No timeout is used which will cause Bolt to wait indefinitely for a lock.
var DefaultOptions = &Options{
Timeout: 0,
NoGrowSync: false,
Timeout: 0,
}
// Stats represents statistics about the database.

View File

@@ -39,8 +39,8 @@ func TestOpen(t *testing.T) {
// Ensure that opening an already open database file will timeout.
func TestOpen_Timeout(t *testing.T) {
if runtime.GOOS == "solaris" {
t.Skip("solaris fcntl locks don't support intra-process locking")
if runtime.GOOS == "windows" {
t.Skip("timeout not supported on windows")
}
path := tempfile()
@@ -63,8 +63,8 @@ func TestOpen_Timeout(t *testing.T) {
// Ensure that opening an already open database file will wait until its closed.
func TestOpen_Wait(t *testing.T) {
if runtime.GOOS == "solaris" {
t.Skip("solaris fcntl locks don't support intra-process locking")
if runtime.GOOS == "windows" {
t.Skip("timeout not supported on windows")
}
path := tempfile()
@@ -224,80 +224,6 @@ func TestDB_Open_FileTooSmall(t *testing.T) {
equals(t, errors.New("file size too small"), err)
}
// Ensure that a database can be opened in read-only mode by multiple processes
// and that a database can not be opened in read-write mode and in read-only
// mode at the same time.
func TestOpen_ReadOnly(t *testing.T) {
if runtime.GOOS == "solaris" {
t.Skip("solaris fcntl locks don't support intra-process locking")
}
bucket, key, value := []byte(`bucket`), []byte(`key`), []byte(`value`)
path := tempfile()
defer os.Remove(path)
// Open in read-write mode.
db, err := bolt.Open(path, 0666, nil)
ok(t, db.Update(func(tx *bolt.Tx) error {
b, err := tx.CreateBucket(bucket)
if err != nil {
return err
}
return b.Put(key, value)
}))
assert(t, db != nil, "")
assert(t, !db.IsReadOnly(), "")
ok(t, err)
ok(t, db.Close())
// Open in read-only mode.
db0, err := bolt.Open(path, 0666, &bolt.Options{ReadOnly: true})
ok(t, err)
defer db0.Close()
// Opening in read-write mode should return an error.
_, err = bolt.Open(path, 0666, &bolt.Options{Timeout: time.Millisecond * 100})
assert(t, err != nil, "")
// And again (in read-only mode).
db1, err := bolt.Open(path, 0666, &bolt.Options{ReadOnly: true})
ok(t, err)
defer db1.Close()
// Verify both read-only databases are accessible.
for _, db := range []*bolt.DB{db0, db1} {
// Verify is is in read only mode indeed.
assert(t, db.IsReadOnly(), "")
// Read-only databases should not allow updates.
assert(t,
bolt.ErrDatabaseReadOnly == db.Update(func(*bolt.Tx) error {
panic(`should never get here`)
}),
"")
// Read-only databases should not allow beginning writable txns.
_, err = db.Begin(true)
assert(t, bolt.ErrDatabaseReadOnly == err, "")
// Verify the data.
ok(t, db.View(func(tx *bolt.Tx) error {
b := tx.Bucket(bucket)
if b == nil {
return fmt.Errorf("expected bucket `%s`", string(bucket))
}
got := string(b.Get(key))
expected := string(value)
if got != expected {
return fmt.Errorf("expected `%s`, got `%s`", expected, got)
}
return nil
}))
}
}
// TODO(benbjohnson): Test corruption at every byte of the first two pages.
// Ensure that a database cannot open a transaction when it's not open.
@@ -328,49 +254,6 @@ func TestDB_BeginRW_Closed(t *testing.T) {
assert(t, tx == nil, "")
}
func TestDB_Close_PendingTx_RW(t *testing.T) { testDB_Close_PendingTx(t, true) }
func TestDB_Close_PendingTx_RO(t *testing.T) { testDB_Close_PendingTx(t, false) }
// Ensure that a database cannot close while transactions are open.
func testDB_Close_PendingTx(t *testing.T, writable bool) {
db := NewTestDB()
defer db.Close()
// Start transaction.
tx, err := db.Begin(true)
if err != nil {
t.Fatal(err)
}
// Open update in separate goroutine.
done := make(chan struct{})
go func() {
db.Close()
close(done)
}()
// Ensure database hasn't closed.
time.Sleep(100 * time.Millisecond)
select {
case <-done:
t.Fatal("database closed too early")
default:
}
// Commit transaction.
if err := tx.Commit(); err != nil {
t.Fatal(err)
}
// Ensure database closed now.
time.Sleep(100 * time.Millisecond)
select {
case <-done:
default:
t.Fatal("database did not close")
}
}
// Ensure a database can provide a transactional block.
func TestDB_Update(t *testing.T) {
db := NewTestDB()
@@ -616,7 +499,7 @@ func TestDB_Consistency(t *testing.T) {
})
}
// Ensure that DB stats can be subtracted from one another.
// Ensure that DB stats can be substracted from one another.
func TestDBStats_Sub(t *testing.T) {
var a, b bolt.Stats
a.TxStats.PageCount = 3
@@ -795,7 +678,7 @@ func (db *TestDB) PrintStats() {
// MustCheck runs a consistency check on the database and panics if any errors are found.
func (db *TestDB) MustCheck() {
db.Update(func(tx *bolt.Tx) error {
db.View(func(tx *bolt.Tx) error {
// Collect all the errors.
var errors []error
for err := range tx.Check() {

View File

@@ -36,10 +36,6 @@ var (
// ErrTxClosed is returned when committing or rolling back a transaction
// that has already been committed or rolled back.
ErrTxClosed = errors.New("tx closed")
// ErrDatabaseReadOnly is returned when a mutating transaction is started on a
// read-only database.
ErrDatabaseReadOnly = errors.New("database is in read-only mode")
)
// These errors can occur when putting or deleting a value or a bucket.

View File

@@ -48,14 +48,15 @@ func (f *freelist) pending_count() int {
// all returns a list of all free ids and all pending ids in one sorted list.
func (f *freelist) all() []pgid {
m := make(pgids, 0)
ids := make([]pgid, len(f.ids))
copy(ids, f.ids)
for _, list := range f.pending {
m = append(m, list...)
ids = append(ids, list...)
}
sort.Sort(m)
return pgids(f.ids).merge(m)
sort.Sort(pgids(ids))
return ids
}
// allocate returns the starting page id of a contiguous list of pages of a given size.
@@ -126,17 +127,15 @@ func (f *freelist) free(txid txid, p *page) {
// release moves all page ids for a transaction id (or older) to the freelist.
func (f *freelist) release(txid txid) {
m := make(pgids, 0)
for tid, ids := range f.pending {
if tid <= txid {
// Move transaction's pending pages to the available freelist.
// Don't remove from the cache since the page is still free.
m = append(m, ids...)
f.ids = append(f.ids, ids...)
delete(f.pending, tid)
}
}
sort.Sort(m)
f.ids = pgids(f.ids).merge(m)
sort.Sort(pgids(f.ids))
}
// rollback removes the pages from a given pending tx.

View File

@@ -1,9 +1,7 @@
package bolt
import (
"math/rand"
"reflect"
"sort"
"testing"
"unsafe"
)
@@ -129,28 +127,3 @@ func TestFreelist_write(t *testing.T) {
t.Fatalf("exp=%v; got=%v", exp, f2.ids)
}
}
func Benchmark_FreelistRelease10K(b *testing.B) { benchmark_FreelistRelease(b, 10000) }
func Benchmark_FreelistRelease100K(b *testing.B) { benchmark_FreelistRelease(b, 100000) }
func Benchmark_FreelistRelease1000K(b *testing.B) { benchmark_FreelistRelease(b, 1000000) }
func Benchmark_FreelistRelease10000K(b *testing.B) { benchmark_FreelistRelease(b, 10000000) }
func benchmark_FreelistRelease(b *testing.B, size int) {
ids := randomPgids(size)
pending := randomPgids(len(ids) / 400)
b.ResetTimer()
for i := 0; i < b.N; i++ {
f := &freelist{ids: ids, pending: map[txid][]pgid{1: pending}}
f.release(1)
}
}
func randomPgids(n int) []pgid {
rand.Seed(42)
pgids := make(pgids, n)
for i := range pgids {
pgids[i] = pgid(rand.Int63())
}
sort.Sort(pgids)
return pgids
}

View File

@@ -221,20 +221,11 @@ func (n *node) write(p *page) {
_assert(elem.pgid != p.id, "write: circular dependency occurred")
}
// If the length of key+value is larger than the max allocation size
// then we need to reallocate the byte array pointer.
//
// See: https://github.com/boltdb/bolt/pull/335
klen, vlen := len(item.key), len(item.value)
if len(b) < klen+vlen {
b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
}
// Write data for the element to the end of the page.
copy(b[0:], item.key)
b = b[klen:]
b = b[len(item.key):]
copy(b[0:], item.value)
b = b[vlen:]
b = b[len(item.value):]
}
// DEBUG ONLY: n.dump()

View File

@@ -3,7 +3,6 @@ package bolt
import (
"fmt"
"os"
"sort"
"unsafe"
)
@@ -97,7 +96,7 @@ type branchPageElement struct {
// key returns a byte slice of the node key.
func (n *branchPageElement) key() []byte {
buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize]
return buf[n.pos : n.pos+n.ksize]
}
// leafPageElement represents a node on a leaf page.
@@ -111,13 +110,13 @@ type leafPageElement struct {
// key returns a byte slice of the node key.
func (n *leafPageElement) key() []byte {
buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize]
return buf[n.pos : n.pos+n.ksize]
}
// value returns a byte slice of the node value.
func (n *leafPageElement) value() []byte {
buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos+n.ksize]))[:n.vsize]
return buf[n.pos+n.ksize : n.pos+n.ksize+n.vsize]
}
// PageInfo represents human readable information about a page.
@@ -133,40 +132,3 @@ type pgids []pgid
func (s pgids) Len() int { return len(s) }
func (s pgids) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s pgids) Less(i, j int) bool { return s[i] < s[j] }
// merge returns the sorted union of a and b.
func (a pgids) merge(b pgids) pgids {
// Return the opposite slice if one is nil.
if len(a) == 0 {
return b
} else if len(b) == 0 {
return a
}
// Create a list to hold all elements from both lists.
merged := make(pgids, 0, len(a)+len(b))
// Assign lead to the slice with a lower starting value, follow to the higher value.
lead, follow := a, b
if b[0] < a[0] {
lead, follow = b, a
}
// Continue while there are elements in the lead.
for len(lead) > 0 {
// Merge largest prefix of lead that is ahead of follow[0].
n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] })
merged = append(merged, lead[:n]...)
if n >= len(lead) {
break
}
// Swap lead and follow.
lead, follow = follow, lead[n:]
}
// Append what's left in follow.
merged = append(merged, follow...)
return merged
}

View File

@@ -1,10 +1,7 @@
package bolt
import (
"reflect"
"sort"
"testing"
"testing/quick"
)
// Ensure that the page type can be returned in human readable format.
@@ -30,43 +27,3 @@ func TestPage_typ(t *testing.T) {
func TestPage_dump(t *testing.T) {
(&page{id: 256}).hexdump(16)
}
func TestPgids_merge(t *testing.T) {
a := pgids{4, 5, 6, 10, 11, 12, 13, 27}
b := pgids{1, 3, 8, 9, 25, 30}
c := a.merge(b)
if !reflect.DeepEqual(c, pgids{1, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30}) {
t.Errorf("mismatch: %v", c)
}
a = pgids{4, 5, 6, 10, 11, 12, 13, 27, 35, 36}
b = pgids{8, 9, 25, 30}
c = a.merge(b)
if !reflect.DeepEqual(c, pgids{4, 5, 6, 8, 9, 10, 11, 12, 13, 25, 27, 30, 35, 36}) {
t.Errorf("mismatch: %v", c)
}
}
func TestPgids_merge_quick(t *testing.T) {
if err := quick.Check(func(a, b pgids) bool {
// Sort incoming lists.
sort.Sort(a)
sort.Sort(b)
// Merge the two lists together.
got := a.merge(b)
// The expected value should be the two lists combined and sorted.
exp := append(a, b...)
sort.Sort(exp)
if !reflect.DeepEqual(exp, got) {
t.Errorf("\nexp=%+v\ngot=%+v\n", exp, got)
return false
}
return true
}, nil); err != nil {
t.Fatal(err)
}
}

View File

@@ -29,14 +29,6 @@ type Tx struct {
pages map[pgid]*page
stats TxStats
commitHandlers []func()
// WriteFlag specifies the flag for write-related methods like WriteTo().
// Tx opens the database file with the specified flag to copy the data.
//
// By default, the flag is unset, which works well for mostly in-memory
// workloads. For databases that are much larger than available RAM,
// set the flag to syscall.O_DIRECT to avoid trashing the page cache.
WriteFlag int
}
// init initializes the transaction.
@@ -95,21 +87,18 @@ func (tx *Tx) Stats() TxStats {
// Bucket retrieves a bucket by name.
// Returns nil if the bucket does not exist.
// The bucket instance is only valid for the lifetime of the transaction.
func (tx *Tx) Bucket(name []byte) *Bucket {
return tx.root.Bucket(name)
}
// CreateBucket creates a new bucket.
// Returns an error if the bucket already exists, if the bucket name is blank, or if the bucket name is too long.
// The bucket instance is only valid for the lifetime of the transaction.
func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) {
return tx.root.CreateBucket(name)
}
// CreateBucketIfNotExists creates a new bucket if it doesn't already exist.
// Returns an error if the bucket name is blank, or if the bucket name is too long.
// The bucket instance is only valid for the lifetime of the transaction.
func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) {
return tx.root.CreateBucketIfNotExists(name)
}
@@ -138,8 +127,7 @@ func (tx *Tx) OnCommit(fn func()) {
}
// Commit writes all changes to disk and updates the meta page.
// Returns an error if a disk write error occurs, or if Commit is
// called on a read-only transaction.
// Returns an error if a disk write error occurs.
func (tx *Tx) Commit() error {
_assert(!tx.managed, "managed tx commit not allowed")
if tx.db == nil {
@@ -215,8 +203,7 @@ func (tx *Tx) Commit() error {
return nil
}
// Rollback closes the transaction and ignores all previous updates. Read-only
// transactions must be rolled back and not committed.
// Rollback closes the transaction and ignores all previous updates.
func (tx *Tx) Rollback() error {
_assert(!tx.managed, "managed tx rollback not allowed")
if tx.db == nil {
@@ -247,8 +234,7 @@ func (tx *Tx) close() {
var freelistPendingN = tx.db.freelist.pending_count()
var freelistAlloc = tx.db.freelist.size()
// Remove transaction ref & writer lock.
tx.db.rwtx = nil
// Remove writer lock.
tx.db.rwlock.Unlock()
// Merge statistics.
@@ -262,12 +248,7 @@ func (tx *Tx) close() {
} else {
tx.db.removeTx(tx)
}
// Clear all references.
tx.db = nil
tx.meta = nil
tx.root = Bucket{tx: tx}
tx.pages = nil
}
// Copy writes the entire database to a writer.
@@ -280,18 +261,21 @@ func (tx *Tx) Copy(w io.Writer) error {
// WriteTo writes the entire database to a writer.
// If err == nil then exactly tx.Size() bytes will be written into the writer.
func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
// Attempt to open reader with WriteFlag
f, err := os.OpenFile(tx.db.path, os.O_RDONLY|tx.WriteFlag, 0)
if err != nil {
return 0, err
// Attempt to open reader directly.
var f *os.File
if f, err = os.OpenFile(tx.db.path, os.O_RDONLY|odirect, 0); err != nil {
// Fallback to a regular open if that doesn't work.
if f, err = os.OpenFile(tx.db.path, os.O_RDONLY, 0); err != nil {
return 0, err
}
}
defer f.Close()
// Copy the meta pages.
tx.db.metalock.Lock()
n, err = io.CopyN(w, f, int64(tx.db.pageSize*2))
tx.db.metalock.Unlock()
if err != nil {
_ = f.Close()
return n, fmt.Errorf("meta copy: %s", err)
}
@@ -299,6 +283,7 @@ func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
wn, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2))
n += wn
if err != nil {
_ = f.Close()
return n, err
}
@@ -436,39 +421,15 @@ func (tx *Tx) write() error {
// Write pages to disk in order.
for _, p := range pages {
size := (int(p.overflow) + 1) * tx.db.pageSize
buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:size]
offset := int64(p.id) * int64(tx.db.pageSize)
// Write out page in "max allocation" sized chunks.
ptr := (*[maxAllocSize]byte)(unsafe.Pointer(p))
for {
// Limit our write to our max allocation size.
sz := size
if sz > maxAllocSize-1 {
sz = maxAllocSize - 1
}
// Write chunk to disk.
buf := ptr[:sz]
if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
return err
}
// Update statistics.
tx.stats.Write++
// Exit inner for loop if we've written all the chunks.
size -= sz
if size == 0 {
break
}
// Otherwise move offset forward and move pointer to next chunk.
offset += int64(sz)
ptr = (*[maxAllocSize]byte)(unsafe.Pointer(&ptr[sz]))
if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
return err
}
}
// Ignore file sync if flag is set on DB.
// Update statistics.
tx.stats.Write++
}
if !tx.db.NoSync || IgnoreNoSync {
if err := fdatasync(tx.db); err != nil {
return err

View File

@@ -252,38 +252,6 @@ func TestTx_DeleteBucket_NotFound(t *testing.T) {
})
}
// Ensure that no error is returned when a tx.ForEach function does not return
// an error.
func TestTx_ForEach_NoError(t *testing.T) {
db := NewTestDB()
defer db.Close()
db.Update(func(tx *bolt.Tx) error {
tx.CreateBucket([]byte("widgets"))
tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
equals(t, nil, tx.ForEach(func(name []byte, b *bolt.Bucket) error {
return nil
}))
return nil
})
}
// Ensure that an error is returned when a tx.ForEach function returns an error.
func TestTx_ForEach_WithError(t *testing.T) {
db := NewTestDB()
defer db.Close()
db.Update(func(tx *bolt.Tx) error {
tx.CreateBucket([]byte("widgets"))
tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))
err := errors.New("foo")
equals(t, err, tx.ForEach(func(name []byte, b *bolt.Bucket) error {
return err
}))
return nil
})
}
// Ensure that Tx commit handlers are called after a transaction successfully commits.
func TestTx_OnCommit(t *testing.T) {
var x int

View File

@@ -0,0 +1 @@
*~

View File

@@ -0,0 +1,19 @@
# This file is like Go's AUTHORS file: it lists Copyright holders.
# The list of humans who have contributd is in the CONTRIBUTORS file.
#
# To contribute to this project, because it will eventually be folded
# back in to Go itself, you need to submit a CLA:
#
# http://golang.org/doc/contribute.html#copyright
#
# Then you get added to CONTRIBUTORS and you or your company get added
# to the AUTHORS file.
Blake Mizerany <blake.mizerany@gmail.com> github=bmizerany
Daniel Morsing <daniel.morsing@gmail.com> github=DanielMorsing
Gabriel Aszalos <gabriel.aszalos@gmail.com> github=gbbr
Google, Inc.
Keith Rarick <kr@xph.us> github=kr
Matthew Keenan <tank.en.mate@gmail.com> <github@mattkeenan.net> github=mattkeenan
Matt Layher <mdlayher@gmail.com> github=mdlayher
Tatsuhiro Tsujikawa <tatsuhiro.t@gmail.com> github=tatsuhiro-t

View File

@@ -0,0 +1,19 @@
# This file is like Go's CONTRIBUTORS file: it lists humans.
# The list of copyright holders (which may be companies) are in the AUTHORS file.
#
# To contribute to this project, because it will eventually be folded
# back in to Go itself, you need to submit a CLA:
#
# http://golang.org/doc/contribute.html#copyright
#
# Then you get added to CONTRIBUTORS and you or your company get added
# to the AUTHORS file.
Blake Mizerany <blake.mizerany@gmail.com> github=bmizerany
Brad Fitzpatrick <bradfitz@golang.org> github=bradfitz
Daniel Morsing <daniel.morsing@gmail.com> github=DanielMorsing
Gabriel Aszalos <gabriel.aszalos@gmail.com> github=gbbr
Keith Rarick <kr@xph.us> github=kr
Matthew Keenan <tank.en.mate@gmail.com> <github@mattkeenan.net> github=mattkeenan
Matt Layher <mdlayher@gmail.com> github=mdlayher
Tatsuhiro Tsujikawa <tatsuhiro.t@gmail.com> github=tatsuhiro-t

View File

@@ -17,15 +17,8 @@ RUN apt-get install -y --no-install-recommends \
libcunit1-dev libssl-dev libxml2-dev libevent-dev \
automake autoconf
# The list of packages nghttp2 recommends for h2load:
RUN apt-get install -y --no-install-recommends make binutils \
autoconf automake autotools-dev \
libtool pkg-config zlib1g-dev libcunit1-dev libssl-dev libxml2-dev \
libev-dev libevent-dev libjansson-dev libjemalloc-dev \
cython python3.4-dev python-setuptools
# Note: setting NGHTTP2_VER before the git clone, so an old git clone isn't cached:
ENV NGHTTP2_VER 895da9a
ENV NGHTTP2_VER af24f8394e43f4
RUN cd /root && git clone https://github.com/tatsuhiro-t/nghttp2.git
WORKDIR /root/nghttp2
@@ -38,9 +31,9 @@ RUN make
RUN make install
WORKDIR /root
RUN wget http://curl.haxx.se/download/curl-7.45.0.tar.gz
RUN tar -zxvf curl-7.45.0.tar.gz
WORKDIR /root/curl-7.45.0
RUN wget http://curl.haxx.se/download/curl-7.40.0.tar.gz
RUN tar -zxvf curl-7.40.0.tar.gz
WORKDIR /root/curl-7.40.0
RUN ./configure --with-ssl --with-nghttp2=/usr/local
RUN make
RUN make install

View File

@@ -0,0 +1,5 @@
We only accept contributions from users who have gone through Go's
contribution process (signed a CLA).
Please acknowledge whether you have (and use the same email) if
sending a pull request.

View File

@@ -0,0 +1,7 @@
Copyright 2014 Google & the Go AUTHORS
Go AUTHORS are:
See https://code.google.com/p/go/source/browse/AUTHORS
Licensed under the terms of Go itself:
https://code.google.com/p/go/source/browse/LICENSE

View File

@@ -10,11 +10,8 @@ Status:
* The client work has just started but shares a lot of code
is coming along much quicker.
Docs are at https://godoc.org/golang.org/x/net/http2
Docs are at https://godoc.org/github.com/bradfitz/http2
Demo test server at https://http2.golang.org/
Help & bug reports welcome!
Contributing: https://golang.org/doc/contribute.html
Bugs: https://golang.org/issue/new?title=x/net/http2:+
Help & bug reports welcome.

View File

@@ -0,0 +1,75 @@
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package http2
import (
"errors"
)
// buffer is an io.ReadWriteCloser backed by a fixed size buffer.
// It never allocates, but moves old data as new data is written.
type buffer struct {
buf []byte
r, w int
closed bool
err error // err to return to reader
}
var (
errReadEmpty = errors.New("read from empty buffer")
errWriteFull = errors.New("write on full buffer")
)
// Read copies bytes from the buffer into p.
// It is an error to read when no data is available.
func (b *buffer) Read(p []byte) (n int, err error) {
n = copy(p, b.buf[b.r:b.w])
b.r += n
if b.closed && b.r == b.w {
err = b.err
} else if b.r == b.w && n == 0 {
err = errReadEmpty
}
return n, err
}
// Len returns the number of bytes of the unread portion of the buffer.
func (b *buffer) Len() int {
return b.w - b.r
}
// Write copies bytes from p into the buffer.
// It is an error to write more data than the buffer can hold.
func (b *buffer) Write(p []byte) (n int, err error) {
if b.closed {
return 0, errors.New("closed")
}
// Slide existing data to beginning.
if b.r > 0 && len(p) > len(b.buf)-b.w {
copy(b.buf, b.buf[b.r:b.w])
b.w -= b.r
b.r = 0
}
// Write new data.
n = copy(b.buf[b.w:], p)
b.w += n
if n < len(p) {
err = errWriteFull
}
return n, err
}
// Close marks the buffer as closed. Future calls to Write will
// return an error. Future calls to Read, once the buffer is
// empty, will return err.
func (b *buffer) Close(err error) {
if !b.closed {
b.closed = true
b.err = err
}
}

View File

@@ -0,0 +1,73 @@
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package http2
import (
"io"
"reflect"
"testing"
)
var bufferReadTests = []struct {
buf buffer
read, wn int
werr error
wp []byte
wbuf buffer
}{
{
buffer{[]byte{'a', 0}, 0, 1, false, nil},
5, 1, nil, []byte{'a'},
buffer{[]byte{'a', 0}, 1, 1, false, nil},
},
{
buffer{[]byte{'a', 0}, 0, 1, true, io.EOF},
5, 1, io.EOF, []byte{'a'},
buffer{[]byte{'a', 0}, 1, 1, true, io.EOF},
},
{
buffer{[]byte{0, 'a'}, 1, 2, false, nil},
5, 1, nil, []byte{'a'},
buffer{[]byte{0, 'a'}, 2, 2, false, nil},
},
{
buffer{[]byte{0, 'a'}, 1, 2, true, io.EOF},
5, 1, io.EOF, []byte{'a'},
buffer{[]byte{0, 'a'}, 2, 2, true, io.EOF},
},
{
buffer{[]byte{}, 0, 0, false, nil},
5, 0, errReadEmpty, []byte{},
buffer{[]byte{}, 0, 0, false, nil},
},
{
buffer{[]byte{}, 0, 0, true, io.EOF},
5, 0, io.EOF, []byte{},
buffer{[]byte{}, 0, 0, true, io.EOF},
},
}
func TestBufferRead(t *testing.T) {
for i, tt := range bufferReadTests {
read := make([]byte, tt.read)
n, err := tt.buf.Read(read)
if n != tt.wn {
t.Errorf("#%d: wn = %d want %d", i, n, tt.wn)
continue
}
if err != tt.werr {
t.Errorf("#%d: werr = %v want %v", i, err, tt.werr)
continue
}
read = read[:n]
if !reflect.DeepEqual(read, tt.wp) {
t.Errorf("#%d: read = %+v want %+v", i, read, tt.wp)
}
if !reflect.DeepEqual(tt.buf, tt.wbuf) {
t.Errorf("#%d: buf = %+v want %+v", i, tt.buf, tt.wbuf)
}
}
}

View File

@@ -1,6 +1,7 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package http2
@@ -75,16 +76,3 @@ func (e StreamError) Error() string {
type goAwayFlowError struct{}
func (goAwayFlowError) Error() string { return "connection exceeded flow control window size" }
// connErrorReason wraps a ConnectionError with an informative error about why it occurs.
// Errors of this type are only returned by the frame parser functions
// and converted into ConnectionError(ErrCodeProtocol).
type connError struct {
Code ErrCode
Reason string
}
func (e connError) Error() string {
return fmt.Sprintf("http2: connection error: %v: %v", e.Code, e.Reason)
}

View File

@@ -1,6 +1,9 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package http2

View File

@@ -1,6 +1,7 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
// Flow control

View File

@@ -1,6 +1,7 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package http2

View File

@@ -1,6 +1,7 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package http2
@@ -10,7 +11,6 @@ import (
"errors"
"fmt"
"io"
"log"
"sync"
)
@@ -85,8 +85,8 @@ const (
// Continuation Frame
FlagContinuationEndHeaders Flags = 0x4
FlagPushPromiseEndHeaders Flags = 0x4
FlagPushPromisePadded Flags = 0x8
FlagPushPromiseEndHeaders = 0x4
FlagPushPromisePadded = 0x8
)
var flagName = map[FrameType]map[Flags]string{
@@ -172,12 +172,6 @@ func (h FrameHeader) Header() FrameHeader { return h }
func (h FrameHeader) String() string {
var buf bytes.Buffer
buf.WriteString("[FrameHeader ")
h.writeDebug(&buf)
buf.WriteByte(']')
return buf.String()
}
func (h FrameHeader) writeDebug(buf *bytes.Buffer) {
buf.WriteString(h.Type.String())
if h.Flags != 0 {
buf.WriteString(" flags=")
@@ -194,14 +188,15 @@ func (h FrameHeader) writeDebug(buf *bytes.Buffer) {
if name != "" {
buf.WriteString(name)
} else {
fmt.Fprintf(buf, "0x%x", 1<<i)
fmt.Fprintf(&buf, "0x%x", 1<<i)
}
}
}
if h.StreamID != 0 {
fmt.Fprintf(buf, " stream=%d", h.StreamID)
fmt.Fprintf(&buf, " stream=%d", h.StreamID)
}
fmt.Fprintf(buf, " len=%d", h.Length)
fmt.Fprintf(&buf, " len=%d]", h.Length)
return buf.String()
}
func (h *FrameHeader) checkValid() {
@@ -261,11 +256,6 @@ type Frame interface {
type Framer struct {
r io.Reader
lastFrame Frame
errReason string
// lastHeaderStream is non-zero if the last frame was an
// unfinished HEADERS/CONTINUATION.
lastHeaderStream uint32
maxReadSize uint32
headerBuf [frameHeaderLen]byte
@@ -282,29 +272,18 @@ type Framer struct {
wbuf []byte
// AllowIllegalWrites permits the Framer's Write methods to
// write frames that do not conform to the HTTP/2 spec. This
// write frames that do not conform to the HTTP/2 spec. This
// permits using the Framer to test other HTTP/2
// implementations' conformance to the spec.
// If false, the Write methods will prefer to return an error
// rather than comply.
AllowIllegalWrites bool
// AllowIllegalReads permits the Framer's ReadFrame method
// to return non-compliant frames or frame orders.
// This is for testing and permits using the Framer to test
// other HTTP/2 implementations' conformance to the spec.
AllowIllegalReads bool
// TODO: track which type of frame & with which flags was sent
// last. Then return an error (unless AllowIllegalWrites) if
// we're in the middle of a header block and a
// non-Continuation or Continuation on a different stream is
// attempted to be written.
logReads bool
debugFramer *Framer // only use for logging written writes
debugFramerBuf *bytes.Buffer
}
func (f *Framer) startWrite(ftype FrameType, flags Flags, streamID uint32) {
@@ -332,10 +311,6 @@ func (f *Framer) endWrite() error {
byte(length>>16),
byte(length>>8),
byte(length))
if logFrameWrites {
f.logWrite()
}
n, err := f.w.Write(f.wbuf)
if err == nil && n != len(f.wbuf) {
err = io.ErrShortWrite
@@ -343,24 +318,6 @@ func (f *Framer) endWrite() error {
return err
}
func (f *Framer) logWrite() {
if f.debugFramer == nil {
f.debugFramerBuf = new(bytes.Buffer)
f.debugFramer = NewFramer(nil, f.debugFramerBuf)
f.debugFramer.logReads = false // we log it ourselves, saying "wrote" below
// Let us read anything, even if we accidentally wrote it
// in the wrong order:
f.debugFramer.AllowIllegalReads = true
}
f.debugFramerBuf.Write(f.wbuf)
fr, err := f.debugFramer.ReadFrame()
if err != nil {
log.Printf("http2: Framer %p: failed to decode just-written frame", f)
return
}
log.Printf("http2: Framer %p: wrote %v", f, summarizeFrame(fr))
}
func (f *Framer) writeByte(v byte) { f.wbuf = append(f.wbuf, v) }
func (f *Framer) writeBytes(v []byte) { f.wbuf = append(f.wbuf, v...) }
func (f *Framer) writeUint16(v uint16) { f.wbuf = append(f.wbuf, byte(v>>8), byte(v)) }
@@ -376,9 +333,8 @@ const (
// NewFramer returns a Framer that writes frames to w and reads them from r.
func NewFramer(w io.Writer, r io.Reader) *Framer {
fr := &Framer{
w: w,
r: r,
logReads: logFrameReads,
w: w,
r: r,
}
fr.getReadBuf = func(size uint32) []byte {
if cap(fr.readBuf) >= int(size) {
@@ -406,22 +362,10 @@ func (fr *Framer) SetMaxReadFrameSize(v uint32) {
// sends a frame that is larger than declared with SetMaxReadFrameSize.
var ErrFrameTooLarge = errors.New("http2: frame too large")
// terminalReadFrameError reports whether err is an unrecoverable
// error from ReadFrame and no other frames should be read.
func terminalReadFrameError(err error) bool {
if _, ok := err.(StreamError); ok {
return false
}
return err != nil
}
// ReadFrame reads a single frame. The returned Frame is only valid
// until the next call to ReadFrame.
//
// If the frame is larger than previously set with SetMaxReadFrameSize, the
// returned error is ErrFrameTooLarge. Other errors may be of type
// ConnectionError, StreamError, or anything else from from the underlying
// reader.
// If the frame is larger than previously set with SetMaxReadFrameSize,
// the returned error is ErrFrameTooLarge.
func (fr *Framer) ReadFrame() (Frame, error) {
if fr.lastFrame != nil {
fr.lastFrame.invalidate()
@@ -439,66 +383,10 @@ func (fr *Framer) ReadFrame() (Frame, error) {
}
f, err := typeFrameParser(fh.Type)(fh, payload)
if err != nil {
if ce, ok := err.(connError); ok {
return nil, fr.connError(ce.Code, ce.Reason)
}
return nil, err
}
if err := fr.checkFrameOrder(f); err != nil {
return nil, err
}
if fr.logReads {
log.Printf("http2: Framer %p: read %v", fr, summarizeFrame(f))
}
return f, nil
}
// connError returns ConnectionError(code) but first
// stashes away a public reason to the caller can optionally relay it
// to the peer before hanging up on them. This might help others debug
// their implementations.
func (fr *Framer) connError(code ErrCode, reason string) error {
fr.errReason = reason
return ConnectionError(code)
}
// checkFrameOrder reports an error if f is an invalid frame to return
// next from ReadFrame. Mostly it checks whether HEADERS and
// CONTINUATION frames are contiguous.
func (fr *Framer) checkFrameOrder(f Frame) error {
last := fr.lastFrame
fr.lastFrame = f
if fr.AllowIllegalReads {
return nil
}
fh := f.Header()
if fr.lastHeaderStream != 0 {
if fh.Type != FrameContinuation {
return fr.connError(ErrCodeProtocol,
fmt.Sprintf("got %s for stream %d; expected CONTINUATION following %s for stream %d",
fh.Type, fh.StreamID,
last.Header().Type, fr.lastHeaderStream))
}
if fh.StreamID != fr.lastHeaderStream {
return fr.connError(ErrCodeProtocol,
fmt.Sprintf("got CONTINUATION for stream %d; expected stream %d",
fh.StreamID, fr.lastHeaderStream))
}
} else if fh.Type == FrameContinuation {
return fr.connError(ErrCodeProtocol, fmt.Sprintf("unexpected CONTINUATION for stream %d", fh.StreamID))
}
switch fh.Type {
case FrameHeaders, FrameContinuation:
if fh.Flags.Has(FlagHeadersEndHeaders) {
fr.lastHeaderStream = 0
} else {
fr.lastHeaderStream = fh.StreamID
}
}
return nil
return f, nil
}
// A DataFrame conveys arbitrary, variable-length sequences of octets
@@ -529,7 +417,7 @@ func parseDataFrame(fh FrameHeader, payload []byte) (Frame, error) {
// field is 0x0, the recipient MUST respond with a
// connection error (Section 5.4.1) of type
// PROTOCOL_ERROR.
return nil, connError{ErrCodeProtocol, "DATA frame with stream ID 0"}
return nil, ConnectionError(ErrCodeProtocol)
}
f := &DataFrame{
FrameHeader: fh,
@@ -547,7 +435,7 @@ func parseDataFrame(fh FrameHeader, payload []byte) (Frame, error) {
// length of the frame payload, the recipient MUST
// treat this as a connection error.
// Filed: https://github.com/http2/http2-spec/issues/610
return nil, connError{ErrCodeProtocol, "pad size larger than data payload"}
return nil, ConnectionError(ErrCodeProtocol)
}
f.data = payload[:len(payload)-int(padSize)]
return f, nil
@@ -687,8 +575,6 @@ type PingFrame struct {
Data [8]byte
}
func (f *PingFrame) IsAck() bool { return f.Flags.Has(FlagPingAck) }
func parsePingFrame(fh FrameHeader, payload []byte) (Frame, error) {
if len(payload) != 8 {
return nil, ConnectionError(ErrCodeFrameSize)
@@ -777,7 +663,7 @@ func parseUnknownFrame(fh FrameHeader, p []byte) (Frame, error) {
// See http://http2.github.io/http2-spec/#rfc.section.6.9
type WindowUpdateFrame struct {
FrameHeader
Increment uint32 // never read with high bit set
Increment uint32
}
func parseWindowUpdateFrame(fh FrameHeader, p []byte) (Frame, error) {
@@ -854,7 +740,7 @@ func parseHeadersFrame(fh FrameHeader, p []byte) (_ Frame, err error) {
// is received whose stream identifier field is 0x0, the recipient MUST
// respond with a connection error (Section 5.4.1) of type
// PROTOCOL_ERROR.
return nil, connError{ErrCodeProtocol, "HEADERS frame with stream ID 0"}
return nil, ConnectionError(ErrCodeProtocol)
}
var padLength uint8
if fh.Flags.Has(FlagHeadersPadded) {
@@ -984,10 +870,10 @@ func (p PriorityParam) IsZero() bool {
func parsePriorityFrame(fh FrameHeader, payload []byte) (Frame, error) {
if fh.StreamID == 0 {
return nil, connError{ErrCodeProtocol, "PRIORITY frame with stream ID 0"}
return nil, ConnectionError(ErrCodeProtocol)
}
if len(payload) != 5 {
return nil, connError{ErrCodeFrameSize, fmt.Sprintf("PRIORITY frame payload size was %d; want 5", len(payload))}
return nil, ConnectionError(ErrCodeFrameSize)
}
v := binary.BigEndian.Uint32(payload[:4])
streamID := v & 0x7fffffff // mask off high bit
@@ -1057,12 +943,13 @@ type ContinuationFrame struct {
}
func parseContinuationFrame(fh FrameHeader, p []byte) (Frame, error) {
if fh.StreamID == 0 {
return nil, connError{ErrCodeProtocol, "CONTINUATION frame with stream ID 0"}
}
return &ContinuationFrame{fh, p}, nil
}
func (f *ContinuationFrame) StreamEnded() bool {
return f.FrameHeader.Flags.Has(FlagDataEndStream)
}
func (f *ContinuationFrame) HeaderBlockFragment() []byte {
f.checkValid()
return f.headerFragBuf
@@ -1224,46 +1111,3 @@ type streamEnder interface {
type headersEnder interface {
HeadersEnded() bool
}
func summarizeFrame(f Frame) string {
var buf bytes.Buffer
f.Header().writeDebug(&buf)
switch f := f.(type) {
case *SettingsFrame:
n := 0
f.ForeachSetting(func(s Setting) error {
n++
if n == 1 {
buf.WriteString(", settings:")
}
fmt.Fprintf(&buf, " %v=%v,", s.ID, s.Val)
return nil
})
if n > 0 {
buf.Truncate(buf.Len() - 1) // remove trailing comma
}
case *DataFrame:
data := f.Data()
const max = 256
if len(data) > max {
data = data[:max]
}
fmt.Fprintf(&buf, " data=%q", data)
if len(f.Data()) > max {
fmt.Fprintf(&buf, " (%d bytes omitted)", len(f.Data())-max)
}
case *WindowUpdateFrame:
if f.StreamID == 0 {
buf.WriteString(" (conn)")
}
fmt.Fprintf(&buf, " incr=%v", f.Increment)
case *PingFrame:
fmt.Fprintf(&buf, " ping=%q", f.Data[:])
case *GoAwayFrame:
fmt.Fprintf(&buf, " LastStreamID=%v ErrCode=%v Debug=%q",
f.LastStreamID, f.ErrCode, f.debugData)
case *RSTStreamFrame:
fmt.Fprintf(&buf, " ErrCode=%v", f.ErrCode)
}
return buf.String()
}

View File

@@ -6,8 +6,6 @@ package http2
import (
"bytes"
"fmt"
"io"
"reflect"
"strings"
"testing"
@@ -26,25 +24,6 @@ func TestFrameSizes(t *testing.T) {
}
}
func TestFrameTypeString(t *testing.T) {
tests := []struct {
ft FrameType
want string
}{
{FrameData, "DATA"},
{FramePing, "PING"},
{FrameGoAway, "GOAWAY"},
{0xf, "UNKNOWN_FRAME_TYPE_15"},
}
for i, tt := range tests {
got := tt.ft.String()
if got != tt.want {
t.Errorf("%d. String(FrameType %d) = %q; want %q", i, int(tt.ft), got, tt.want)
}
}
}
func TestWriteRST(t *testing.T) {
fr, buf := testFramer()
var streamID uint32 = 1<<24 + 2<<16 + 3<<8 + 4
@@ -266,7 +245,6 @@ func TestWriteContinuation(t *testing.T) {
t.Errorf("test %q: %v", tt.name, err)
continue
}
fr.AllowIllegalReads = true
f, err := fr.ReadFrame()
if err != nil {
t.Errorf("test %q: failed to read the frame back: %v", tt.name, err)
@@ -598,138 +576,3 @@ func TestWritePushPromise(t *testing.T) {
t.Fatalf("parsed back:\n%#v\nwant:\n%#v", f, want)
}
}
// test checkFrameOrder and that HEADERS and CONTINUATION frames can't be intermingled.
func TestReadFrameOrder(t *testing.T) {
head := func(f *Framer, id uint32, end bool) {
f.WriteHeaders(HeadersFrameParam{
StreamID: id,
BlockFragment: []byte("foo"), // unused, but non-empty
EndHeaders: end,
})
}
cont := func(f *Framer, id uint32, end bool) {
f.WriteContinuation(id, end, []byte("foo"))
}
tests := [...]struct {
name string
w func(*Framer)
atLeast int
wantErr string
}{
0: {
w: func(f *Framer) {
head(f, 1, true)
},
},
1: {
w: func(f *Framer) {
head(f, 1, true)
head(f, 2, true)
},
},
2: {
wantErr: "got HEADERS for stream 2; expected CONTINUATION following HEADERS for stream 1",
w: func(f *Framer) {
head(f, 1, false)
head(f, 2, true)
},
},
3: {
wantErr: "got DATA for stream 1; expected CONTINUATION following HEADERS for stream 1",
w: func(f *Framer) {
head(f, 1, false)
},
},
4: {
w: func(f *Framer) {
head(f, 1, false)
cont(f, 1, true)
head(f, 2, true)
},
},
5: {
wantErr: "got CONTINUATION for stream 2; expected stream 1",
w: func(f *Framer) {
head(f, 1, false)
cont(f, 2, true)
head(f, 2, true)
},
},
6: {
wantErr: "unexpected CONTINUATION for stream 1",
w: func(f *Framer) {
cont(f, 1, true)
},
},
7: {
wantErr: "unexpected CONTINUATION for stream 1",
w: func(f *Framer) {
cont(f, 1, false)
},
},
8: {
wantErr: "HEADERS frame with stream ID 0",
w: func(f *Framer) {
head(f, 0, true)
},
},
9: {
wantErr: "CONTINUATION frame with stream ID 0",
w: func(f *Framer) {
cont(f, 0, true)
},
},
10: {
wantErr: "unexpected CONTINUATION for stream 1",
atLeast: 5,
w: func(f *Framer) {
head(f, 1, false)
cont(f, 1, false)
cont(f, 1, false)
cont(f, 1, false)
cont(f, 1, true)
cont(f, 1, false)
},
},
}
for i, tt := range tests {
buf := new(bytes.Buffer)
f := NewFramer(buf, buf)
f.AllowIllegalWrites = true
tt.w(f)
f.WriteData(1, true, nil) // to test transition away from last step
var err error
n := 0
var log bytes.Buffer
for {
var got Frame
got, err = f.ReadFrame()
fmt.Fprintf(&log, " read %v, %v\n", got, err)
if err != nil {
break
}
n++
}
if err == io.EOF {
err = nil
}
ok := tt.wantErr == ""
if ok && err != nil {
t.Errorf("%d. after %d good frames, ReadFrame = %v; want success\n%s", i, n, err, log.Bytes())
continue
}
if !ok && err != ConnectionError(ErrCodeProtocol) {
t.Errorf("%d. after %d good frames, ReadFrame = %v; want ConnectionError(ErrCodeProtocol)\n%s", i, n, err, log.Bytes())
continue
}
if f.errReason != tt.wantErr {
t.Errorf("%d. framer eror = %q; want %q\n%s", i, f.errReason, tt.wantErr, log.Bytes())
}
if n < tt.atLeast {
t.Errorf("%d. framer only read %d frames; want at least %d\n%s", i, n, tt.atLeast, log.Bytes())
}
}
}

View File

@@ -1,6 +1,9 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
// Defensive debug-only utility to track that functions run on the
// goroutine that they're supposed to.
@@ -11,20 +14,16 @@ import (
"bytes"
"errors"
"fmt"
"os"
"runtime"
"strconv"
"sync"
)
var DebugGoroutines = os.Getenv("DEBUG_HTTP2_GOROUTINES") == "1"
var DebugGoroutines = false
type goroutineLock uint64
func newGoroutineLock() goroutineLock {
if !DebugGoroutines {
return 0
}
return goroutineLock(curGoroutineID())
}

View File

@@ -1,6 +1,9 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package http2
@@ -11,10 +14,7 @@ import (
)
func TestGoroutineLock(t *testing.T) {
oldDebug := DebugGoroutines
DebugGoroutines = true
defer func() { DebugGoroutines = oldDebug }()
g := newGoroutineLock()
g.check()

View File

@@ -0,0 +1,5 @@
h2demo.linux: h2demo.go
GOOS=linux go build --tags=h2demo -o h2demo.linux .
upload: h2demo.linux
cat h2demo.linux | go run launch.go --write_object=http2-demo-server-tls/h2demo --write_object_is_public

View File

@@ -1,6 +1,7 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
// +build h2demo
@@ -19,6 +20,7 @@ import (
"log"
"net"
"net/http"
"os/exec"
"path"
"regexp"
"runtime"
@@ -29,17 +31,14 @@ import (
"camlistore.org/pkg/googlestorage"
"camlistore.org/pkg/singleflight"
"github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/http2"
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/bradfitz/http2"
)
var (
prod = flag.Bool("prod", false, "Whether to configure itself to be the production http2.golang.org server.")
httpsAddr = flag.String("https_addr", "localhost:4430", "TLS address to listen on ('host:port' or ':port'). Required.")
httpAddr = flag.String("http_addr", "", "Plain HTTP address to listen on ('host:port', or ':port'). Empty means no HTTP.")
hostHTTP = flag.String("http_host", "", "Optional host or host:port to use for http:// links to this service. By default, this is implied from -http_addr.")
hostHTTPS = flag.String("https_host", "", "Optional host or host:port to use for http:// links to this service. By default, this is implied from -https_addr.")
openFirefox = flag.Bool("openff", false, "Open Firefox")
addr = flag.String("addr", "localhost:4430", "TLS address to listen on")
httpAddr = flag.String("httpaddr", "", "If non-empty, address to listen for regular HTTP on")
prod = flag.Bool("prod", false, "Whether to configure itself to be the production http2.golang.org server.")
)
func homeOldHTTP(w http.ResponseWriter, r *http.Request) {
@@ -52,7 +51,7 @@ func homeOldHTTP(w http.ResponseWriter, r *http.Request) {
<li>Use Firefox Nightly or go to <b>about:config</b> and enable "network.http.spdy.enabled.http2draft"</li>
<li>Use Google Chrome Canary and/or go to <b>chrome://flags/#enable-spdy4</b> to <i>Enable SPDY/4</i> (Chrome's name for HTTP/2)</li>
</ul>
<p>See code & instructions for connecting at <a href="https://github.com/golang/net/tree/master/http2">https://github.com/golang/net/tree/master/http2</a>.</p>
<p>See code & instructions for connecting at <a href="https://github.com/bradfitz/http2">https://github.com/bradfitz/http2</a>.</p>
</body></html>`)
}
@@ -73,13 +72,13 @@ href="https://http2.github.io/">HTTP/2</a> demo & interop server.</p>
<p>This server exists for others in the HTTP/2 community to test their HTTP/2 client implementations and point out flaws in our server.</p>
<p>
The code is at <a href="https://golang.org/x/net/http2">golang.org/x/net/http2</a> and
is used transparently by the Go standard library from Go 1.6 and later.
</p>
<p> The code is currently at <a
href="https://github.com/bradfitz/http2">github.com/bradfitz/http2</a>
but will move to the Go standard library at some point in the future
(enabled by default, without users needing to change their code).</p>
<p>Contact info: <i>bradfitz@golang.org</i>, or <a
href="https://golang.org/issues">file a bug</a>.</p>
href="https://github.com/bradfitz/http2/issues">file a bug</a>.</p>
<h2>Handlers for testing</h2>
<ul>
@@ -91,7 +90,6 @@ href="https://golang.org/issues">file a bug</a>.</p>
<li>GET <a href="/redirect">/redirect</a> to redirect back to / (this page)</li>
<li>GET <a href="/goroutines">/goroutines</a> to see all active goroutines in this server</li>
<li>PUT something to <a href="/crc32">/crc32</a> to get a count of number of bytes and its CRC-32</li>
<li>PUT something to <a href="/ECHO">/ECHO</a> and it will be streamed back to you capitalized</li>
</ul>
</body></html>`)
@@ -125,40 +123,6 @@ func crcHandler(w http.ResponseWriter, r *http.Request) {
}
}
type capitalizeReader struct {
r io.Reader
}
func (cr capitalizeReader) Read(p []byte) (n int, err error) {
n, err = cr.r.Read(p)
for i, b := range p[:n] {
if b >= 'a' && b <= 'z' {
p[i] = b - ('a' - 'A')
}
}
return
}
type flushWriter struct {
w io.Writer
}
func (fw flushWriter) Write(p []byte) (n int, err error) {
n, err = fw.w.Write(p)
if f, ok := fw.w.(http.Flusher); ok {
f.Flush()
}
return
}
func echoCapitalHandler(w http.ResponseWriter, r *http.Request) {
if r.Method != "PUT" {
http.Error(w, "PUT required.", 400)
return
}
io.Copy(flushWriter{w}, capitalizeReader{r.Body})
}
var (
fsGrp singleflight.Group
fsMu sync.Mutex // guards fsCache
@@ -234,7 +198,7 @@ func registerHandlers() {
tiles.ServeHTTP(w, r)
return
}
http.Redirect(w, r, "https://"+httpsHost()+"/", http.StatusFound)
http.Redirect(w, r, "https://http2.golang.org/", http.StatusFound)
return
}
if r.ProtoMajor == 1 {
@@ -252,7 +216,6 @@ func registerHandlers() {
mux2.Handle("/file/go.src.tar.gz", fileServer("https://storage.googleapis.com/golang/go1.4.1.src.tar.gz"))
mux2.HandleFunc("/reqinfo", reqInfoHandler)
mux2.HandleFunc("/crc32", crcHandler)
mux2.HandleFunc("/ECHO", echoCapitalHandler)
mux2.HandleFunc("/clockstream", clockStreamHandler)
mux2.Handle("/gophertiles", tiles)
mux2.HandleFunc("/redirect", func(w http.ResponseWriter, r *http.Request) {
@@ -324,7 +287,7 @@ func newGopherTilesHandler() http.Handler {
return
}
}
io.WriteString(w, "<html><body onload='showtimes()'>")
io.WriteString(w, "<html><body>")
fmt.Fprintf(w, "A grid of %d tiled images is below. Compare:<p>", xt*yt)
for _, ms := range []int{0, 30, 200, 1000} {
d := time.Duration(ms) * nanosPerMilli
@@ -342,24 +305,15 @@ func newGopherTilesHandler() http.Handler {
}
io.WriteString(w, "<br/>\n")
}
io.WriteString(w, `<p><div id='loadtimes'></div></p>
<script>
function showtimes() {
var times = 'Times from connection start:<br>'
times += 'DOM loaded: ' + (window.performance.timing.domContentLoadedEventEnd - window.performance.timing.connectStart) + 'ms<br>'
times += 'DOM complete (images loaded): ' + (window.performance.timing.domComplete - window.performance.timing.connectStart) + 'ms<br>'
document.getElementById('loadtimes').innerHTML = times
}
</script>
<hr><a href='/'>&lt;&lt Back to Go HTTP/2 demo server</a></body></html>`)
io.WriteString(w, "<hr><a href='/'>&lt;&lt Back to Go HTTP/2 demo server</a></body></html>")
})
}
func httpsHost() string {
if *hostHTTPS != "" {
return *hostHTTPS
if *prod {
return "http2.golang.org"
}
if v := *httpsAddr; strings.HasPrefix(v, ":") {
if v := *addr; strings.HasPrefix(v, ":") {
return "localhost" + v
} else {
return v
@@ -367,8 +321,8 @@ func httpsHost() string {
}
func httpHost() string {
if *hostHTTP != "" {
return *hostHTTP
if *prod {
return "http2.golang.org"
}
if v := *httpAddr; strings.HasPrefix(v, ":") {
return "localhost" + v
@@ -444,29 +398,29 @@ func main() {
var srv http.Server
flag.BoolVar(&http2.VerboseLogs, "verbose", false, "Verbose HTTP/2 debugging.")
flag.Parse()
srv.Addr = *httpsAddr
srv.Addr = *addr
registerHandlers()
if *prod {
*hostHTTP = "http2.golang.org"
*hostHTTPS = "http2.golang.org"
*httpAddr = "http2.golang.org"
log.Fatal(serveProd())
}
url := "https://" + httpsHost() + "/"
url := "https://" + *addr + "/"
log.Printf("Listening on " + url)
http2.ConfigureServer(&srv, &http2.Server{})
if *httpAddr != "" {
go func() {
log.Printf("Listening on http://" + httpHost() + "/ (for unencrypted HTTP/1)")
log.Fatal(http.ListenAndServe(*httpAddr, nil))
}()
go func() { log.Fatal(http.ListenAndServe(*httpAddr, nil)) }()
}
go func() {
log.Fatal(srv.ListenAndServeTLS("server.crt", "server.key"))
}()
if *openFirefox && runtime.GOOS == "darwin" {
time.Sleep(250 * time.Millisecond)
exec.Command("open", "-b", "org.mozilla.nightly", "https://localhost:4430/").Run()
}
select {}
}

View File

@@ -20,9 +20,8 @@ import (
"strings"
"time"
"golang.org/x/oauth2"
"golang.org/x/oauth2/google"
compute "google.golang.org/api/compute/v1"
"code.google.com/p/goauth2/oauth"
compute "code.google.com/p/google-api-go-client/compute/v1"
)
var (
@@ -45,18 +44,19 @@ func readFile(v string) string {
return strings.TrimSpace(string(slurp))
}
var config = &oauth2.Config{
var config = &oauth.Config{
// The client-id and secret should be for an "Installed Application" when using
// the CLI. Later we'll use a web application with a callback.
ClientID: readFile("client-id.dat"),
ClientId: readFile("client-id.dat"),
ClientSecret: readFile("client-secret.dat"),
Endpoint: google.Endpoint,
Scopes: []string{
compute.DevstorageFullControlScope,
Scope: strings.Join([]string{
compute.DevstorageFull_controlScope,
compute.ComputeScope,
"https://www.googleapis.com/auth/sqlservice",
"https://www.googleapis.com/auth/sqlservice.admin",
},
}, " "),
AuthURL: "https://accounts.google.com/o/oauth2/auth",
TokenURL: "https://accounts.google.com/o/oauth2/token",
RedirectURL: "urn:ietf:wg:oauth:2.0:oob",
}
@@ -88,32 +88,31 @@ func main() {
prefix := "https://www.googleapis.com/compute/v1/projects/" + *proj
machType := prefix + "/zones/" + *zone + "/machineTypes/" + *mach
const tokenFileName = "token.dat"
tokenFile := tokenCacheFile(tokenFileName)
tokenSource := oauth2.ReuseTokenSource(nil, tokenFile)
token, err := tokenSource.Token()
tr := &oauth.Transport{
Config: config,
}
tokenCache := oauth.CacheFile("token.dat")
token, err := tokenCache.Token()
if err != nil {
if *writeObject != "" {
log.Fatalf("Can't use --write_object without a valid token.dat file already cached.")
}
log.Printf("Error getting token from %s: %v", tokenFileName, err)
log.Printf("Error getting token from %s: %v", string(tokenCache), err)
log.Printf("Get auth code from %v", config.AuthCodeURL("my-state"))
fmt.Print("\nEnter auth code: ")
sc := bufio.NewScanner(os.Stdin)
sc.Scan()
authCode := strings.TrimSpace(sc.Text())
token, err = config.Exchange(oauth2.NoContext, authCode)
token, err = tr.Exchange(authCode)
if err != nil {
log.Fatalf("Error exchanging auth code for a token: %v", err)
}
if err := tokenFile.WriteToken(token); err != nil {
log.Fatalf("Error writing to %s: %v", tokenFileName, err)
}
tokenSource = oauth2.ReuseTokenSource(token, nil)
tokenCache.PutToken(token)
}
oauthClient := oauth2.NewClient(oauth2.NoContext, tokenSource)
tr.Token = token
oauthClient := &http.Client{Transport: tr}
if *writeObject != "" {
writeCloudStorageObject(oauthClient)
return
@@ -165,7 +164,7 @@ func main() {
Items: []*compute.MetadataItems{
{
Key: "user-data",
Value: &cloudConfig,
Value: cloudConfig,
},
},
},
@@ -185,7 +184,7 @@ func main() {
{
Email: "default",
Scopes: []string{
compute.DevstorageFullControlScope,
compute.DevstorageFull_controlScope,
compute.ComputeScope,
},
},
@@ -278,25 +277,3 @@ func writeCloudStorageObject(httpClient *http.Client) {
log.Printf("Success.")
os.Exit(0)
}
type tokenCacheFile string
func (f tokenCacheFile) Token() (*oauth2.Token, error) {
slurp, err := ioutil.ReadFile(string(f))
if err != nil {
return nil, err
}
t := new(oauth2.Token)
if err := json.Unmarshal(slurp, t); err != nil {
return nil, err
}
return t, nil
}
func (f tokenCacheFile) WriteToken(t *oauth2.Token) error {
jt, err := json.Marshal(t)
if err != nil {
return err
}
return ioutil.WriteFile(string(f), jt, 0600)
}

View File

@@ -1,6 +1,9 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package http2
@@ -57,7 +60,6 @@ func init() {
"server",
"set-cookie",
"strict-transport-security",
"trailer",
"transfer-encoding",
"user-agent",
"vary",

View File

@@ -1,6 +1,7 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package hpack

View File

@@ -1,6 +1,7 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package hpack

View File

@@ -1,6 +1,7 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
// Package hpack implements HPACK, a compression format for
// efficiently representing HTTP header fields in the context of HTTP/2.
@@ -41,14 +42,6 @@ type HeaderField struct {
Sensitive bool
}
func (hf HeaderField) String() string {
var suffix string
if hf.Sensitive {
suffix = " (sensitive)"
}
return fmt.Sprintf("header field %q = %q%s", hf.Name, hf.Value, suffix)
}
func (hf *HeaderField) size() uint32 {
// http://http2.github.io/http2-spec/compression.html#rfc.section.4.1
// "The size of the dynamic table is the sum of the size of
@@ -71,65 +64,23 @@ type Decoder struct {
dynTab dynamicTable
emit func(f HeaderField)
emitEnabled bool // whether calls to emit are enabled
maxStrLen int // 0 means unlimited
// buf is the unparsed buffer. It's only written to
// saveBuf if it was truncated in the middle of a header
// block. Because it's usually not owned, we can only
// process it under Write.
buf []byte // not owned; only valid during Write
// saveBuf is previous data passed to Write which we weren't able
// to fully parse before. Unlike buf, we own this data.
buf []byte // usually not owned
saveBuf bytes.Buffer
}
// NewDecoder returns a new decoder with the provided maximum dynamic
// table size. The emitFunc will be called for each valid field
// parsed, in the same goroutine as calls to Write, before Write returns.
func NewDecoder(maxDynamicTableSize uint32, emitFunc func(f HeaderField)) *Decoder {
func NewDecoder(maxSize uint32, emitFunc func(f HeaderField)) *Decoder {
d := &Decoder{
emit: emitFunc,
emitEnabled: true,
emit: emitFunc,
}
d.dynTab.allowedMaxSize = maxDynamicTableSize
d.dynTab.setMaxSize(maxDynamicTableSize)
d.dynTab.allowedMaxSize = maxSize
d.dynTab.setMaxSize(maxSize)
return d
}
// ErrStringLength is returned by Decoder.Write when the max string length
// (as configured by Decoder.SetMaxStringLength) would be violated.
var ErrStringLength = errors.New("hpack: string too long")
// SetMaxStringLength sets the maximum size of a HeaderField name or
// value string. If a string exceeds this length (even after any
// decompression), Write will return ErrStringLength.
// A value of 0 means unlimited and is the default from NewDecoder.
func (d *Decoder) SetMaxStringLength(n int) {
d.maxStrLen = n
}
// SetEmitFunc changes the callback used when new header fields
// are decoded.
// It must be non-nil. It does not affect EmitEnabled.
func (d *Decoder) SetEmitFunc(emitFunc func(f HeaderField)) {
d.emit = emitFunc
}
// SetEmitEnabled controls whether the emitFunc provided to NewDecoder
// should be called. The default is true.
//
// This facility exists to let servers enforce MAX_HEADER_LIST_SIZE
// while still decoding and keeping in-sync with decoder state, but
// without doing unnecessary decompression or generating unnecessary
// garbage for header fields past the limit.
func (d *Decoder) SetEmitEnabled(v bool) { d.emitEnabled = v }
// EmitEnabled reports whether calls to the emitFunc provided to NewDecoder
// are currently enabled. The default is true.
func (d *Decoder) EmitEnabled() bool { return d.emitEnabled }
// TODO: add method *Decoder.Reset(maxSize, emitFunc) to let callers re-use Decoders and their
// underlying buffers for garbage reasons.
@@ -296,23 +247,15 @@ func (d *Decoder) Write(p []byte) (n int, err error) {
for len(d.buf) > 0 {
err = d.parseHeaderFieldRepr()
if err == errNeedMore {
// Extra paranoia, making sure saveBuf won't
// get too large. All the varint and string
// reading code earlier should already catch
// overlong things and return ErrStringLength,
// but keep this as a last resort.
const varIntOverhead = 8 // conservative
if d.maxStrLen != 0 && int64(len(d.buf)) > 2*(int64(d.maxStrLen)+varIntOverhead) {
return 0, ErrStringLength
}
d.saveBuf.Write(d.buf)
return len(p), nil
}
if err != nil {
if err == errNeedMore {
err = nil
d.saveBuf.Write(d.buf)
}
break
}
}
return len(p), err
}
@@ -380,8 +323,9 @@ func (d *Decoder) parseFieldIndexed() error {
if !ok {
return DecodingError{InvalidIndexError(idx)}
}
d.emit(HeaderField{Name: hf.Name, Value: hf.Value})
d.buf = buf
return d.callEmit(HeaderField{Name: hf.Name, Value: hf.Value})
return nil
}
// (same invariants and behavior as parseHeaderFieldRepr)
@@ -393,7 +337,6 @@ func (d *Decoder) parseFieldLiteral(n uint8, it indexType) error {
}
var hf HeaderField
wantStr := d.emitEnabled || it.indexed()
if nameIdx > 0 {
ihf, ok := d.at(nameIdx)
if !ok {
@@ -401,12 +344,12 @@ func (d *Decoder) parseFieldLiteral(n uint8, it indexType) error {
}
hf.Name = ihf.Name
} else {
hf.Name, buf, err = d.readString(buf, wantStr)
hf.Name, buf, err = readString(buf)
if err != nil {
return err
}
}
hf.Value, buf, err = d.readString(buf, wantStr)
hf.Value, buf, err = readString(buf)
if err != nil {
return err
}
@@ -415,18 +358,7 @@ func (d *Decoder) parseFieldLiteral(n uint8, it indexType) error {
d.dynTab.add(hf)
}
hf.Sensitive = it.sensitive()
return d.callEmit(hf)
}
func (d *Decoder) callEmit(hf HeaderField) error {
if d.maxStrLen != 0 {
if len(hf.Name) > d.maxStrLen || len(hf.Value) > d.maxStrLen {
return ErrStringLength
}
}
if d.emitEnabled {
d.emit(hf)
}
d.emit(hf)
return nil
}
@@ -488,15 +420,7 @@ func readVarInt(n byte, p []byte) (i uint64, remain []byte, err error) {
return 0, origP, errNeedMore
}
// readString decodes an hpack string from p.
//
// wantStr is whether s will be used. If false, decompression and
// []byte->string garbage are skipped if s will be ignored
// anyway. This does mean that huffman decoding errors for non-indexed
// strings past the MAX_HEADER_LIST_SIZE are ignored, but the server
// is returning an error anyway, and because they're not indexed, the error
// won't affect the decoding state.
func (d *Decoder) readString(p []byte, wantStr bool) (s string, remain []byte, err error) {
func readString(p []byte) (s string, remain []byte, err error) {
if len(p) == 0 {
return "", p, errNeedMore
}
@@ -505,29 +429,17 @@ func (d *Decoder) readString(p []byte, wantStr bool) (s string, remain []byte, e
if err != nil {
return "", p, err
}
if d.maxStrLen != 0 && strLen > uint64(d.maxStrLen) {
return "", nil, ErrStringLength
}
if uint64(len(p)) < strLen {
return "", p, errNeedMore
}
if !isHuff {
if wantStr {
s = string(p[:strLen])
}
return s, p[strLen:], nil
return string(p[:strLen]), p[strLen:], nil
}
if wantStr {
buf := bufPool.Get().(*bytes.Buffer)
buf.Reset() // don't trust others
defer bufPool.Put(buf)
if err := huffmanDecode(buf, d.maxStrLen, p[:strLen]); err != nil {
buf.Reset()
return "", nil, err
}
s = buf.String()
buf.Reset() // be nice to GC
// TODO: optimize this garbage:
var buf bytes.Buffer
if _, err := HuffmanDecode(&buf, p[:strLen]); err != nil {
return "", nil, err
}
return s, p[strLen:], nil
return buf.String(), p[strLen:], nil
}

View File

@@ -1,6 +1,7 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package hpack
@@ -9,13 +10,11 @@ import (
"bytes"
"encoding/hex"
"fmt"
"math/rand"
"reflect"
"regexp"
"strconv"
"strings"
"testing"
"time"
)
func TestStaticTable(t *testing.T) {
@@ -583,100 +582,6 @@ func TestAppendHuffmanString(t *testing.T) {
}
}
func TestHuffmanMaxStrLen(t *testing.T) {
const msg = "Some string"
huff := AppendHuffmanString(nil, msg)
testGood := func(max int) {
var out bytes.Buffer
if err := huffmanDecode(&out, max, huff); err != nil {
t.Errorf("For maxLen=%d, unexpected error: %v", max, err)
}
if out.String() != msg {
t.Errorf("For maxLen=%d, out = %q; want %q", max, out.String(), msg)
}
}
testGood(0)
testGood(len(msg))
testGood(len(msg) + 1)
var out bytes.Buffer
if err := huffmanDecode(&out, len(msg)-1, huff); err != ErrStringLength {
t.Errorf("err = %v; want ErrStringLength", err)
}
}
func TestHuffmanRoundtripStress(t *testing.T) {
const Len = 50 // of uncompressed string
input := make([]byte, Len)
var output bytes.Buffer
var huff []byte
n := 5000
if testing.Short() {
n = 100
}
seed := time.Now().UnixNano()
t.Logf("Seed = %v", seed)
src := rand.New(rand.NewSource(seed))
var encSize int64
for i := 0; i < n; i++ {
for l := range input {
input[l] = byte(src.Intn(256))
}
huff = AppendHuffmanString(huff[:0], string(input))
encSize += int64(len(huff))
output.Reset()
if err := huffmanDecode(&output, 0, huff); err != nil {
t.Errorf("Failed to decode %q -> %q -> error %v", input, huff, err)
continue
}
if !bytes.Equal(output.Bytes(), input) {
t.Errorf("Roundtrip failure on %q -> %q -> %q", input, huff, output.Bytes())
}
}
t.Logf("Compressed size of original: %0.02f%% (%v -> %v)", 100*(float64(encSize)/(Len*float64(n))), Len*n, encSize)
}
func TestHuffmanDecodeFuzz(t *testing.T) {
const Len = 50 // of compressed
var buf, zbuf bytes.Buffer
n := 5000
if testing.Short() {
n = 100
}
seed := time.Now().UnixNano()
t.Logf("Seed = %v", seed)
src := rand.New(rand.NewSource(seed))
numFail := 0
for i := 0; i < n; i++ {
zbuf.Reset()
if i == 0 {
// Start with at least one invalid one.
zbuf.WriteString("00\x91\xff\xff\xff\xff\xc8")
} else {
for l := 0; l < Len; l++ {
zbuf.WriteByte(byte(src.Intn(256)))
}
}
buf.Reset()
if err := huffmanDecode(&buf, 0, zbuf.Bytes()); err != nil {
if err == ErrInvalidHuffman {
numFail++
continue
}
t.Errorf("Failed to decode %q: %v", zbuf.Bytes(), err)
continue
}
}
t.Logf("%0.02f%% are invalid (%d / %d)", 100*float64(numFail)/float64(n), numFail, n)
if numFail < 1 {
t.Error("expected at least one invalid huffman encoding (test starts with one)")
}
}
func TestReadVarInt(t *testing.T) {
type res struct {
i uint64
@@ -732,17 +637,6 @@ func TestReadVarInt(t *testing.T) {
}
}
// Fuzz crash, originally reported at https://github.com/bradfitz/http2/issues/56
func TestHuffmanFuzzCrash(t *testing.T) {
got, err := HuffmanDecodeToString([]byte("00\x91\xff\xff\xff\xff\xc8"))
if got != "" {
t.Errorf("Got %q; want empty string", got)
}
if err != ErrInvalidHuffman {
t.Errorf("Err = %v; want ErrInvalidHuffman", err)
}
}
func dehex(s string) []byte {
s = strings.Replace(s, " ", "", -1)
s = strings.Replace(s, "\n", "", -1)
@@ -752,62 +646,3 @@ func dehex(s string) []byte {
}
return b
}
func TestEmitEnabled(t *testing.T) {
var buf bytes.Buffer
enc := NewEncoder(&buf)
enc.WriteField(HeaderField{Name: "foo", Value: "bar"})
enc.WriteField(HeaderField{Name: "foo", Value: "bar"})
numCallback := 0
var dec *Decoder
dec = NewDecoder(8<<20, func(HeaderField) {
numCallback++
dec.SetEmitEnabled(false)
})
if !dec.EmitEnabled() {
t.Errorf("initial emit enabled = false; want true")
}
if _, err := dec.Write(buf.Bytes()); err != nil {
t.Error(err)
}
if numCallback != 1 {
t.Errorf("num callbacks = %d; want 1", numCallback)
}
if dec.EmitEnabled() {
t.Errorf("emit enabled = true; want false")
}
}
func TestSaveBufLimit(t *testing.T) {
const maxStr = 1 << 10
var got []HeaderField
dec := NewDecoder(initialHeaderTableSize, func(hf HeaderField) {
got = append(got, hf)
})
dec.SetMaxStringLength(maxStr)
var frag []byte
frag = append(frag[:0], encodeTypeByte(false, false))
frag = appendVarInt(frag, 7, 3)
frag = append(frag, "foo"...)
frag = appendVarInt(frag, 7, 3)
frag = append(frag, "bar"...)
if _, err := dec.Write(frag); err != nil {
t.Fatal(err)
}
want := []HeaderField{{Name: "foo", Value: "bar"}}
if !reflect.DeepEqual(got, want) {
t.Errorf("After small writes, got %v; want %v", got, want)
}
frag = append(frag[:0], encodeTypeByte(false, false))
frag = appendVarInt(frag, 7, maxStr*3)
frag = append(frag, make([]byte, maxStr*3)...)
_, err := dec.Write(frag)
if err != ErrStringLength {
t.Fatalf("Write error = %v; want ErrStringLength", err)
}
}

View File

@@ -1,12 +1,12 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Copyright 2014 The Go Authors.
// See https://code.google.com/p/go/source/browse/CONTRIBUTORS
// Licensed under the same terms as Go itself:
// https://code.google.com/p/go/source/browse/LICENSE
package hpack
import (
"bytes"
"errors"
"io"
"sync"
)
@@ -22,46 +22,15 @@ func HuffmanDecode(w io.Writer, v []byte) (int, error) {
buf := bufPool.Get().(*bytes.Buffer)
buf.Reset()
defer bufPool.Put(buf)
if err := huffmanDecode(buf, 0, v); err != nil {
return 0, err
}
return w.Write(buf.Bytes())
}
// HuffmanDecodeToString decodes the string in v.
func HuffmanDecodeToString(v []byte) (string, error) {
buf := bufPool.Get().(*bytes.Buffer)
buf.Reset()
defer bufPool.Put(buf)
if err := huffmanDecode(buf, 0, v); err != nil {
return "", err
}
return buf.String(), nil
}
// ErrInvalidHuffman is returned for errors found decoding
// Huffman-encoded strings.
var ErrInvalidHuffman = errors.New("hpack: invalid Huffman-encoded data")
// huffmanDecode decodes v to buf.
// If maxLen is greater than 0, attempts to write more to buf than
// maxLen bytes will return ErrStringLength.
func huffmanDecode(buf *bytes.Buffer, maxLen int, v []byte) error {
n := rootHuffmanNode
cur, nbits := uint(0), uint8(0)
for _, b := range v {
cur = cur<<8 | uint(b)
nbits += 8
for nbits >= 8 {
idx := byte(cur >> (nbits - 8))
n = n.children[idx]
if n == nil {
return ErrInvalidHuffman
}
n = n.children[byte(cur>>(nbits-8))]
if n.children == nil {
if maxLen != 0 && buf.Len() == maxLen {
return ErrStringLength
}
buf.WriteByte(n.sym)
nbits -= n.codeLen
n = rootHuffmanNode
@@ -79,7 +48,7 @@ func huffmanDecode(buf *bytes.Buffer, maxLen int, v []byte) error {
nbits -= n.codeLen
n = rootHuffmanNode
}
return nil
return w.Write(buf.Bytes())
}
type node struct {
@@ -98,10 +67,10 @@ func newInternalNode() *node {
var rootHuffmanNode = newInternalNode()
func init() {
if len(huffmanCodes) != 256 {
panic("unexpected size")
}
for i, code := range huffmanCodes {
if i > 255 {
panic("too many huffman codes")
}
addDecoderNode(byte(i), code, huffmanCodeLen[i])
}
}

Some files were not shown because too many files have changed in this diff Show More