diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 93c0ff11f..9d0064116 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -1,6 +1,6 @@ { "ImportPath": "github.com/coreos/etcd", - "GoVersion": "go1.4.2", + "GoVersion": "go1.4.1", "Packages": [ "./..." ], @@ -10,6 +10,11 @@ "Comment": "go.r60-163", "Rev": "9352842ae63ee1d7e74e074ce7bb10370c4b6b9e" }, + { + "ImportPath": "github.com/boltdb/bolt", + "Comment": "v1.0-71-g71f28ea", + "Rev": "71f28eaecbebd00604d87bb1de0dae8fcfa54bbd" + }, { "ImportPath": "github.com/codegangsta/cli", "Comment": "1.2.0-26-gf7ebb76", diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/.gitignore b/Godeps/_workspace/src/github.com/boltdb/bolt/.gitignore new file mode 100644 index 000000000..b2bb382bd --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/.gitignore @@ -0,0 +1,3 @@ +*.prof +*.test +/bin/ diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/LICENSE b/Godeps/_workspace/src/github.com/boltdb/bolt/LICENSE new file mode 100644 index 000000000..004e77fe5 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2013 Ben Johnson + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/Makefile b/Godeps/_workspace/src/github.com/boltdb/bolt/Makefile new file mode 100644 index 000000000..cfbed514b --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/Makefile @@ -0,0 +1,54 @@ +TEST=. +BENCH=. +COVERPROFILE=/tmp/c.out +BRANCH=`git rev-parse --abbrev-ref HEAD` +COMMIT=`git rev-parse --short HEAD` +GOLDFLAGS="-X main.branch $(BRANCH) -X main.commit $(COMMIT)" + +default: build + +bench: + go test -v -test.run=NOTHINCONTAINSTHIS -test.bench=$(BENCH) + +# http://cloc.sourceforge.net/ +cloc: + @cloc --not-match-f='Makefile|_test.go' . + +cover: fmt + go test -coverprofile=$(COVERPROFILE) -test.run=$(TEST) $(COVERFLAG) . + go tool cover -html=$(COVERPROFILE) + rm $(COVERPROFILE) + +cpuprofile: fmt + @go test -c + @./bolt.test -test.v -test.run=$(TEST) -test.cpuprofile cpu.prof + +# go get github.com/kisielk/errcheck +errcheck: + @echo "=== errcheck ===" + @errcheck github.com/boltdb/bolt + +fmt: + @go fmt ./... + +get: + @go get -d ./... 
+
+build: get
+	@mkdir -p bin
+	@go build -ldflags=$(GOLDFLAGS) -a -o bin/bolt ./cmd/bolt
+
+test: fmt
+	@go get github.com/stretchr/testify/assert
+	@echo "=== TESTS ==="
+	@go test -v -cover -test.run=$(TEST)
+	@echo ""
+	@echo ""
+	@echo "=== CLI ==="
+	@go test -v -test.run=$(TEST) ./cmd/bolt
+	@echo ""
+	@echo ""
+	@echo "=== RACE DETECTOR ==="
+	@go test -v -race -test.run="TestSimulate_(100op|1000op)"
+
+.PHONY: bench cloc cover cpuprofile fmt memprofile test
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/README.md b/Godeps/_workspace/src/github.com/boltdb/bolt/README.md
new file mode 100644
index 000000000..401a757c3
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/README.md
@@ -0,0 +1,591 @@
+Bolt [![Build Status](https://drone.io/github.com/boltdb/bolt/status.png)](https://drone.io/github.com/boltdb/bolt/latest) [![Coverage Status](https://coveralls.io/repos/boltdb/bolt/badge.png?branch=master)](https://coveralls.io/r/boltdb/bolt?branch=master) [![GoDoc](https://godoc.org/github.com/boltdb/bolt?status.png)](https://godoc.org/github.com/boltdb/bolt) ![Version](http://img.shields.io/badge/version-1.0-green.png)
+====
+
+Bolt is a pure Go key/value store inspired by [Howard Chu's][hyc_symas]
+[LMDB project][lmdb]. The goal of the project is to provide a simple,
+fast, and reliable database for projects that don't require a full database
+server such as Postgres or MySQL.
+
+Since Bolt is meant to be used as such a low-level piece of functionality,
+simplicity is key. The API will be small and only focus on getting values
+and setting values. That's it.
+
+[hyc_symas]: https://twitter.com/hyc_symas
+[lmdb]: http://symas.com/mdb/
+
+
+## Project Status
+
+Bolt is stable and the API is fixed. Full unit test coverage and randomized
+black box testing are used to ensure database consistency and thread safety.
+Bolt is currently in high-load production environments serving databases as
+large as 1TB. Many companies such as Shopify and Heroku use Bolt-backed
+services every day.
+
+
+## Getting Started
+
+### Installing
+
+To start using Bolt, install Go and run `go get`:
+
+```sh
+$ go get github.com/boltdb/bolt/...
+```
+
+This will retrieve the library and install the `bolt` command line utility into
+your `$GOBIN` path.
+
+
+### Opening a database
+
+The top-level object in Bolt is a `DB`. It is represented as a single file on
+your disk and represents a consistent snapshot of your data.
+
+To open your database, simply use the `bolt.Open()` function:
+
+```go
+package main
+
+import (
+	"log"
+
+	"github.com/boltdb/bolt"
+)
+
+func main() {
+	// Open the my.db data file in your current directory.
+	// It will be created if it doesn't exist.
+	db, err := bolt.Open("my.db", 0600, nil)
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer db.Close()
+
+	...
+}
+```
+
+Please note that Bolt obtains a file lock on the data file so multiple processes
+cannot open the same database at the same time. Opening an already open Bolt
+database will cause it to hang until the other process closes it. To prevent
+an indefinite wait you can pass a timeout option to the `Open()` function:
+
+```go
+db, err := bolt.Open("my.db", 0600, &bolt.Options{Timeout: 1 * time.Second})
+```
+
+
+### Transactions
+
+Bolt allows only one read-write transaction at a time but allows as many
+read-only transactions as you want at a time. Each transaction has a consistent
+view of the data as it existed when the transaction started.
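+
+As a minimal sketch of this isolation (the bucket `"MyBucket"` and key `"k"`
+are hypothetical and assumed to already exist), a read-only transaction keeps
+seeing the snapshot it started from, even if a writer commits in the meantime:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	before := tx.Bucket([]byte("MyBucket")).Get([]byte("k"))
+	// A concurrent db.Update() may commit a new value for "k" right here,
+	// but this transaction still reads from its original snapshot.
+	after := tx.Bucket([]byte("MyBucket")).Get([]byte("k"))
+	fmt.Printf("%s == %s\n", before, after) // always equal within one transaction
+	return nil
+})
+```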
+
+Individual transactions and all objects created from them (e.g. buckets, keys)
+are not thread safe. To work with data in multiple goroutines you must start
+a transaction for each one or use locking to ensure only one goroutine accesses
+a transaction at a time. Creating a transaction from the `DB` is thread safe.
+
+
+#### Read-write transactions
+
+To start a read-write transaction, you can use the `DB.Update()` function:
+
+```go
+err := db.Update(func(tx *bolt.Tx) error {
+	...
+	return nil
+})
+```
+
+Inside the closure, you have a consistent view of the database. You commit the
+transaction by returning `nil` at the end. You can also roll back the transaction
+at any point by returning an error. All database operations are allowed inside
+a read-write transaction.
+
+Always check the return error as it will report any disk failures that can cause
+your transaction to not complete. If you return an error within your closure
+it will be passed through.
+
+
+#### Read-only transactions
+
+To start a read-only transaction, you can use the `DB.View()` function:
+
+```go
+err := db.View(func(tx *bolt.Tx) error {
+	...
+	return nil
+})
+```
+
+You also get a consistent view of the database within this closure, however,
+no mutating operations are allowed within a read-only transaction. You can only
+retrieve buckets, retrieve values, and copy the database within a read-only
+transaction.
+
+
+#### Batch read-write transactions
+
+Each `DB.Update()` waits for disk to commit the writes. This overhead
+can be minimized by combining multiple updates with the `DB.Batch()`
+function:
+
+```go
+err := db.Batch(func(tx *bolt.Tx) error {
+	...
+	return nil
+})
+```
+
+Concurrent Batch calls are opportunistically combined into larger
+transactions. Batch is only useful when there are multiple goroutines
+calling it.
+
+The trade-off is that `Batch` can call the given
+function multiple times if parts of the transaction fail. The
+function must be idempotent and side effects must take effect only
+after a successful return from `DB.Batch()`.
+
+For example: don't display messages from inside the function, instead
+set variables in the enclosing scope:
+
+```go
+var id uint64
+err := db.Batch(func(tx *bolt.Tx) error {
+	// Find last key in bucket, decode as bigendian uint64, increment
+	// by one, encode back to []byte, and add new key.
+	...
+	id = newValue
+	return nil
+})
+if err != nil {
+	return ...
+}
+fmt.Printf("Allocated ID %d\n", id)
+```
+
+
+#### Managing transactions manually
+
+The `DB.View()` and `DB.Update()` functions are wrappers around the `DB.Begin()`
+function. These helper functions will start the transaction, execute a function,
+and then safely close your transaction if an error is returned. This is the
+recommended way to use Bolt transactions.
+
+However, sometimes you may want to manually start and end your transactions.
+You can use the `DB.Begin()` function directly but _please_ be sure to close the
+transaction.
+
+```go
+// Start a writable transaction.
+tx, err := db.Begin(true)
+if err != nil {
+	return err
+}
+defer tx.Rollback()
+
+// Use the transaction...
+_, err = tx.CreateBucket([]byte("MyBucket"))
+if err != nil {
+	return err
+}
+
+// Commit the transaction and check for error.
+if err := tx.Commit(); err != nil {
+	return err
+}
+```
+
+The first argument to `DB.Begin()` is a boolean stating if the transaction
+should be writable.
+
+
+### Using buckets
+
+Buckets are collections of key/value pairs within the database. All keys in a
+bucket must be unique.
+You can create a bucket using the `Tx.CreateBucket()`
+function:
+
+```go
+db.Update(func(tx *bolt.Tx) error {
+	_, err := tx.CreateBucket([]byte("MyBucket"))
+	if err != nil {
+		return fmt.Errorf("create bucket: %s", err)
+	}
+	return nil
+})
+```
+
+You can also create a bucket only if it doesn't exist by using the
+`Tx.CreateBucketIfNotExists()` function. It's a common pattern to call this
+function for all your top-level buckets after you open your database so you can
+guarantee that they exist for future transactions.
+
+To delete a bucket, simply call the `Tx.DeleteBucket()` function.
+
+
+### Using key/value pairs
+
+To save a key/value pair to a bucket, use the `Bucket.Put()` function:
+
+```go
+db.Update(func(tx *bolt.Tx) error {
+	b := tx.Bucket([]byte("MyBucket"))
+	err := b.Put([]byte("answer"), []byte("42"))
+	return err
+})
+```
+
+This will set the value of the `"answer"` key to `"42"` in the `MyBucket`
+bucket. To retrieve this value, we can use the `Bucket.Get()` function:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	b := tx.Bucket([]byte("MyBucket"))
+	v := b.Get([]byte("answer"))
+	fmt.Printf("The answer is: %s\n", v)
+	return nil
+})
+```
+
+The `Get()` function does not return an error because its operation is
+guaranteed to work (unless there is some kind of system failure). If the key
+exists then it will return its byte slice value. If it doesn't exist then it
+will return `nil`. It's important to note that you can have a zero-length value
+set to a key which is different than the key not existing.
+
+Use the `Bucket.Delete()` function to delete a key from the bucket.
+
+Please note that values returned from `Get()` are only valid while the
+transaction is open. If you need to use a value outside of the transaction
+then you must use `copy()` to copy it to another byte slice.
+
+
+### Iterating over keys
+
+Bolt stores its keys in byte-sorted order within a bucket. This makes sequential
+iteration over these keys extremely fast. To iterate over keys we'll use a
+`Cursor`:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	b := tx.Bucket([]byte("MyBucket"))
+	c := b.Cursor()
+
+	for k, v := c.First(); k != nil; k, v = c.Next() {
+		fmt.Printf("key=%s, value=%s\n", k, v)
+	}
+
+	return nil
+})
+```
+
+The cursor allows you to move to a specific point in the list of keys and move
+forward or backward through the keys one at a time.
+
+The following functions are available on the cursor:
+
+```
+First()  Move to the first key.
+Last()   Move to the last key.
+Seek()   Move to a specific key.
+Next()   Move to the next key.
+Prev()   Move to the previous key.
+```
+
+When you have iterated to the end of the cursor then `Next()` will return `nil`.
+You must seek to a position using `First()`, `Last()`, or `Seek()` before
+calling `Next()` or `Prev()`. If you do not seek to a position then these
+functions will return `nil`.
+
+
+#### Prefix scans
+
+To iterate over a key prefix, you can combine `Seek()` and `bytes.HasPrefix()`:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	c := tx.Bucket([]byte("MyBucket")).Cursor()
+
+	prefix := []byte("1234")
+	for k, v := c.Seek(prefix); bytes.HasPrefix(k, prefix); k, v = c.Next() {
+		fmt.Printf("key=%s, value=%s\n", k, v)
+	}
+
+	return nil
+})
+```
+
+#### Range scans
+
+Another common use case is scanning over a range such as a time range. If you
+use a sortable time encoding such as RFC3339 then you can query a specific
+date range like this:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	// Assume our events bucket has RFC3339 encoded time keys.
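+	// (Such keys could come from time.Time.Format(time.RFC3339) on UTC
+	// values; for UTC timestamps in this form, byte-wise order matches
+	// chronological order, which is what lets Seek/Next walk the range.)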
+	c := tx.Bucket([]byte("Events")).Cursor()
+
+	// Our time range spans the 90's decade.
+	min := []byte("1990-01-01T00:00:00Z")
+	max := []byte("2000-01-01T00:00:00Z")
+
+	// Iterate over the 90's.
+	for k, v := c.Seek(min); k != nil && bytes.Compare(k, max) <= 0; k, v = c.Next() {
+		fmt.Printf("%s: %s\n", k, v)
+	}
+
+	return nil
+})
+```
+
+
+#### ForEach()
+
+You can also use the function `ForEach()` if you know you'll be iterating over
+all the keys in a bucket:
+
+```go
+db.View(func(tx *bolt.Tx) error {
+	b := tx.Bucket([]byte("MyBucket"))
+	b.ForEach(func(k, v []byte) error {
+		fmt.Printf("key=%s, value=%s\n", k, v)
+		return nil
+	})
+	return nil
+})
+```
+
+
+### Nested buckets
+
+You can also store a bucket in a key to create nested buckets. The API is the
+same as the bucket management API on the `Tx` object:
+
+```go
+func (*Bucket) CreateBucket(key []byte) (*Bucket, error)
+func (*Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error)
+func (*Bucket) DeleteBucket(key []byte) error
+```
+
+
+### Database backups
+
+Bolt is a single file so it's easy to back up. You can use the `Tx.WriteTo()`
+function to write a consistent view of the database to a writer. If you call
+this from a read-only transaction, it will perform a hot backup and not block
+your other database reads and writes. It will also use `O_DIRECT` when available
+to prevent page cache thrashing.
+
+One common use case is to back up over HTTP so you can use tools like `cURL` to
+do database backups:
+
+```go
+func BackupHandleFunc(w http.ResponseWriter, req *http.Request) {
+	err := db.View(func(tx *bolt.Tx) error {
+		w.Header().Set("Content-Type", "application/octet-stream")
+		w.Header().Set("Content-Disposition", `attachment; filename="my.db"`)
+		w.Header().Set("Content-Length", strconv.Itoa(int(tx.Size())))
+		_, err := tx.WriteTo(w)
+		return err
+	})
+	if err != nil {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+	}
+}
+```
+
+Then you can back up using this command:
+
+```sh
+$ curl http://localhost/backup > my.db
+```
+
+Or you can open your browser to `http://localhost/backup` and it will download
+automatically.
+
+If you want to back up to another file you can use the `Tx.CopyFile()` helper
+function.
+
+
+### Statistics
+
+The database keeps a running count of many of the internal operations it
+performs so you can better understand what's going on. By grabbing a snapshot
+of these stats at two points in time we can see what operations were performed
+in that time range.
+
+For example, we could start a goroutine to log stats every 10 seconds:
+
+```go
+go func() {
+	// Grab the initial stats.
+	prev := db.Stats()
+
+	for {
+		// Wait for 10s.
+		time.Sleep(10 * time.Second)
+
+		// Grab the current stats and diff them.
+		stats := db.Stats()
+		diff := stats.Sub(&prev)
+
+		// Encode stats to JSON and print to STDERR.
+		json.NewEncoder(os.Stderr).Encode(diff)
+
+		// Save stats for the next loop.
+		prev = stats
+	}
+}()
+```
+
+It's also useful to pipe these stats to a service such as statsd for monitoring
+or to provide an HTTP endpoint that will perform a fixed-length sample.
+
+
+## Resources
+
+For more information on getting started with Bolt, check out the following articles:
+
+* [Intro to BoltDB: Painless Performant Persistence](http://npf.io/2014/07/intro-to-boltdb-painless-performant-persistence/) by [Nate Finch](https://github.com/natefinch).
+* [Bolt -- an embedded key/value database for Go](https://www.progville.com/go/bolt-embedded-db-golang/) by Progville
+
+
+## Comparison with other databases
+
+### Postgres, MySQL, & other relational databases
+
+Relational databases structure data into rows and are only accessible through
+the use of SQL. This approach provides flexibility in how you store and query
+your data but also incurs overhead in parsing and planning SQL statements. Bolt
+accesses all data by a byte slice key. This makes Bolt fast to read and write
+data by key but provides no built-in support for joining values together.
+
+Most relational databases (with the exception of SQLite) are standalone servers
+that run separately from your application. This gives your systems
+flexibility to connect multiple application servers to a single database
+server but also adds overhead in serializing and transporting data over the
+network. Bolt runs as a library included in your application so all data access
+has to go through your application's process. This brings data closer to your
+application but limits multi-process access to the data.
+
+
+### LevelDB, RocksDB
+
+LevelDB and its derivatives (RocksDB, HyperLevelDB) are similar to Bolt in that
+they are libraries bundled into the application, however, their underlying
+structure is a log-structured merge-tree (LSM tree). An LSM tree optimizes
+random writes by using a write-ahead log and multi-tiered, sorted files called
+SSTables. Bolt uses a B+tree internally and only a single file. Both approaches
+have trade-offs.
+
+If you require a high random write throughput (>10,000 w/sec) or you need to use
+spinning disks then LevelDB could be a good choice. If your application is
+read-heavy or does a lot of range scans then Bolt could be a good choice.
+
+One other important consideration is that LevelDB does not have transactions.
+It supports batch writing of key/value pairs and it supports read snapshots
+but it will not give you the ability to do a compare-and-swap operation safely.
+Bolt supports fully serializable ACID transactions.
+
+
+### LMDB
+
+Bolt was originally a port of LMDB so it is architecturally similar. Both use
+a B+tree, have ACID semantics with fully serializable transactions, and support
+lock-free MVCC using a single writer and multiple readers.
+
+The two projects have somewhat diverged. LMDB heavily focuses on raw performance
+while Bolt has focused on simplicity and ease of use. For example, LMDB allows
+several unsafe actions such as direct writes for the sake of performance. Bolt
+opts to disallow actions which can leave the database in a corrupted state. The
+only exception to this in Bolt is `DB.NoSync`.
+
+There are also a few differences in API. LMDB requires a maximum mmap size when
+opening an `mdb_env` whereas Bolt will handle incremental mmap resizing
+automatically. LMDB overloads the getter and setter functions with multiple
+flags whereas Bolt splits these specialized cases into their own functions.
+
+
+## Caveats & Limitations
+
+It's important to pick the right tool for the job and Bolt is no exception.
+Here are a few things to note when evaluating and using Bolt:
+
+* Bolt is good for read-intensive workloads. Sequential write performance is
+  also fast but random writes can be slow. You can add a write-ahead log or
+  [transaction coalescer](https://github.com/boltdb/coalescer) in front of Bolt
+  to mitigate this issue.
+
+* Bolt uses a B+tree internally so there can be a lot of random page access.
+  SSDs provide a significant performance boost over spinning disks.
+
+* Try to avoid long running read transactions. Bolt uses copy-on-write so
+  old pages cannot be reclaimed while an old transaction is using them.
+
+* Byte slices returned from Bolt are only valid during a transaction. Once the
+  transaction has been committed or rolled back then the memory they point to
+  can be reused by a new page or can be unmapped from virtual memory and you'll
+  see an `unexpected fault address` panic when accessing it.
+
+* Be careful when using `Bucket.FillPercent`. Setting a high fill percent for
+  buckets that have random inserts will cause your database to have very poor
+  page utilization.
+
+* Use larger buckets in general. Smaller buckets cause poor page utilization
+  once they become larger than the page size (typically 4KB).
+
+* Bulk loading a lot of random writes into a new bucket can be slow as the
+  page will not split until the transaction is committed. Randomly inserting
+  more than 100,000 key/value pairs into a single new bucket in a single
+  transaction is not advised.
+
+* Bolt uses a memory-mapped file so the underlying operating system handles the
+  caching of the data. Typically, the OS will cache as much of the file as it
+  can in memory and will release memory as needed to other processes. This means
+  that Bolt can show very high memory usage when working with large databases.
+  However, this is expected and the OS will release memory as needed. Bolt can
+  handle databases much larger than the available physical RAM.
+
+* Because of the way pages are laid out on disk, Bolt cannot truncate data files
+  and return free pages back to the disk. Instead, Bolt maintains a free list
+  of unused pages within its data file. These free pages can be reused by later
+  transactions. This works well for many use cases as databases generally tend
+  to grow. However, it's important to note that deleting large chunks of data
+  will not allow you to reclaim that space on disk.
+
+  For more information on page allocation, [see this comment][page-allocation].
+
+[page-allocation]: https://github.com/boltdb/bolt/issues/308#issuecomment-74811638
+
+
+## Other Projects Using Bolt
+
+Below is a list of public, open source projects that use Bolt:
+
+* [Operation Go: A Routine Mission](http://gocode.io) - An online programming game for Golang using Bolt for user accounts and a leaderboard.
+* [Bazil](https://github.com/bazillion/bazil) - A file system that lets your data reside where it is most convenient for it to reside.
+* [DVID](https://github.com/janelia-flyem/dvid) - Added Bolt as optional storage engine and testing it against Basho-tuned leveldb.
+* [Skybox Analytics](https://github.com/skybox/skybox) - A standalone funnel analysis tool for web analytics.
+* [Scuttlebutt](https://github.com/benbjohnson/scuttlebutt) - Uses Bolt to store and process all Twitter mentions of GitHub projects.
+* [Wiki](https://github.com/peterhellberg/wiki) - A tiny wiki using Goji, BoltDB and Blackfriday.
+* [ChainStore](https://github.com/nulayer/chainstore) - Simple key-value interface to a variety of storage engines organized as a chain of operations.
+* [MetricBase](https://github.com/msiebuhr/MetricBase) - Single-binary version of Graphite.
+* [Gitchain](https://github.com/gitchain/gitchain) - Decentralized, peer-to-peer Git repositories aka "Git meets Bitcoin".
+* [event-shuttle](https://github.com/sclasen/event-shuttle) - A Unix system service to collect and reliably deliver messages to Kafka.
+* [ipxed](https://github.com/kelseyhightower/ipxed) - Web interface and api for ipxed.
+* [BoltStore](https://github.com/yosssi/boltstore) - Session store using Bolt.
+* [photosite/session](http://godoc.org/bitbucket.org/kardianos/photosite/session) - Sessions for a photo viewing site.
+* [LedisDB](https://github.com/siddontang/ledisdb) - A high performance NoSQL, using Bolt as optional storage.
+* [ipLocator](https://github.com/AndreasBriese/ipLocator) - A fast ip-geo-location-server using bolt with bloom filters.
+* [cayley](https://github.com/google/cayley) - Cayley is an open-source graph database using Bolt as optional backend.
+* [bleve](http://www.blevesearch.com/) - A pure Go search engine similar to ElasticSearch that uses Bolt as the default storage backend.
+* [tentacool](https://github.com/optiflows/tentacool) - REST api server to manage system stuff (IP, DNS, Gateway...) on a linux server.
+* [SkyDB](https://github.com/skydb/sky) - Behavioral analytics database.
+* [Seaweed File System](https://github.com/chrislusf/weed-fs) - Highly scalable distributed key~file system with O(1) disk read.
+* [InfluxDB](http://influxdb.com) - Scalable datastore for metrics, events, and real-time analytics.
+
+If you are using Bolt in a project please send a pull request to add it to the list.
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/batch.go b/Godeps/_workspace/src/github.com/boltdb/bolt/batch.go
new file mode 100644
index 000000000..bef1f4aa2
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/batch.go
@@ -0,0 +1,135 @@
+package bolt
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+	"time"
+)
+
+// Batch calls fn as part of a batch. It behaves similarly to Update,
+// except:
+//
+// 1. concurrent Batch calls can be combined into a single Bolt
+// transaction.
+//
+// 2. the function passed to Batch may be called multiple times,
+// regardless of whether it returns an error or not.
+//
+// This means that Batch function side effects must be idempotent and
+// take permanent effect only after a successful return is seen in
+// the caller.
+//
+// Batch is only useful when there are multiple goroutines calling it.
+func (db *DB) Batch(fn func(*Tx) error) error {
+	errCh := make(chan error, 1)
+
+	db.batchMu.Lock()
+	if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
+		// There is no existing batch, or the existing batch is full; start a new one.
+		db.batch = &batch{
+			db: db,
+		}
+		db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
+	}
+	db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
+	if len(db.batch.calls) >= db.MaxBatchSize {
+		// wake up batch, it's ready to run
+		go db.batch.trigger()
+	}
+	db.batchMu.Unlock()
+
+	err := <-errCh
+	if err == trySolo {
+		err = db.Update(fn)
+	}
+	return err
+}
+
+type call struct {
+	fn  func(*Tx) error
+	err chan<- error
+}
+
+type batch struct {
+	db    *DB
+	timer *time.Timer
+	start sync.Once
+	calls []call
+}
+
+// trigger runs the batch if it hasn't already been run.
+func (b *batch) trigger() {
+	b.start.Do(b.run)
+}
+
+// run performs the transactions in the batch and communicates results
+// back to DB.Batch.
+func (b *batch) run() {
+	b.db.batchMu.Lock()
+	b.timer.Stop()
+	// Make sure no new work is added to this batch, but don't break
+	// other batches.
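+	// (Another goroutine may already have installed a fresh batch in
+	// db.batch, so only clear the pointer if it still refers to this one.)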
+	if b.db.batch == b {
+		b.db.batch = nil
+	}
+	b.db.batchMu.Unlock()
+
+retry:
+	for len(b.calls) > 0 {
+		var failIdx = -1
+		err := b.db.Update(func(tx *Tx) error {
+			for i, c := range b.calls {
+				if err := safelyCall(c.fn, tx); err != nil {
+					failIdx = i
+					return err
+				}
+			}
+			return nil
+		})
+
+		if failIdx >= 0 {
+			// take the failing transaction out of the batch. it's
+			// safe to shorten b.calls here because db.batch no longer
+			// points to us, so no new calls can be appended.
+			c := b.calls[failIdx]
+			b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
+			// tell the submitter to re-run it solo; continue with the rest of the batch
+			c.err <- trySolo
+			continue retry
+		}
+
+		// pass success, or bolt internal errors, to all callers
+		for _, c := range b.calls {
+			if c.err != nil {
+				c.err <- err
+			}
+		}
+		break retry
+	}
+}
+
+// trySolo is a special sentinel error value used for signaling that a
+// transaction function should be re-run. It should never be seen by
+// callers.
+var trySolo = errors.New("batch function returned an error and should be re-run solo")
+
+type panicked struct {
+	reason interface{}
+}
+
+func (p panicked) Error() string {
+	if err, ok := p.reason.(error); ok {
+		return err.Error()
+	}
+	return fmt.Sprintf("panic: %v", p.reason)
+}
+
+func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
+	defer func() {
+		if p := recover(); p != nil {
+			err = panicked{p}
+		}
+	}()
+	return fn(tx)
+}
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/batch_benchmark_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/batch_benchmark_test.go
new file mode 100644
index 000000000..f6b8c1834
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/batch_benchmark_test.go
@@ -0,0 +1,170 @@
+package bolt_test
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"hash/fnv"
+	"sync"
+	"testing"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+func validateBatchBench(b *testing.B, db *TestDB) {
+	var rollback = errors.New("sentinel error to cause rollback")
+	validate := func(tx *bolt.Tx) error {
+		bucket := tx.Bucket([]byte("bench"))
+		h := fnv.New32a()
+		buf := make([]byte, 4)
+		for id := uint32(0); id < 1000; id++ {
+			binary.LittleEndian.PutUint32(buf, id)
+			h.Reset()
+			h.Write(buf[:])
+			k := h.Sum(nil)
+			v := bucket.Get(k)
+			if v == nil {
+				b.Errorf("not found id=%d key=%x", id, k)
+				continue
+			}
+			if g, e := v, []byte("filler"); !bytes.Equal(g, e) {
+				b.Errorf("bad value for id=%d key=%x: %s != %q", id, k, g, e)
+			}
+			if err := bucket.Delete(k); err != nil {
+				return err
+			}
+		}
+		// should be empty now
+		c := bucket.Cursor()
+		for k, v := c.First(); k != nil; k, v = c.Next() {
+			b.Errorf("unexpected key: %x = %q", k, v)
+		}
+		return rollback
+	}
+	if err := db.Update(validate); err != nil && err != rollback {
+		b.Error(err)
+	}
+}
+
+func BenchmarkDBBatchAutomatic(b *testing.B) {
+	db := NewTestDB()
+	defer db.Close()
+	db.MustCreateBucket([]byte("bench"))
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		start := make(chan struct{})
+		var wg sync.WaitGroup
+
+		for round := 0; round < 1000; round++ {
+			wg.Add(1)
+
+			go func(id uint32) {
+				defer wg.Done()
+				<-start
+
+				h := fnv.New32a()
+				buf := make([]byte, 4)
+				binary.LittleEndian.PutUint32(buf, id)
+				h.Write(buf[:])
+				k := h.Sum(nil)
+				insert := func(tx *bolt.Tx) error {
+					b := tx.Bucket([]byte("bench"))
+					return b.Put(k, []byte("filler"))
+				}
+				if err := db.Batch(insert); err != nil {
+					b.Error(err)
+					return
+				}
+			}(uint32(round))
+		}
+		close(start)
wg.Wait() + } + + b.StopTimer() + validateBatchBench(b, db) +} + +func BenchmarkDBBatchSingle(b *testing.B) { + db := NewTestDB() + defer db.Close() + db.MustCreateBucket([]byte("bench")) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + start := make(chan struct{}) + var wg sync.WaitGroup + + for round := 0; round < 1000; round++ { + wg.Add(1) + go func(id uint32) { + defer wg.Done() + <-start + + h := fnv.New32a() + buf := make([]byte, 4) + binary.LittleEndian.PutUint32(buf, id) + h.Write(buf[:]) + k := h.Sum(nil) + insert := func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("bench")) + return b.Put(k, []byte("filler")) + } + if err := db.Update(insert); err != nil { + b.Error(err) + return + } + }(uint32(round)) + } + close(start) + wg.Wait() + } + + b.StopTimer() + validateBatchBench(b, db) +} + +func BenchmarkDBBatchManual10x100(b *testing.B) { + db := NewTestDB() + defer db.Close() + db.MustCreateBucket([]byte("bench")) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + start := make(chan struct{}) + var wg sync.WaitGroup + + for major := 0; major < 10; major++ { + wg.Add(1) + go func(id uint32) { + defer wg.Done() + <-start + + insert100 := func(tx *bolt.Tx) error { + h := fnv.New32a() + buf := make([]byte, 4) + for minor := uint32(0); minor < 100; minor++ { + binary.LittleEndian.PutUint32(buf, uint32(id*100+minor)) + h.Reset() + h.Write(buf[:]) + k := h.Sum(nil) + b := tx.Bucket([]byte("bench")) + if err := b.Put(k, []byte("filler")); err != nil { + return err + } + } + return nil + } + if err := db.Update(insert100); err != nil { + b.Fatal(err) + } + }(uint32(major)) + } + close(start) + wg.Wait() + } + + b.StopTimer() + validateBatchBench(b, db) +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/batch_example_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/batch_example_test.go new file mode 100644 index 000000000..b0d00935a --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/batch_example_test.go @@ -0,0 +1,148 @@ +package bolt_test + +import ( + "encoding/binary" + "fmt" + "io/ioutil" + "log" + "math/rand" + "net/http" + "net/http/httptest" + "os" + + "github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt" +) + +// Set this to see how the counts are actually updated. +const verbose = false + +// Counter updates a counter in Bolt for every URL path requested. +type counter struct { + db *bolt.DB +} + +func (c counter) ServeHTTP(rw http.ResponseWriter, req *http.Request) { + // Communicates the new count from a successful database + // transaction. + var result uint64 + + increment := func(tx *bolt.Tx) error { + b, err := tx.CreateBucketIfNotExists([]byte("hits")) + if err != nil { + return err + } + key := []byte(req.URL.String()) + // Decode handles key not found for us. + count := decode(b.Get(key)) + 1 + b.Put(key, encode(count)) + // All good, communicate new count. + result = count + return nil + } + if err := c.db.Batch(increment); err != nil { + http.Error(rw, err.Error(), 500) + return + } + + if verbose { + log.Printf("server: %s: %d", req.URL.String(), result) + } + + rw.Header().Set("Content-Type", "application/octet-stream") + fmt.Fprintf(rw, "%d\n", result) +} + +func client(id int, base string, paths []string) error { + // Process paths in random order. 
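+	// (Seeding with the client id keeps each client's path order
+	// deterministic from run to run while still differing between clients.)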
+ rng := rand.New(rand.NewSource(int64(id))) + permutation := rng.Perm(len(paths)) + + for i := range paths { + path := paths[permutation[i]] + resp, err := http.Get(base + path) + if err != nil { + return err + } + defer resp.Body.Close() + buf, err := ioutil.ReadAll(resp.Body) + if err != nil { + return err + } + if verbose { + log.Printf("client: %s: %s", path, buf) + } + } + return nil +} + +func ExampleDB_Batch() { + // Open the database. + db, _ := bolt.Open(tempfile(), 0666, nil) + defer os.Remove(db.Path()) + defer db.Close() + + // Start our web server + count := counter{db} + srv := httptest.NewServer(count) + defer srv.Close() + + // Decrease the batch size to make things more interesting. + db.MaxBatchSize = 3 + + // Get every path multiple times concurrently. + const clients = 10 + paths := []string{ + "/foo", + "/bar", + "/baz", + "/quux", + "/thud", + "/xyzzy", + } + errors := make(chan error, clients) + for i := 0; i < clients; i++ { + go func(id int) { + errors <- client(id, srv.URL, paths) + }(i) + } + // Check all responses to make sure there's no error. + for i := 0; i < clients; i++ { + if err := <-errors; err != nil { + fmt.Printf("client error: %v", err) + return + } + } + + // Check the final result + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("hits")) + c := b.Cursor() + for k, v := c.First(); k != nil; k, v = c.Next() { + fmt.Printf("hits to %s: %d\n", k, decode(v)) + } + return nil + }) + + // Output: + // hits to /bar: 10 + // hits to /baz: 10 + // hits to /foo: 10 + // hits to /quux: 10 + // hits to /thud: 10 + // hits to /xyzzy: 10 +} + +// encode marshals a counter. +func encode(n uint64) []byte { + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, n) + return buf +} + +// decode unmarshals a counter. Nil buffers are decoded as 0. +func decode(buf []byte) uint64 { + if buf == nil { + return 0 + } + return binary.BigEndian.Uint64(buf) +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/batch_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/batch_test.go new file mode 100644 index 000000000..05230a9aa --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/batch_test.go @@ -0,0 +1,167 @@ +package bolt_test + +import ( + "testing" + "time" + + "github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt" +) + +// Ensure two functions can perform updates in a single batch. +func TestDB_Batch(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.MustCreateBucket([]byte("widgets")) + + // Iterate over multiple updates in separate goroutines. + n := 2 + ch := make(chan error) + for i := 0; i < n; i++ { + go func(i int) { + ch <- db.Batch(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Put(u64tob(uint64(i)), []byte{}) + }) + }(i) + } + + // Check all responses to make sure there's no error. + for i := 0; i < n; i++ { + if err := <-ch; err != nil { + t.Fatal(err) + } + } + + // Ensure data is correct. + db.MustView(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for i := 0; i < n; i++ { + if v := b.Get(u64tob(uint64(i))); v == nil { + t.Errorf("key not found: %d", i) + } + } + return nil + }) +} + +func TestDB_Batch_Panic(t *testing.T) { + db := NewTestDB() + defer db.Close() + + var sentinel int + var bork = &sentinel + var problem interface{} + var err error + + // Execute a function inside a batch that panics. 
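+	// Batch recovers this panic internally, retries the function solo via
+	// db.Update, and the panic from that solo run is what the deferred
+	// recover below observes.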
+ func() { + defer func() { + if p := recover(); p != nil { + problem = p + } + }() + err = db.Batch(func(tx *bolt.Tx) error { + panic(bork) + }) + }() + + // Verify there is no error. + if g, e := err, error(nil); g != e { + t.Fatalf("wrong error: %v != %v", g, e) + } + // Verify the panic was captured. + if g, e := problem, bork; g != e { + t.Fatalf("wrong error: %v != %v", g, e) + } +} + +func TestDB_BatchFull(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.MustCreateBucket([]byte("widgets")) + + const size = 3 + // buffered so we never leak goroutines + ch := make(chan error, size) + put := func(i int) { + ch <- db.Batch(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Put(u64tob(uint64(i)), []byte{}) + }) + } + + db.MaxBatchSize = size + // high enough to never trigger here + db.MaxBatchDelay = 1 * time.Hour + + go put(1) + go put(2) + + // Give the batch a chance to exhibit bugs. + time.Sleep(10 * time.Millisecond) + + // not triggered yet + select { + case <-ch: + t.Fatalf("batch triggered too early") + default: + } + + go put(3) + + // Check all responses to make sure there's no error. + for i := 0; i < size; i++ { + if err := <-ch; err != nil { + t.Fatal(err) + } + } + + // Ensure data is correct. + db.MustView(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for i := 1; i <= size; i++ { + if v := b.Get(u64tob(uint64(i))); v == nil { + t.Errorf("key not found: %d", i) + } + } + return nil + }) +} + +func TestDB_BatchTime(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.MustCreateBucket([]byte("widgets")) + + const size = 1 + // buffered so we never leak goroutines + ch := make(chan error, size) + put := func(i int) { + ch <- db.Batch(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Put(u64tob(uint64(i)), []byte{}) + }) + } + + db.MaxBatchSize = 1000 + db.MaxBatchDelay = 0 + + go put(1) + + // Batch must trigger by time alone. + + // Check all responses to make sure there's no error. + for i := 0; i < size; i++ { + if err := <-ch; err != nil { + t.Fatal(err) + } + } + + // Ensure data is correct. + db.MustView(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for i := 1; i <= size; i++ { + if v := b.Get(u64tob(uint64(i))); v == nil { + t.Errorf("key not found: %d", i) + } + } + return nil + }) +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_386.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_386.go new file mode 100644 index 000000000..e659bfb91 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_386.go @@ -0,0 +1,7 @@ +package bolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0x7FFFFFFF // 2GB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0xFFFFFFF diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_amd64.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_amd64.go new file mode 100644 index 000000000..cca6b7eb7 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_amd64.go @@ -0,0 +1,7 @@ +package bolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0xFFFFFFFFFFFF // 256TB + +// maxAllocSize is the size used when creating array pointers. 
+const maxAllocSize = 0x7FFFFFFF diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_arm.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_arm.go new file mode 100644 index 000000000..e659bfb91 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_arm.go @@ -0,0 +1,7 @@ +package bolt + +// maxMapSize represents the largest mmap size supported by Bolt. +const maxMapSize = 0x7FFFFFFF // 2GB + +// maxAllocSize is the size used when creating array pointers. +const maxAllocSize = 0xFFFFFFF diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_linux.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_linux.go new file mode 100644 index 000000000..e9d1c907b --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_linux.go @@ -0,0 +1,12 @@ +package bolt + +import ( + "syscall" +) + +var odirect = syscall.O_DIRECT + +// fdatasync flushes written data to a file descriptor. +func fdatasync(db *DB) error { + return syscall.Fdatasync(int(db.file.Fd())) +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_openbsd.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_openbsd.go new file mode 100644 index 000000000..7c1bef1a4 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_openbsd.go @@ -0,0 +1,29 @@ +package bolt + +import ( + "syscall" + "unsafe" +) + +const ( + msAsync = 1 << iota // perform asynchronous writes + msSync // perform synchronous writes + msInvalidate // invalidate cached data +) + +var odirect int + +func msync(db *DB) error { + _, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(db.data)), uintptr(db.datasz), msInvalidate) + if errno != 0 { + return errno + } + return nil +} + +func fdatasync(db *DB) error { + if db.data != nil { + return msync(db) + } + return db.file.Sync() +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_test.go new file mode 100644 index 000000000..b7bea1fc5 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_test.go @@ -0,0 +1,36 @@ +package bolt_test + +import ( + "fmt" + "path/filepath" + "reflect" + "runtime" + "testing" +) + +// assert fails the test if the condition is false. +func assert(tb testing.TB, condition bool, msg string, v ...interface{}) { + if !condition { + _, file, line, _ := runtime.Caller(1) + fmt.Printf("\033[31m%s:%d: "+msg+"\033[39m\n\n", append([]interface{}{filepath.Base(file), line}, v...)...) + tb.FailNow() + } +} + +// ok fails the test if an err is not nil. +func ok(tb testing.TB, err error) { + if err != nil { + _, file, line, _ := runtime.Caller(1) + fmt.Printf("\033[31m%s:%d: unexpected error: %s\033[39m\n\n", filepath.Base(file), line, err.Error()) + tb.FailNow() + } +} + +// equals fails the test if exp is not equal to act. +func equals(tb testing.TB, exp, act interface{}) { + if !reflect.DeepEqual(exp, act) { + _, file, line, _ := runtime.Caller(1) + fmt.Printf("\033[31m%s:%d:\n\n\texp: %#v\n\n\tgot: %#v\033[39m\n\n", filepath.Base(file), line, exp, act) + tb.FailNow() + } +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_unix.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_unix.go new file mode 100644 index 000000000..e222cfdcc --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_unix.go @@ -0,0 +1,80 @@ +// +build !windows,!plan9 + +package bolt + +import ( + "fmt" + "os" + "syscall" + "time" + "unsafe" +) + +// flock acquires an advisory lock on a file descriptor. 
+func flock(f *os.File, timeout time.Duration) error {
+	var t time.Time
+	for {
+		// If we're beyond our timeout then return an error.
+		// This can only occur after we've attempted a flock once.
+		if t.IsZero() {
+			t = time.Now()
+		} else if timeout > 0 && time.Since(t) > timeout {
+			return ErrTimeout
+		}
+
+		// Otherwise attempt to obtain an exclusive lock.
+		err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
+		if err == nil {
+			return nil
+		} else if err != syscall.EWOULDBLOCK {
+			return err
+		}
+
+		// Wait for a bit and try again.
+		time.Sleep(50 * time.Millisecond)
+	}
+}
+
+// funlock releases an advisory lock on a file descriptor.
+func funlock(f *os.File) error {
+	return syscall.Flock(int(f.Fd()), syscall.LOCK_UN)
+}
+
+// mmap memory maps a DB's data file.
+func mmap(db *DB, sz int) error {
+	// Truncate and fsync to ensure file size metadata is flushed.
+	// https://github.com/boltdb/bolt/issues/284
+	if err := db.file.Truncate(int64(sz)); err != nil {
+		return fmt.Errorf("file resize error: %s", err)
+	}
+	if err := db.file.Sync(); err != nil {
+		return fmt.Errorf("file sync error: %s", err)
+	}
+
+	// Map the data file to memory.
+	b, err := syscall.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED)
+	if err != nil {
+		return err
+	}
+
+	// Save the original byte slice and convert to a byte array pointer.
+	db.dataref = b
+	db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0]))
+	db.datasz = sz
+	return nil
+}
+
+// munmap unmaps a DB's data file from memory.
+func munmap(db *DB) error {
+	// Ignore the unmap if we have no mapped data.
+	if db.dataref == nil {
+		return nil
+	}
+
+	// Unmap using the original byte slice.
+	err := syscall.Munmap(db.dataref)
+	db.dataref = nil
+	db.data = nil
+	db.datasz = 0
+	return err
+}
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_windows.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_windows.go
new file mode 100644
index 000000000..c8539d40b
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bolt_windows.go
@@ -0,0 +1,74 @@
+package bolt
+
+import (
+	"fmt"
+	"os"
+	"syscall"
+	"time"
+	"unsafe"
+)
+
+var odirect int
+
+// fdatasync flushes written data to a file descriptor.
+func fdatasync(db *DB) error {
+	return db.file.Sync()
+}
+
+// flock acquires an advisory lock on a file descriptor.
+func flock(f *os.File, _ time.Duration) error {
+	return nil
+}
+
+// funlock releases an advisory lock on a file descriptor.
+func funlock(f *os.File) error {
+	return nil
+}
+
+// mmap memory maps a DB's data file.
+// Based on: https://github.com/edsrzf/mmap-go
+func mmap(db *DB, sz int) error {
+	// Truncate the database to the size of the mmap.
+	if err := db.file.Truncate(int64(sz)); err != nil {
+		return fmt.Errorf("truncate: %s", err)
+	}
+
+	// Open a file mapping handle.
+	sizehi := uint32(sz >> 32)
+	sizelo := uint32(sz) & 0xffffffff
+	h, errno := syscall.CreateFileMapping(syscall.Handle(db.file.Fd()), nil, syscall.PAGE_READONLY, sizehi, sizelo, nil)
+	if h == 0 {
+		return os.NewSyscallError("CreateFileMapping", errno)
+	}
+
+	// Create the memory map.
+	addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(sz))
+	if addr == 0 {
+		return os.NewSyscallError("MapViewOfFile", errno)
+	}
+
+	// Close mapping handle.
+	if err := syscall.CloseHandle(syscall.Handle(h)); err != nil {
+		return os.NewSyscallError("CloseHandle", err)
+	}
+
+	// Convert to a byte array.
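+	// (The *[maxMapSize]byte conversion just reinterprets the mapped address
+	// as a huge fixed-size array; only the first sz bytes, recorded in
+	// db.datasz, are actually mapped and safe to touch.)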
+ db.data = ((*[maxMapSize]byte)(unsafe.Pointer(addr))) + db.datasz = sz + + return nil +} + +// munmap unmaps a pointer from a file. +// Based on: https://github.com/edsrzf/mmap-go +func munmap(db *DB) error { + if db.data == nil { + return nil + } + + addr := (uintptr)(unsafe.Pointer(&db.data[0])) + if err := syscall.UnmapViewOfFile(addr); err != nil { + return os.NewSyscallError("UnmapViewOfFile", err) + } + return nil +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/boltsync_unix.go b/Godeps/_workspace/src/github.com/boltdb/bolt/boltsync_unix.go new file mode 100644 index 000000000..8db89776f --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/boltsync_unix.go @@ -0,0 +1,10 @@ +// +build !windows,!plan9,!linux,!openbsd + +package bolt + +var odirect int + +// fdatasync flushes written data to a file descriptor. +func fdatasync(db *DB) error { + return db.file.Sync() +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bucket.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bucket.go new file mode 100644 index 000000000..676699210 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bucket.go @@ -0,0 +1,743 @@ +package bolt + +import ( + "bytes" + "fmt" + "unsafe" +) + +const ( + // MaxKeySize is the maximum length of a key, in bytes. + MaxKeySize = 32768 + + // MaxValueSize is the maximum length of a value, in bytes. + MaxValueSize = 4294967295 +) + +const ( + maxUint = ^uint(0) + minUint = 0 + maxInt = int(^uint(0) >> 1) + minInt = -maxInt - 1 +) + +const bucketHeaderSize = int(unsafe.Sizeof(bucket{})) + +const ( + minFillPercent = 0.1 + maxFillPercent = 1.0 +) + +// DefaultFillPercent is the percentage that split pages are filled. +// This value can be changed by setting Bucket.FillPercent. +const DefaultFillPercent = 0.5 + +// Bucket represents a collection of key/value pairs inside the database. +type Bucket struct { + *bucket + tx *Tx // the associated transaction + buckets map[string]*Bucket // subbucket cache + page *page // inline page reference + rootNode *node // materialized node for the root page. + nodes map[pgid]*node // node cache + + // Sets the threshold for filling nodes when they split. By default, + // the bucket will fill to 50% but it can be useful to increase this + // amount if you know that your write workloads are mostly append-only. + // + // This is non-persisted across transactions so it must be set in every Tx. + FillPercent float64 +} + +// bucket represents the on-file representation of a bucket. +// This is stored as the "value" of a bucket key. If the bucket is small enough, +// then its root page can be stored inline in the "value", after the bucket +// header. In the case of inline buckets, the "root" will be 0. +type bucket struct { + root pgid // page id of the bucket's root-level page + sequence uint64 // monotonically incrementing, used by NextSequence() +} + +// newBucket returns a new bucket associated with a transaction. +func newBucket(tx *Tx) Bucket { + var b = Bucket{tx: tx, FillPercent: DefaultFillPercent} + if tx.writable { + b.buckets = make(map[string]*Bucket) + b.nodes = make(map[pgid]*node) + } + return b +} + +// Tx returns the tx of the bucket. +func (b *Bucket) Tx() *Tx { + return b.tx +} + +// Root returns the root of the bucket. +func (b *Bucket) Root() pgid { + return b.root +} + +// Writable returns whether the bucket is writable. +func (b *Bucket) Writable() bool { + return b.tx.writable +} + +// Cursor creates a cursor associated with the bucket. 
+// The cursor is only valid as long as the transaction is open.
+// Do not use a cursor after the transaction is closed.
+func (b *Bucket) Cursor() *Cursor {
+	// Update transaction statistics.
+	b.tx.stats.CursorCount++
+
+	// Allocate and return a cursor.
+	return &Cursor{
+		bucket: b,
+		stack:  make([]elemRef, 0),
+	}
+}
+
+// Bucket retrieves a nested bucket by name.
+// Returns nil if the bucket does not exist.
+func (b *Bucket) Bucket(name []byte) *Bucket {
+	if b.buckets != nil {
+		if child := b.buckets[string(name)]; child != nil {
+			return child
+		}
+	}
+
+	// Move cursor to key.
+	c := b.Cursor()
+	k, v, flags := c.seek(name)
+
+	// Return nil if the key doesn't exist or it is not a bucket.
+	if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
+		return nil
+	}
+
+	// Otherwise create a bucket and cache it.
+	var child = b.openBucket(v)
+	if b.buckets != nil {
+		b.buckets[string(name)] = child
+	}
+
+	return child
+}
+
+// Helper method that re-interprets a sub-bucket value
+// from a parent into a Bucket.
+func (b *Bucket) openBucket(value []byte) *Bucket {
+	var child = newBucket(b.tx)
+
+	// If this is a writable transaction then we need to copy the bucket entry.
+	// Read-only transactions can point directly at the mmap entry.
+	if b.tx.writable {
+		child.bucket = &bucket{}
+		*child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
+	} else {
+		child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
+	}
+
+	// Save a reference to the inline page if the bucket is inline.
+	if child.root == 0 {
+		child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
+	}
+
+	return &child
+}
+
+// CreateBucket creates a new bucket at the given key and returns the new bucket.
+// Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long.
+func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
+	if b.tx.db == nil {
+		return nil, ErrTxClosed
+	} else if !b.tx.writable {
+		return nil, ErrTxNotWritable
+	} else if len(key) == 0 {
+		return nil, ErrBucketNameRequired
+	}
+
+	// Move cursor to correct position.
+	c := b.Cursor()
+	k, _, flags := c.seek(key)
+
+	// Return an error if there is an existing key.
+	if bytes.Equal(key, k) {
+		if (flags & bucketLeafFlag) != 0 {
+			return nil, ErrBucketExists
+		} else {
+			return nil, ErrIncompatibleValue
+		}
+	}
+
+	// Create empty, inline bucket.
+	var bucket = Bucket{
+		bucket:      &bucket{},
+		rootNode:    &node{isLeaf: true},
+		FillPercent: DefaultFillPercent,
+	}
+	var value = bucket.write()
+
+	// Insert into node.
+	key = cloneBytes(key)
+	c.node().put(key, key, value, 0, bucketLeafFlag)
+
+	// Since subbuckets are not allowed on inline buckets, we need to
+	// dereference the inline page, if it exists. This will cause the bucket
+	// to be treated as a regular, non-inline bucket for the rest of the tx.
+	b.page = nil
+
+	return b.Bucket(key), nil
+}
+
+// CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it.
+// Returns an error if the bucket name is blank, or if the bucket name is too long.
+func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
+	child, err := b.CreateBucket(key)
+	if err == ErrBucketExists {
+		return b.Bucket(key), nil
+	} else if err != nil {
+		return nil, err
+	}
+	return child, nil
+}
+
+// DeleteBucket deletes a bucket at the given key.
+// Returns an error if the bucket does not exist, or if the key represents a non-bucket value.
+func (b *Bucket) DeleteBucket(key []byte) error {
+	if b.tx.db == nil {
+		return ErrTxClosed
+	} else if !b.Writable() {
+		return ErrTxNotWritable
+	}
+
+	// Move cursor to correct position.
+	c := b.Cursor()
+	k, _, flags := c.seek(key)
+
+	// Return an error if the bucket doesn't exist or is not a bucket.
+	if !bytes.Equal(key, k) {
+		return ErrBucketNotFound
+	} else if (flags & bucketLeafFlag) == 0 {
+		return ErrIncompatibleValue
+	}
+
+	// Recursively delete all child buckets.
+	child := b.Bucket(key)
+	err := child.ForEach(func(k, v []byte) error {
+		if v == nil {
+			if err := child.DeleteBucket(k); err != nil {
+				return fmt.Errorf("delete bucket: %s", err)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		return err
+	}
+
+	// Remove cached copy.
+	delete(b.buckets, string(key))
+
+	// Release all bucket pages to freelist.
+	child.nodes = nil
+	child.rootNode = nil
+	child.free()
+
+	// Delete the node if we have a matching key.
+	c.node().del(key)
+
+	return nil
+}
+
+// Get retrieves the value for a key in the bucket.
+// Returns a nil value if the key does not exist or if the key is a nested bucket.
+// The returned value is only valid for the life of the transaction.
+func (b *Bucket) Get(key []byte) []byte {
+	k, v, flags := b.Cursor().seek(key)
+
+	// Return nil if this is a bucket.
+	if (flags & bucketLeafFlag) != 0 {
+		return nil
+	}
+
+	// If our target node isn't the same key as what's passed in then return nil.
+	if !bytes.Equal(key, k) {
+		return nil
+	}
+	return v
+}
+
+// Put sets the value for a key in the bucket.
+// If the key exists then its previous value will be overwritten.
+// Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large.
+func (b *Bucket) Put(key []byte, value []byte) error {
+	if b.tx.db == nil {
+		return ErrTxClosed
+	} else if !b.Writable() {
+		return ErrTxNotWritable
+	} else if len(key) == 0 {
+		return ErrKeyRequired
+	} else if len(key) > MaxKeySize {
+		return ErrKeyTooLarge
+	} else if int64(len(value)) > MaxValueSize {
+		return ErrValueTooLarge
+	}
+
+	// Move cursor to correct position.
+	c := b.Cursor()
+	k, _, flags := c.seek(key)
+
+	// Return an error if there is an existing key with a bucket value.
+	if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 {
+		return ErrIncompatibleValue
+	}
+
+	// Insert into node.
+	key = cloneBytes(key)
+	c.node().put(key, key, value, 0, 0)
+
+	return nil
+}
+
+// Delete removes a key from the bucket.
+// If the key does not exist then nothing is done and a nil error is returned.
+// Returns an error if the bucket was created from a read-only transaction.
+func (b *Bucket) Delete(key []byte) error {
+	if b.tx.db == nil {
+		return ErrTxClosed
+	} else if !b.Writable() {
+		return ErrTxNotWritable
+	}
+
+	// Move cursor to correct position.
+	c := b.Cursor()
+	_, _, flags := c.seek(key)
+
+	// Return an error if there is an existing bucket value.
+	if (flags & bucketLeafFlag) != 0 {
+		return ErrIncompatibleValue
+	}
+
+	// Delete the node if we have a matching key.
+	c.node().del(key)
+
+	return nil
+}
+
+// NextSequence returns an autoincrementing integer for the bucket.
+func (b *Bucket) NextSequence() (uint64, error) {
+	if b.tx.db == nil {
+		return 0, ErrTxClosed
+	} else if !b.Writable() {
+		return 0, ErrTxNotWritable
+	}
+
+	// Materialize the root node if it hasn't been already so that the
+	// bucket will be saved during commit.
+	if b.rootNode == nil {
+		_ = b.node(b.root, nil)
+	}
+
+	// Increment and return the sequence.
+	b.bucket.sequence++
+	return b.bucket.sequence, nil
+}
+
+// ForEach executes a function for each key/value pair in a bucket.
+// If the provided function returns an error then the iteration is stopped and
+// the error is returned to the caller.
+func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
+	if b.tx.db == nil {
+		return ErrTxClosed
+	}
+	c := b.Cursor()
+	for k, v := c.First(); k != nil; k, v = c.Next() {
+		if err := fn(k, v); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Stats returns stats on a bucket.
+func (b *Bucket) Stats() BucketStats {
+	var s, subStats BucketStats
+	pageSize := b.tx.db.pageSize
+	s.BucketN += 1
+	if b.root == 0 {
+		s.InlineBucketN += 1
+	}
+	b.forEachPage(func(p *page, depth int) {
+		if (p.flags & leafPageFlag) != 0 {
+			s.KeyN += int(p.count)
+
+			// used totals the used bytes for the page
+			used := pageHeaderSize
+
+			if p.count != 0 {
+				// If page has any elements, add all element headers.
+				used += leafPageElementSize * int(p.count-1)
+
+				// Add all element key, value sizes.
+				// The computation takes advantage of the fact that the position
+				// of the last element's key/value equals the total of the sizes
+				// of all previous elements' keys and values.
+				// It also includes the last element's header.
+				lastElement := p.leafPageElement(p.count - 1)
+				used += int(lastElement.pos + lastElement.ksize + lastElement.vsize)
+			}
+
+			if b.root == 0 {
+				// For inlined bucket just update the inline stats
+				s.InlineBucketInuse += used
+			} else {
+				// For non-inlined bucket update all the leaf stats
+				s.LeafPageN++
+				s.LeafInuse += used
+				s.LeafOverflowN += int(p.overflow)
+
+				// Collect stats from sub-buckets.
+				// Do that by iterating over all element headers
+				// looking for the ones with the bucketLeafFlag.
+				for i := uint16(0); i < p.count; i++ {
+					e := p.leafPageElement(i)
+					if (e.flags & bucketLeafFlag) != 0 {
+						// For any bucket element, open the element value
+						// and recursively call Stats on the contained bucket.
+						subStats.Add(b.openBucket(e.value()).Stats())
+					}
+				}
+			}
+		} else if (p.flags & branchPageFlag) != 0 {
+			s.BranchPageN++
+			lastElement := p.branchPageElement(p.count - 1)
+
+			// used totals the used bytes for the page
+			// Add header and all element headers.
+			used := pageHeaderSize + (branchPageElementSize * int(p.count-1))
+
+			// Add size of all keys and values.
+			// Again, use the fact that the last element's position equals
+			// the total of the key, value sizes of all previous elements.
+			used += int(lastElement.pos + lastElement.ksize)
+			s.BranchInuse += used
+			s.BranchOverflowN += int(p.overflow)
+		}
+
+		// Keep track of maximum page depth.
+		if depth+1 > s.Depth {
+			s.Depth = (depth + 1)
+		}
+	})
+
+	// Alloc stats can be computed from page counts and pageSize.
+	s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize
+	s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize
+
+	// Add the max depth of sub-buckets to get total nested depth.
+	s.Depth += subStats.Depth
+	// Add the stats for all sub-buckets
+	s.Add(subStats)
+	return s
+}
+
+// forEachPage iterates over every page in a bucket, including inline pages.
+func (b *Bucket) forEachPage(fn func(*page, int)) {
+	// If we have an inline page then just use that.
+	if b.page != nil {
+		fn(b.page, 0)
+		return
+	}
+
+	// Otherwise traverse the page hierarchy.
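+	// (tx.forEachPage walks the B+tree rooted at b.root and calls fn with
+	// each page and its depth; Stats above uses exactly this callback shape.)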
+ b.tx.forEachPage(b.root, 0, fn) +} + +// forEachPageNode iterates over every page (or node) in a bucket. +// This also includes inline pages. +func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) { + // If we have an inline page or root node then just use that. + if b.page != nil { + fn(b.page, nil, 0) + return + } + b._forEachPageNode(b.root, 0, fn) +} + +func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) { + var p, n = b.pageNode(pgid) + + // Execute function. + fn(p, n, depth) + + // Recursively loop over children. + if p != nil { + if (p.flags & branchPageFlag) != 0 { + for i := 0; i < int(p.count); i++ { + elem := p.branchPageElement(uint16(i)) + b._forEachPageNode(elem.pgid, depth+1, fn) + } + } + } else { + if !n.isLeaf { + for _, inode := range n.inodes { + b._forEachPageNode(inode.pgid, depth+1, fn) + } + } + } +} + +// spill writes all the nodes for this bucket to dirty pages. +func (b *Bucket) spill() error { + // Spill all child buckets first. + for name, child := range b.buckets { + // If the child bucket is small enough and it has no child buckets then + // write it inline into the parent bucket's page. Otherwise spill it + // like a normal bucket and make the parent value a pointer to the page. + var value []byte + if child.inlineable() { + child.free() + value = child.write() + } else { + if err := child.spill(); err != nil { + return err + } + + // Update the child bucket header in this bucket. + value = make([]byte, unsafe.Sizeof(bucket{})) + var bucket = (*bucket)(unsafe.Pointer(&value[0])) + *bucket = *child.bucket + } + + // Skip writing the bucket if there are no materialized nodes. + if child.rootNode == nil { + continue + } + + // Update parent node. + var c = b.Cursor() + k, _, flags := c.seek([]byte(name)) + if !bytes.Equal([]byte(name), k) { + panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k)) + } + if flags&bucketLeafFlag == 0 { + panic(fmt.Sprintf("unexpected bucket header flag: %x", flags)) + } + c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag) + } + + // Ignore if there's not a materialized root node. + if b.rootNode == nil { + return nil + } + + // Spill nodes. + if err := b.rootNode.spill(); err != nil { + return err + } + b.rootNode = b.rootNode.root() + + // Update the root node for this bucket. + if b.rootNode.pgid >= b.tx.meta.pgid { + panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid)) + } + b.root = b.rootNode.pgid + + return nil +} + +// inlineable returns true if a bucket is small enough to be written inline +// and if it contains no subbuckets. Otherwise returns false. +func (b *Bucket) inlineable() bool { + var n = b.rootNode + + // Bucket must only contain a single leaf node. + if n == nil || !n.isLeaf { + return false + } + + // Bucket is not inlineable if it contains subbuckets or if it goes beyond + // our threshold for inline bucket size. + var size = pageHeaderSize + for _, inode := range n.inodes { + size += leafPageElementSize + len(inode.key) + len(inode.value) + + if inode.flags&bucketLeafFlag != 0 { + return false + } else if size > b.maxInlineBucketSize() { + return false + } + } + + return true +} + +// Returns the maximum total size of a bucket to make it a candidate for inlining. +func (b *Bucket) maxInlineBucketSize() int { + return b.tx.db.pageSize / 4 +} + +// write allocates and writes a bucket to a byte slice. +func (b *Bucket) write() []byte { + // Allocate the appropriate size. 
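+	// (Serialized layout, for reference: the bucket header immediately
+	// followed by a fake page holding the root node, i.e.
+	// [bucketHeaderSize bytes][page header][leaf elements...].)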
+ var n = b.rootNode + var value = make([]byte, bucketHeaderSize+n.size()) + + // Write a bucket header. + var bucket = (*bucket)(unsafe.Pointer(&value[0])) + *bucket = *b.bucket + + // Convert byte slice to a fake page and write the root node. + var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize])) + n.write(p) + + return value +} + +// rebalance attempts to balance all nodes. +func (b *Bucket) rebalance() { + for _, n := range b.nodes { + n.rebalance() + } + for _, child := range b.buckets { + child.rebalance() + } +} + +// node creates a node from a page and associates it with a given parent. +func (b *Bucket) node(pgid pgid, parent *node) *node { + _assert(b.nodes != nil, "nodes map expected") + + // Retrieve node if it's already been created. + if n := b.nodes[pgid]; n != nil { + return n + } + + // Otherwise create a node and cache it. + n := &node{bucket: b, parent: parent} + if parent == nil { + b.rootNode = n + } else { + parent.children = append(parent.children, n) + } + + // Use the inline page if this is an inline bucket. + var p = b.page + if p == nil { + p = b.tx.page(pgid) + } + + // Read the page into the node and cache it. + n.read(p) + b.nodes[pgid] = n + + // Update statistics. + b.tx.stats.NodeCount++ + + return n +} + +// free recursively frees all pages in the bucket. +func (b *Bucket) free() { + if b.root == 0 { + return + } + + var tx = b.tx + b.forEachPageNode(func(p *page, n *node, _ int) { + if p != nil { + tx.db.freelist.free(tx.meta.txid, p) + } else { + n.free() + } + }) + b.root = 0 +} + +// dereference removes all references to the old mmap. +func (b *Bucket) dereference() { + if b.rootNode != nil { + b.rootNode.root().dereference() + } + + for _, child := range b.buckets { + child.dereference() + } +} + +// pageNode returns the in-memory node, if it exists. +// Otherwise returns the underlying page. +func (b *Bucket) pageNode(id pgid) (*page, *node) { + // Inline buckets have a fake page embedded in their value so treat them + // differently. We'll return the rootNode (if available) or the fake page. + if b.root == 0 { + if id != 0 { + panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id)) + } + if b.rootNode != nil { + return nil, b.rootNode + } + return b.page, nil + } + + // Check the node cache for non-inline buckets. + if b.nodes != nil { + if n := b.nodes[id]; n != nil { + return nil, n + } + } + + // Finally lookup the page from the transaction if no node is materialized. + return b.tx.page(id), nil +} + +// BucketStats records statistics about resources used by a bucket. +type BucketStats struct { + // Page count statistics. + BranchPageN int // number of logical branch pages + BranchOverflowN int // number of physical branch overflow pages + LeafPageN int // number of logical leaf pages + LeafOverflowN int // number of physical leaf overflow pages + + // Tree statistics. + KeyN int // number of keys/value pairs + Depth int // number of levels in B+tree + + // Page size utilization. 
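+	// (The *Alloc fields below count whole pages, so each is always at least
+	// as large as its *Inuse counterpart, which counts only the bytes
+	// occupied by headers, elements, keys, and values.)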
+ BranchAlloc int // bytes allocated for physical branch pages + BranchInuse int // bytes actually used for branch data + LeafAlloc int // bytes allocated for physical leaf pages + LeafInuse int // bytes actually used for leaf data + + // Bucket statistics + BucketN int // total number of buckets including the top bucket + InlineBucketN int // total number on inlined buckets + InlineBucketInuse int // bytes used for inlined buckets (also accounted for in LeafInuse) +} + +func (s *BucketStats) Add(other BucketStats) { + s.BranchPageN += other.BranchPageN + s.BranchOverflowN += other.BranchOverflowN + s.LeafPageN += other.LeafPageN + s.LeafOverflowN += other.LeafOverflowN + s.KeyN += other.KeyN + if s.Depth < other.Depth { + s.Depth = other.Depth + } + s.BranchAlloc += other.BranchAlloc + s.BranchInuse += other.BranchInuse + s.LeafAlloc += other.LeafAlloc + s.LeafInuse += other.LeafInuse + + s.BucketN += other.BucketN + s.InlineBucketN += other.InlineBucketN + s.InlineBucketInuse += other.InlineBucketInuse +} + +// cloneBytes returns a copy of a given slice. +func cloneBytes(v []byte) []byte { + var clone = make([]byte, len(v)) + copy(clone, v) + return clone +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/bucket_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/bucket_test.go new file mode 100644 index 000000000..7ceb6f536 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/bucket_test.go @@ -0,0 +1,1153 @@ +package bolt_test + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "math/rand" + "os" + "strconv" + "strings" + "testing" + "testing/quick" + + "github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt" +) + +// Ensure that a bucket that gets a non-existent key returns nil. +func TestBucket_Get_NonExistent(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + assert(t, value == nil, "") + return nil + }) +} + +// Ensure that a bucket can read a value that is not flushed yet. +func TestBucket_Get_FromNode(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + b.Put([]byte("foo"), []byte("bar")) + value := b.Get([]byte("foo")) + equals(t, []byte("bar"), value) + return nil + }) +} + +// Ensure that a bucket retrieved via Get() returns a nil. +func TestBucket_Get_IncompatibleValue(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + _, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo")) + ok(t, err) + assert(t, tx.Bucket([]byte("widgets")).Get([]byte("foo")) == nil, "") + return nil + }) +} + +// Ensure that a bucket can write a key/value. +func TestBucket_Put(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + err := tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + ok(t, err) + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + equals(t, value, []byte("bar")) + return nil + }) +} + +// Ensure that a bucket can rewrite a key in the same transaction. 
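+// A minimal sketch of the behavior exercised below, with bucket b assumed to
+// exist in a writable transaction:
+//
+//	b.Put([]byte("foo"), []byte("bar"))
+//	b.Put([]byte("foo"), []byte("baz"))
+//	b.Get([]byte("foo")) // returns "baz": the last write wins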
+func TestBucket_Put_Repeat(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + ok(t, b.Put([]byte("foo"), []byte("bar"))) + ok(t, b.Put([]byte("foo"), []byte("baz"))) + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + equals(t, value, []byte("baz")) + return nil + }) +} + +// Ensure that a bucket can write a bunch of large values. +func TestBucket_Put_Large(t *testing.T) { + db := NewTestDB() + defer db.Close() + + count, factor := 100, 200 + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + for i := 1; i < count; i++ { + ok(t, b.Put([]byte(strings.Repeat("0", i*factor)), []byte(strings.Repeat("X", (count-i)*factor)))) + } + return nil + }) + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for i := 1; i < count; i++ { + value := b.Get([]byte(strings.Repeat("0", i*factor))) + equals(t, []byte(strings.Repeat("X", (count-i)*factor)), value) + } + return nil + }) +} + +// Ensure that a database can perform multiple large appends safely. +func TestDB_Put_VeryLarge(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + n, batchN := 400000, 200000 + ksize, vsize := 8, 500 + + db := NewTestDB() + defer db.Close() + + for i := 0; i < n; i += batchN { + err := db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists([]byte("widgets")) + for j := 0; j < batchN; j++ { + k, v := make([]byte, ksize), make([]byte, vsize) + binary.BigEndian.PutUint32(k, uint32(i+j)) + ok(t, b.Put(k, v)) + } + return nil + }) + ok(t, err) + } +} + +// Ensure that a setting a value on a key with a bucket value returns an error. +func TestBucket_Put_IncompatibleValue(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + _, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo")) + ok(t, err) + equals(t, bolt.ErrIncompatibleValue, tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))) + return nil + }) +} + +// Ensure that a setting a value while the transaction is closed returns an error. +func TestBucket_Put_Closed(t *testing.T) { + db := NewTestDB() + defer db.Close() + tx, _ := db.Begin(true) + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + tx.Rollback() + equals(t, bolt.ErrTxClosed, b.Put([]byte("foo"), []byte("bar"))) +} + +// Ensure that setting a value on a read-only bucket returns an error. +func TestBucket_Put_ReadOnly(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + ok(t, err) + return nil + }) + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + err := b.Put([]byte("foo"), []byte("bar")) + equals(t, err, bolt.ErrTxNotWritable) + return nil + }) +} + +// Ensure that a bucket can delete an existing key. +func TestBucket_Delete(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + err := tx.Bucket([]byte("widgets")).Delete([]byte("foo")) + ok(t, err) + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + assert(t, value == nil, "") + return nil + }) +} + +// Ensure that deleting a large set of keys will work correctly. 
+func TestBucket_Delete_Large(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + var b, _ = tx.CreateBucket([]byte("widgets")) + for i := 0; i < 100; i++ { + ok(t, b.Put([]byte(strconv.Itoa(i)), []byte(strings.Repeat("*", 1024)))) + } + return nil + }) + db.Update(func(tx *bolt.Tx) error { + var b = tx.Bucket([]byte("widgets")) + for i := 0; i < 100; i++ { + ok(t, b.Delete([]byte(strconv.Itoa(i)))) + } + return nil + }) + db.View(func(tx *bolt.Tx) error { + var b = tx.Bucket([]byte("widgets")) + for i := 0; i < 100; i++ { + assert(t, b.Get([]byte(strconv.Itoa(i))) == nil, "") + } + return nil + }) +} + +// Deleting a very large list of keys will cause the freelist to use overflow. +func TestBucket_Delete_FreelistOverflow(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + db := NewTestDB() + defer db.Close() + k := make([]byte, 16) + for i := uint64(0); i < 10000; i++ { + err := db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucketIfNotExists([]byte("0")) + if err != nil { + t.Fatalf("bucket error: %s", err) + } + + for j := uint64(0); j < 1000; j++ { + binary.BigEndian.PutUint64(k[:8], i) + binary.BigEndian.PutUint64(k[8:], j) + if err := b.Put(k, nil); err != nil { + t.Fatalf("put error: %s", err) + } + } + + return nil + }) + + if err != nil { + t.Fatalf("update error: %s", err) + } + } + + // Delete all of them in one large transaction + err := db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("0")) + c := b.Cursor() + for k, _ := c.First(); k != nil; k, _ = c.Next() { + b.Delete(k) + } + return nil + }) + + // Check that a freelist overflow occurred. + ok(t, err) +} + +// Ensure that accessing and updating nested buckets is ok across transactions. +func TestBucket_Nested(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + // Create a widgets bucket. + b, err := tx.CreateBucket([]byte("widgets")) + ok(t, err) + + // Create a widgets/foo bucket. + _, err = b.CreateBucket([]byte("foo")) + ok(t, err) + + // Create a widgets/bar key. + ok(t, b.Put([]byte("bar"), []byte("0000"))) + + return nil + }) + db.MustCheck() + + // Update widgets/bar. + db.Update(func(tx *bolt.Tx) error { + var b = tx.Bucket([]byte("widgets")) + ok(t, b.Put([]byte("bar"), []byte("xxxx"))) + return nil + }) + db.MustCheck() + + // Cause a split. + db.Update(func(tx *bolt.Tx) error { + var b = tx.Bucket([]byte("widgets")) + for i := 0; i < 10000; i++ { + ok(t, b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i)))) + } + return nil + }) + db.MustCheck() + + // Insert into widgets/foo/baz. + db.Update(func(tx *bolt.Tx) error { + var b = tx.Bucket([]byte("widgets")) + ok(t, b.Bucket([]byte("foo")).Put([]byte("baz"), []byte("yyyy"))) + return nil + }) + db.MustCheck() + + // Verify. + db.View(func(tx *bolt.Tx) error { + var b = tx.Bucket([]byte("widgets")) + equals(t, []byte("yyyy"), b.Bucket([]byte("foo")).Get([]byte("baz"))) + equals(t, []byte("xxxx"), b.Get([]byte("bar"))) + for i := 0; i < 10000; i++ { + equals(t, []byte(strconv.Itoa(i)), b.Get([]byte(strconv.Itoa(i)))) + } + return nil + }) +} + +// Ensure that deleting a bucket using Delete() returns an error. 
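+// Plain keys and bucket keys share one keyspace but are not interchangeable;
+// a sketch of the failure mode checked below (bucket b assumed writable):
+//
+//	b.CreateBucket([]byte("foo"))
+//	err := b.Delete([]byte("foo")) // err == bolt.ErrIncompatibleValue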
+func TestBucket_Delete_Bucket(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + _, err := b.CreateBucket([]byte("foo")) + ok(t, err) + equals(t, bolt.ErrIncompatibleValue, b.Delete([]byte("foo"))) + return nil + }) +} + +// Ensure that deleting a key on a read-only bucket returns an error. +func TestBucket_Delete_ReadOnly(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + return nil + }) + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + err := b.Delete([]byte("foo")) + equals(t, err, bolt.ErrTxNotWritable) + return nil + }) +} + +// Ensure that a deleting value while the transaction is closed returns an error. +func TestBucket_Delete_Closed(t *testing.T) { + db := NewTestDB() + defer db.Close() + tx, _ := db.Begin(true) + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + tx.Rollback() + equals(t, bolt.ErrTxClosed, b.Delete([]byte("foo"))) +} + +// Ensure that deleting a bucket causes nested buckets to be deleted. +func TestBucket_DeleteBucket_Nested(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + _, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo")) + ok(t, err) + _, err = tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).CreateBucket([]byte("bar")) + ok(t, err) + ok(t, tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).Bucket([]byte("bar")).Put([]byte("baz"), []byte("bat"))) + ok(t, tx.Bucket([]byte("widgets")).DeleteBucket([]byte("foo"))) + return nil + }) +} + +// Ensure that deleting a bucket causes nested buckets to be deleted after they have been committed. +func TestBucket_DeleteBucket_Nested2(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + _, err := tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo")) + ok(t, err) + _, err = tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).CreateBucket([]byte("bar")) + ok(t, err) + ok(t, tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).Bucket([]byte("bar")).Put([]byte("baz"), []byte("bat"))) + return nil + }) + db.Update(func(tx *bolt.Tx) error { + assert(t, tx.Bucket([]byte("widgets")) != nil, "") + assert(t, tx.Bucket([]byte("widgets")).Bucket([]byte("foo")) != nil, "") + assert(t, tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).Bucket([]byte("bar")) != nil, "") + equals(t, []byte("bat"), tx.Bucket([]byte("widgets")).Bucket([]byte("foo")).Bucket([]byte("bar")).Get([]byte("baz"))) + ok(t, tx.DeleteBucket([]byte("widgets"))) + return nil + }) + db.View(func(tx *bolt.Tx) error { + assert(t, tx.Bucket([]byte("widgets")) == nil, "") + return nil + }) +} + +// Ensure that deleting a child bucket with multiple pages causes all pages to get collected. 
+func TestBucket_DeleteBucket_Large(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + ok(t, err) + _, err = tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo")) + ok(t, err) + b := tx.Bucket([]byte("widgets")).Bucket([]byte("foo")) + for i := 0; i < 1000; i++ { + ok(t, b.Put([]byte(fmt.Sprintf("%d", i)), []byte(fmt.Sprintf("%0100d", i)))) + } + return nil + }) + db.Update(func(tx *bolt.Tx) error { + ok(t, tx.DeleteBucket([]byte("widgets"))) + return nil + }) + + // NOTE: Consistency check in TestDB.Close() will panic if pages not freed properly. +} + +// Ensure that a simple value retrieved via Bucket() returns a nil. +func TestBucket_Bucket_IncompatibleValue(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + ok(t, tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))) + assert(t, tx.Bucket([]byte("widgets")).Bucket([]byte("foo")) == nil, "") + return nil + }) +} + +// Ensure that creating a bucket on an existing non-bucket key returns an error. +func TestBucket_CreateBucket_IncompatibleValue(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + ok(t, err) + ok(t, tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))) + _, err = tx.Bucket([]byte("widgets")).CreateBucket([]byte("foo")) + equals(t, bolt.ErrIncompatibleValue, err) + return nil + }) +} + +// Ensure that deleting a bucket on an existing non-bucket key returns an error. +func TestBucket_DeleteBucket_IncompatibleValue(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + ok(t, err) + ok(t, tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))) + equals(t, bolt.ErrIncompatibleValue, tx.Bucket([]byte("widgets")).DeleteBucket([]byte("foo"))) + return nil + }) +} + +// Ensure that a bucket can return an autoincrementing sequence. +func TestBucket_NextSequence(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.CreateBucket([]byte("woojits")) + + // Make sure sequence increments. + seq, err := tx.Bucket([]byte("widgets")).NextSequence() + ok(t, err) + equals(t, seq, uint64(1)) + seq, err = tx.Bucket([]byte("widgets")).NextSequence() + ok(t, err) + equals(t, seq, uint64(2)) + + // Buckets should be separate. + seq, err = tx.Bucket([]byte("woojits")).NextSequence() + ok(t, err) + equals(t, seq, uint64(1)) + return nil + }) +} + +// Ensure that a bucket will persist an autoincrementing sequence even if its +// the only thing updated on the bucket. +// https://github.com/boltdb/bolt/issues/296 +func TestBucket_NextSequence_Persist(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + _, _ = tx.CreateBucket([]byte("widgets")) + return nil + }) + + db.Update(func(tx *bolt.Tx) error { + _, _ = tx.Bucket([]byte("widgets")).NextSequence() + return nil + }) + + db.Update(func(tx *bolt.Tx) error { + seq, err := tx.Bucket([]byte("widgets")).NextSequence() + if err != nil { + t.Fatalf("unexpected error: %s", err) + } else if seq != 2 { + t.Fatalf("unexpected sequence: %d", seq) + } + return nil + }) +} + +// Ensure that retrieving the next sequence on a read-only bucket returns an error. 
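+// NextSequence mutates the bucket header, so it is a write operation: inside
+// db.View it returns 0 and bolt.ErrTxNotWritable, as verified below.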
+func TestBucket_NextSequence_ReadOnly(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + return nil + }) + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + i, err := b.NextSequence() + equals(t, i, uint64(0)) + equals(t, err, bolt.ErrTxNotWritable) + return nil + }) +} + +// Ensure that retrieving the next sequence for a bucket on a closed database return an error. +func TestBucket_NextSequence_Closed(t *testing.T) { + db := NewTestDB() + defer db.Close() + tx, _ := db.Begin(true) + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + tx.Rollback() + _, err := b.NextSequence() + equals(t, bolt.ErrTxClosed, err) +} + +// Ensure a user can loop over all key/value pairs in a bucket. +func TestBucket_ForEach(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("0000")) + tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte("0001")) + tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte("0002")) + + var index int + err := tx.Bucket([]byte("widgets")).ForEach(func(k, v []byte) error { + switch index { + case 0: + equals(t, k, []byte("bar")) + equals(t, v, []byte("0002")) + case 1: + equals(t, k, []byte("baz")) + equals(t, v, []byte("0001")) + case 2: + equals(t, k, []byte("foo")) + equals(t, v, []byte("0000")) + } + index++ + return nil + }) + ok(t, err) + equals(t, index, 3) + return nil + }) +} + +// Ensure a database can stop iteration early. +func TestBucket_ForEach_ShortCircuit(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte("0000")) + tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte("0000")) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("0000")) + + var index int + err := tx.Bucket([]byte("widgets")).ForEach(func(k, v []byte) error { + index++ + if bytes.Equal(k, []byte("baz")) { + return errors.New("marker") + } + return nil + }) + equals(t, errors.New("marker"), err) + equals(t, 2, index) + return nil + }) +} + +// Ensure that looping over a bucket on a closed database returns an error. +func TestBucket_ForEach_Closed(t *testing.T) { + db := NewTestDB() + defer db.Close() + tx, _ := db.Begin(true) + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + tx.Rollback() + err := b.ForEach(func(k, v []byte) error { return nil }) + equals(t, bolt.ErrTxClosed, err) +} + +// Ensure that an error is returned when inserting with an empty key. +func TestBucket_Put_EmptyKey(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + err := tx.Bucket([]byte("widgets")).Put([]byte(""), []byte("bar")) + equals(t, err, bolt.ErrKeyRequired) + err = tx.Bucket([]byte("widgets")).Put(nil, []byte("bar")) + equals(t, err, bolt.ErrKeyRequired) + return nil + }) +} + +// Ensure that an error is returned when inserting with a key that's too large. +func TestBucket_Put_KeyTooLarge(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + err := tx.Bucket([]byte("widgets")).Put(make([]byte, 32769), []byte("bar")) + equals(t, err, bolt.ErrKeyTooLarge) + return nil + }) +} + +// Ensure a bucket can calculate stats. 
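+// Stats is read-only and is typically collected from a View transaction; a
+// hedged sketch (the bucket name is assumed):
+//
+//	db.View(func(tx *bolt.Tx) error {
+//		s := tx.Bucket([]byte("woojits")).Stats()
+//		fmt.Printf("%d keys in a depth-%d tree\n", s.KeyN, s.Depth)
+//		return nil
+//	})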
+func TestBucket_Stats(t *testing.T) { + db := NewTestDB() + defer db.Close() + + // Add bucket with fewer keys but one big value. + big_key := []byte("really-big-value") + for i := 0; i < 500; i++ { + db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists([]byte("woojits")) + return b.Put([]byte(fmt.Sprintf("%03d", i)), []byte(strconv.Itoa(i))) + }) + } + db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists([]byte("woojits")) + return b.Put(big_key, []byte(strings.Repeat("*", 10000))) + }) + + db.MustCheck() + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("woojits")) + stats := b.Stats() + equals(t, 1, stats.BranchPageN) + equals(t, 0, stats.BranchOverflowN) + equals(t, 7, stats.LeafPageN) + equals(t, 2, stats.LeafOverflowN) + equals(t, 501, stats.KeyN) + equals(t, 2, stats.Depth) + + branchInuse := 16 // branch page header + branchInuse += 7 * 16 // branch elements + branchInuse += 7 * 3 // branch keys (6 3-byte keys) + equals(t, branchInuse, stats.BranchInuse) + + leafInuse := 7 * 16 // leaf page header + leafInuse += 501 * 16 // leaf elements + leafInuse += 500*3 + len(big_key) // leaf keys + leafInuse += 1*10 + 2*90 + 3*400 + 10000 // leaf values + equals(t, leafInuse, stats.LeafInuse) + + if os.Getpagesize() == 4096 { + // Incompatible page size + equals(t, 4096, stats.BranchAlloc) + equals(t, 36864, stats.LeafAlloc) + } + + equals(t, 1, stats.BucketN) + equals(t, 0, stats.InlineBucketN) + equals(t, 0, stats.InlineBucketInuse) + return nil + }) +} + +// Ensure a bucket with random insertion utilizes fill percentage correctly. +func TestBucket_Stats_RandomFill(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } else if os.Getpagesize() != 4096 { + t.Skip("invalid page size for test") + } + + db := NewTestDB() + defer db.Close() + + // Add a set of values in random order. It will be the same random + // order so we can maintain consistency between test runs. + var count int + r := rand.New(rand.NewSource(42)) + for _, i := range r.Perm(1000) { + db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists([]byte("woojits")) + b.FillPercent = 0.9 + for _, j := range r.Perm(100) { + index := (j * 10000) + i + b.Put([]byte(fmt.Sprintf("%d000000000000000", index)), []byte("0000000000")) + count++ + } + return nil + }) + } + db.MustCheck() + + db.View(func(tx *bolt.Tx) error { + s := tx.Bucket([]byte("woojits")).Stats() + equals(t, 100000, s.KeyN) + + equals(t, 98, s.BranchPageN) + equals(t, 0, s.BranchOverflowN) + equals(t, 130984, s.BranchInuse) + equals(t, 401408, s.BranchAlloc) + + equals(t, 3412, s.LeafPageN) + equals(t, 0, s.LeafOverflowN) + equals(t, 4742482, s.LeafInuse) + equals(t, 13975552, s.LeafAlloc) + return nil + }) +} + +// Ensure a bucket can calculate stats. +func TestBucket_Stats_Small(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + // Add a bucket that fits on a single root leaf. 
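+		// (Buckets smaller than roughly a quarter of a page are stored inline
+		// in the parent's leaf page, which is why the assertions below expect
+		// zero branch/leaf pages and exactly one inline bucket.)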
+ b, err := tx.CreateBucket([]byte("whozawhats")) + ok(t, err) + b.Put([]byte("foo"), []byte("bar")) + + return nil + }) + db.MustCheck() + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("whozawhats")) + stats := b.Stats() + equals(t, 0, stats.BranchPageN) + equals(t, 0, stats.BranchOverflowN) + equals(t, 0, stats.LeafPageN) + equals(t, 0, stats.LeafOverflowN) + equals(t, 1, stats.KeyN) + equals(t, 1, stats.Depth) + equals(t, 0, stats.BranchInuse) + equals(t, 0, stats.LeafInuse) + if os.Getpagesize() == 4096 { + // Incompatible page size + equals(t, 0, stats.BranchAlloc) + equals(t, 0, stats.LeafAlloc) + } + equals(t, 1, stats.BucketN) + equals(t, 1, stats.InlineBucketN) + equals(t, 16+16+6, stats.InlineBucketInuse) + return nil + }) +} + +func TestBucket_Stats_EmptyBucket(t *testing.T) { + db := NewTestDB() + defer db.Close() + + db.Update(func(tx *bolt.Tx) error { + // Add a bucket that fits on a single root leaf. + _, err := tx.CreateBucket([]byte("whozawhats")) + ok(t, err) + return nil + }) + db.MustCheck() + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("whozawhats")) + stats := b.Stats() + equals(t, 0, stats.BranchPageN) + equals(t, 0, stats.BranchOverflowN) + equals(t, 0, stats.LeafPageN) + equals(t, 0, stats.LeafOverflowN) + equals(t, 0, stats.KeyN) + equals(t, 1, stats.Depth) + equals(t, 0, stats.BranchInuse) + equals(t, 0, stats.LeafInuse) + if os.Getpagesize() == 4096 { + // Incompatible page size + equals(t, 0, stats.BranchAlloc) + equals(t, 0, stats.LeafAlloc) + } + equals(t, 1, stats.BucketN) + equals(t, 1, stats.InlineBucketN) + equals(t, 16, stats.InlineBucketInuse) + return nil + }) +} + +// Ensure a bucket can calculate stats. +func TestBucket_Stats_Nested(t *testing.T) { + db := NewTestDB() + defer db.Close() + + db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("foo")) + ok(t, err) + for i := 0; i < 100; i++ { + b.Put([]byte(fmt.Sprintf("%02d", i)), []byte(fmt.Sprintf("%02d", i))) + } + bar, err := b.CreateBucket([]byte("bar")) + ok(t, err) + for i := 0; i < 10; i++ { + bar.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))) + } + baz, err := bar.CreateBucket([]byte("baz")) + ok(t, err) + for i := 0; i < 10; i++ { + baz.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))) + } + return nil + }) + + db.MustCheck() + + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("foo")) + stats := b.Stats() + equals(t, 0, stats.BranchPageN) + equals(t, 0, stats.BranchOverflowN) + equals(t, 2, stats.LeafPageN) + equals(t, 0, stats.LeafOverflowN) + equals(t, 122, stats.KeyN) + equals(t, 3, stats.Depth) + equals(t, 0, stats.BranchInuse) + + foo := 16 // foo (pghdr) + foo += 101 * 16 // foo leaf elements + foo += 100*2 + 100*2 // foo leaf key/values + foo += 3 + 16 // foo -> bar key/value + + bar := 16 // bar (pghdr) + bar += 11 * 16 // bar leaf elements + bar += 10 + 10 // bar leaf key/values + bar += 3 + 16 // bar -> baz key/value + + baz := 16 // baz (inline) (pghdr) + baz += 10 * 16 // baz leaf elements + baz += 10 + 10 // baz leaf key/values + + equals(t, foo+bar+baz, stats.LeafInuse) + if os.Getpagesize() == 4096 { + // Incompatible page size + equals(t, 0, stats.BranchAlloc) + equals(t, 8192, stats.LeafAlloc) + } + equals(t, 3, stats.BucketN) + equals(t, 1, stats.InlineBucketN) + equals(t, baz, stats.InlineBucketInuse) + return nil + }) +} + +// Ensure a large bucket can calculate stats. 
+func TestBucket_Stats_Large(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + db := NewTestDB() + defer db.Close() + + var index int + for i := 0; i < 100; i++ { + db.Update(func(tx *bolt.Tx) error { + // Add bucket with lots of keys. + b, _ := tx.CreateBucketIfNotExists([]byte("widgets")) + for i := 0; i < 1000; i++ { + b.Put([]byte(strconv.Itoa(index)), []byte(strconv.Itoa(index))) + index++ + } + return nil + }) + } + db.MustCheck() + + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + stats := b.Stats() + equals(t, 13, stats.BranchPageN) + equals(t, 0, stats.BranchOverflowN) + equals(t, 1196, stats.LeafPageN) + equals(t, 0, stats.LeafOverflowN) + equals(t, 100000, stats.KeyN) + equals(t, 3, stats.Depth) + equals(t, 25257, stats.BranchInuse) + equals(t, 2596916, stats.LeafInuse) + if os.Getpagesize() == 4096 { + // Incompatible page size + equals(t, 53248, stats.BranchAlloc) + equals(t, 4898816, stats.LeafAlloc) + } + equals(t, 1, stats.BucketN) + equals(t, 0, stats.InlineBucketN) + equals(t, 0, stats.InlineBucketInuse) + return nil + }) +} + +// Ensure that a bucket can write random keys and values across multiple transactions. +func TestBucket_Put_Single(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + index := 0 + f := func(items testdata) bool { + db := NewTestDB() + defer db.Close() + + m := make(map[string][]byte) + + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }) + for _, item := range items { + db.Update(func(tx *bolt.Tx) error { + if err := tx.Bucket([]byte("widgets")).Put(item.Key, item.Value); err != nil { + panic("put error: " + err.Error()) + } + m[string(item.Key)] = item.Value + return nil + }) + + // Verify all key/values so far. + db.View(func(tx *bolt.Tx) error { + i := 0 + for k, v := range m { + value := tx.Bucket([]byte("widgets")).Get([]byte(k)) + if !bytes.Equal(value, v) { + t.Logf("value mismatch [run %d] (%d of %d):\nkey: %x\ngot: %x\nexp: %x", index, i, len(m), []byte(k), value, v) + db.CopyTempFile() + t.FailNow() + } + i++ + } + return nil + }) + } + + index++ + return true + } + if err := quick.Check(f, qconfig()); err != nil { + t.Error(err) + } +} + +// Ensure that a transaction can insert multiple key/value pairs at once. +func TestBucket_Put_Multiple(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + f := func(items testdata) bool { + db := NewTestDB() + defer db.Close() + // Bulk insert all values. + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }) + err := db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for _, item := range items { + ok(t, b.Put(item.Key, item.Value)) + } + return nil + }) + ok(t, err) + + // Verify all items exist. + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for _, item := range items { + value := b.Get(item.Key) + if !bytes.Equal(item.Value, value) { + db.CopyTempFile() + t.Fatalf("exp=%x; got=%x", item.Value, value) + } + } + return nil + }) + return true + } + if err := quick.Check(f, qconfig()); err != nil { + t.Error(err) + } +} + +// Ensure that a transaction can delete all key/value pairs and return to a single leaf page. 
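+// Like TestBucket_Put_Single above, this is a property-based test:
+// testing/quick generates random testdata sets and re-runs f until the
+// qconfig() count is exhausted or a failing input is found.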
+func TestBucket_Delete_Quick(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + f := func(items testdata) bool { + db := NewTestDB() + defer db.Close() + // Bulk insert all values. + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }) + err := db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + for _, item := range items { + ok(t, b.Put(item.Key, item.Value)) + } + return nil + }) + ok(t, err) + + // Remove items one at a time and check consistency. + for _, item := range items { + err := db.Update(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Delete(item.Key) + }) + ok(t, err) + } + + // Anything before our deletion index should be nil. + db.View(func(tx *bolt.Tx) error { + tx.Bucket([]byte("widgets")).ForEach(func(k, v []byte) error { + t.Fatalf("bucket should be empty; found: %06x", trunc(k, 3)) + return nil + }) + return nil + }) + return true + } + if err := quick.Check(f, qconfig()); err != nil { + t.Error(err) + } +} + +func ExampleBucket_Put() { + // Open the database. + db, _ := bolt.Open(tempfile(), 0666, nil) + defer os.Remove(db.Path()) + defer db.Close() + + // Start a write transaction. + db.Update(func(tx *bolt.Tx) error { + // Create a bucket. + tx.CreateBucket([]byte("widgets")) + + // Set the value "bar" for the key "foo". + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + return nil + }) + + // Read value back in a different read-only transaction. + db.View(func(tx *bolt.Tx) error { + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + fmt.Printf("The value of 'foo' is: %s\n", value) + return nil + }) + + // Output: + // The value of 'foo' is: bar +} + +func ExampleBucket_Delete() { + // Open the database. + db, _ := bolt.Open(tempfile(), 0666, nil) + defer os.Remove(db.Path()) + defer db.Close() + + // Start a write transaction. + db.Update(func(tx *bolt.Tx) error { + // Create a bucket. + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + + // Set the value "bar" for the key "foo". + b.Put([]byte("foo"), []byte("bar")) + + // Retrieve the key back from the database and verify it. + value := b.Get([]byte("foo")) + fmt.Printf("The value of 'foo' was: %s\n", value) + return nil + }) + + // Delete the key in a different write transaction. + db.Update(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Delete([]byte("foo")) + }) + + // Retrieve the key again. + db.View(func(tx *bolt.Tx) error { + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + if value == nil { + fmt.Printf("The value of 'foo' is now: nil\n") + } + return nil + }) + + // Output: + // The value of 'foo' was: bar + // The value of 'foo' is now: nil +} + +func ExampleBucket_ForEach() { + // Open the database. + db, _ := bolt.Open(tempfile(), 0666, nil) + defer os.Remove(db.Path()) + defer db.Close() + + // Insert data into a bucket. + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("animals")) + b := tx.Bucket([]byte("animals")) + b.Put([]byte("dog"), []byte("fun")) + b.Put([]byte("cat"), []byte("lame")) + b.Put([]byte("liger"), []byte("awesome")) + + // Iterate over items in sorted key order. + b.ForEach(func(k, v []byte) error { + fmt.Printf("A %s is %s.\n", k, v) + return nil + }) + return nil + }) + + // Output: + // A cat is lame. + // A dog is fun. + // A liger is awesome. 
+}
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/bolt b/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/bolt
new file mode 100644
index 000000000..fa30da00f
Binary files /dev/null and b/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/bolt differ
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/main.go b/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/main.go
new file mode 100644
index 000000000..5a7ae8e4e
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/main.go
@@ -0,0 +1,1529 @@
+package main
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"flag"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"runtime"
+	"runtime/pprof"
+	"strconv"
+	"strings"
+	"time"
+	"unicode"
+	"unicode/utf8"
+	"unsafe"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+var (
+	// ErrUsage is returned when a usage message was printed and the process
+	// should simply exit with an error.
+	ErrUsage = errors.New("usage")
+
+	// ErrUnknownCommand is returned when a CLI command is not recognized.
+	ErrUnknownCommand = errors.New("unknown command")
+
+	// ErrPathRequired is returned when the path to a Bolt database is not specified.
+	ErrPathRequired = errors.New("path required")
+
+	// ErrFileNotFound is returned when a Bolt database does not exist.
+	ErrFileNotFound = errors.New("file not found")
+
+	// ErrInvalidValue is returned when a benchmark reads an unexpected value.
+	ErrInvalidValue = errors.New("invalid value")
+
+	// ErrCorrupt is returned when checking a data file finds errors.
+	ErrCorrupt = errors.New("data file corrupted")
+
+	// ErrNonDivisibleBatchSize is returned when the iteration count is not
+	// evenly divisible by the batch size.
+	ErrNonDivisibleBatchSize = errors.New("number of iterations must be divisible by the batch size")
+
+	// ErrPageIDRequired is returned when a required page id is not specified.
+	ErrPageIDRequired = errors.New("page id required")
+
+	// ErrPageNotFound is returned when specifying a page above the high water mark.
+	ErrPageNotFound = errors.New("page not found")
+
+	// ErrPageFreed is returned when reading a page that has already been freed.
+	ErrPageFreed = errors.New("page freed")
+)
+
+// PageHeaderSize represents the size of the bolt.page header.
+const PageHeaderSize = 16
+
+func main() {
+	m := NewMain()
+	if err := m.Run(os.Args[1:]...); err == ErrUsage {
+		os.Exit(2)
+	} else if err != nil {
+		fmt.Println(err.Error())
+		os.Exit(1)
+	}
+}
+
+// Main represents the main program execution.
+type Main struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// NewMain returns a new instance of Main connected to the standard input/output.
+func NewMain() *Main {
+	return &Main{
+		Stdin:  os.Stdin,
+		Stdout: os.Stdout,
+		Stderr: os.Stderr,
+	}
+}
+
+// Run executes the program.
+func (m *Main) Run(args ...string) error {
+	// Require a command at the beginning.
+	if len(args) == 0 || strings.HasPrefix(args[0], "-") {
+		fmt.Fprintln(m.Stderr, m.Usage())
+		return ErrUsage
+	}
+
+	// Execute command.
+	switch args[0] {
+	case "help":
+		fmt.Fprintln(m.Stderr, m.Usage())
+		return ErrUsage
+	case "bench":
+		return newBenchCommand(m).Run(args[1:]...)
+	case "check":
+		return newCheckCommand(m).Run(args[1:]...)
+	case "dump":
+		return newDumpCommand(m).Run(args[1:]...)
+	case "info":
+		return newInfoCommand(m).Run(args[1:]...)
+	case "page":
+		return newPageCommand(m).Run(args[1:]...)
+	case "pages":
+		return newPagesCommand(m).Run(args[1:]...)
+ case "stats": + return newStatsCommand(m).Run(args[1:]...) + default: + return ErrUnknownCommand + } +} + +// Usage returns the help message. +func (m *Main) Usage() string { + return strings.TrimLeft(` +Bolt is a tool for inspecting bolt databases. + +Usage: + + bolt command [arguments] + +The commands are: + + bench run synthetic benchmark against bolt + check verifies integrity of bolt database + info print basic info + help print this screen + pages print list of pages with their types + stats iterate over all pages and generate usage stats + +Use "bolt [command] -h" for more information about a command. +`, "\n") +} + +// CheckCommand represents the "check" command execution. +type CheckCommand struct { + Stdin io.Reader + Stdout io.Writer + Stderr io.Writer +} + +// NewCheckCommand returns a CheckCommand. +func newCheckCommand(m *Main) *CheckCommand { + return &CheckCommand{ + Stdin: m.Stdin, + Stdout: m.Stdout, + Stderr: m.Stderr, + } +} + +// Run executes the command. +func (cmd *CheckCommand) Run(args ...string) error { + // Parse flags. + fs := flag.NewFlagSet("", flag.ContinueOnError) + help := fs.Bool("h", false, "") + if err := fs.Parse(args); err != nil { + return err + } else if *help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + // Require database path. + path := fs.Arg(0) + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } + + // Open database. + db, err := bolt.Open(path, 0666, nil) + if err != nil { + return err + } + defer db.Close() + + // Perform consistency check. + return db.View(func(tx *bolt.Tx) error { + var count int + ch := tx.Check() + loop: + for { + select { + case err, ok := <-ch: + if !ok { + break loop + } + fmt.Fprintln(cmd.Stdout, err) + count++ + } + } + + // Print summary of errors. + if count > 0 { + fmt.Fprintf(cmd.Stdout, "%d errors found\n", count) + return ErrCorrupt + } + + // Notify user that database is valid. + fmt.Fprintln(cmd.Stdout, "OK") + return nil + }) +} + +// Usage returns the help message. +func (cmd *CheckCommand) Usage() string { + return strings.TrimLeft(` +usage: bolt check PATH + +Check opens a database at PATH and runs an exhaustive check to verify that +all pages are accessible or are marked as freed. It also verifies that no +pages are double referenced. + +Verification errors will stream out as they are found and the process will +return after all pages have been checked. +`, "\n") +} + +// InfoCommand represents the "info" command execution. +type InfoCommand struct { + Stdin io.Reader + Stdout io.Writer + Stderr io.Writer +} + +// NewInfoCommand returns a InfoCommand. +func newInfoCommand(m *Main) *InfoCommand { + return &InfoCommand{ + Stdin: m.Stdin, + Stdout: m.Stdout, + Stderr: m.Stderr, + } +} + +// Run executes the command. +func (cmd *InfoCommand) Run(args ...string) error { + // Parse flags. + fs := flag.NewFlagSet("", flag.ContinueOnError) + help := fs.Bool("h", false, "") + if err := fs.Parse(args); err != nil { + return err + } else if *help { + fmt.Fprintln(cmd.Stderr, cmd.Usage()) + return ErrUsage + } + + // Require database path. + path := fs.Arg(0) + if path == "" { + return ErrPathRequired + } else if _, err := os.Stat(path); os.IsNotExist(err) { + return ErrFileNotFound + } + + // Open the database. + db, err := bolt.Open(path, 0666, nil) + if err != nil { + return err + } + defer db.Close() + + // Print basic database info. 
+	info := db.Info()
+	fmt.Fprintf(cmd.Stdout, "Page Size: %d\n", info.PageSize)
+
+	return nil
+}
+
+// Usage returns the help message.
+func (cmd *InfoCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt info PATH
+
+Info prints basic information about the Bolt database at PATH.
+`, "\n")
+}
+
+// DumpCommand represents the "dump" command execution.
+type DumpCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newDumpCommand returns a DumpCommand.
+func newDumpCommand(m *Main) *DumpCommand {
+	return &DumpCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the command.
+func (cmd *DumpCommand) Run(args ...string) error {
+	// Parse flags.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	help := fs.Bool("h", false, "")
+	if err := fs.Parse(args); err != nil {
+		return err
+	} else if *help {
+		fmt.Fprintln(cmd.Stderr, cmd.Usage())
+		return ErrUsage
+	}
+
+	// Require database path and page id.
+	path := fs.Arg(0)
+	if path == "" {
+		return ErrPathRequired
+	} else if _, err := os.Stat(path); os.IsNotExist(err) {
+		return ErrFileNotFound
+	}
+
+	// Read page ids.
+	pageIDs, err := atois(fs.Args()[1:])
+	if err != nil {
+		return err
+	} else if len(pageIDs) == 0 {
+		return ErrPageIDRequired
+	}
+
+	// Open database to retrieve page size.
+	pageSize, err := ReadPageSize(path)
+	if err != nil {
+		return err
+	}
+
+	// Open database file handle.
+	f, err := os.Open(path)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = f.Close() }()
+
+	// Print each page listed.
+	for i, pageID := range pageIDs {
+		// Print a separator.
+		if i > 0 {
+			fmt.Fprintln(cmd.Stdout, "===============================================\n")
+		}
+
+		// Print page to stdout.
+		if err := cmd.PrintPage(cmd.Stdout, f, pageID, pageSize); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// PrintPage prints a given page as hexadecimal.
+func (cmd *DumpCommand) PrintPage(w io.Writer, r io.ReaderAt, pageID int, pageSize int) error {
+	const bytesPerLineN = 16
+
+	// Read page into buffer.
+	buf := make([]byte, pageSize)
+	addr := pageID * pageSize
+	if n, err := r.ReadAt(buf, int64(addr)); err != nil {
+		return err
+	} else if n != pageSize {
+		return io.ErrUnexpectedEOF
+	}
+
+	// Write out to writer in 16-byte lines.
+	var prev []byte
+	var skipped bool
+	for offset := 0; offset < pageSize; offset += bytesPerLineN {
+		// Retrieve current 16-byte line.
+		line := buf[offset : offset+bytesPerLineN]
+		isLastLine := (offset == (pageSize - bytesPerLineN))
+
+		// If it's the same as the previous line then print a skip.
+		if bytes.Equal(line, prev) && !isLastLine {
+			if !skipped {
+				fmt.Fprintf(w, "%07x *\n", addr+offset)
+				skipped = true
+			}
+		} else {
+			// Print line as hexadecimal in 2-byte groups.
+			fmt.Fprintf(w, "%07x %04x %04x %04x %04x %04x %04x %04x %04x\n", addr+offset,
+				line[0:2], line[2:4], line[4:6], line[6:8],
+				line[8:10], line[10:12], line[12:14], line[14:16],
+			)
+
+			skipped = false
+		}
+
+		// Save the previous line.
+		prev = line
+	}
+	fmt.Fprint(w, "\n")
+
+	return nil
+}
+
+// Usage returns the help message.
+func (cmd *DumpCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt dump PATH pageid [pageid...]
+
+Dump prints a hexadecimal dump of one or more pages.
+`, "\n")
+}
+
+// PageCommand represents the "page" command execution.
+type PageCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newPageCommand returns a PageCommand.
+func newPageCommand(m *Main) *PageCommand {
+	return &PageCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the command.
+func (cmd *PageCommand) Run(args ...string) error {
+	// Parse flags.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	help := fs.Bool("h", false, "")
+	if err := fs.Parse(args); err != nil {
+		return err
+	} else if *help {
+		fmt.Fprintln(cmd.Stderr, cmd.Usage())
+		return ErrUsage
+	}
+
+	// Require database path and page id.
+	path := fs.Arg(0)
+	if path == "" {
+		return ErrPathRequired
+	} else if _, err := os.Stat(path); os.IsNotExist(err) {
+		return ErrFileNotFound
+	}
+
+	// Read page ids.
+	pageIDs, err := atois(fs.Args()[1:])
+	if err != nil {
+		return err
+	} else if len(pageIDs) == 0 {
+		return ErrPageIDRequired
+	}
+
+	// Open database file handle.
+	f, err := os.Open(path)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = f.Close() }()
+
+	// Print each page listed.
+	for i, pageID := range pageIDs {
+		// Print a separator.
+		if i > 0 {
+			fmt.Fprintln(cmd.Stdout, "===============================================\n")
+		}
+
+		// Retrieve page info and page size.
+		p, buf, err := ReadPage(path, pageID)
+		if err != nil {
+			return err
+		}
+
+		// Print basic page info.
+		fmt.Fprintf(cmd.Stdout, "Page ID: %d\n", p.id)
+		fmt.Fprintf(cmd.Stdout, "Page Type: %s\n", p.Type())
+		fmt.Fprintf(cmd.Stdout, "Total Size: %d bytes\n", len(buf))
+
+		// Print type-specific data.
+		switch p.Type() {
+		case "meta":
+			err = cmd.PrintMeta(cmd.Stdout, buf)
+		case "leaf":
+			err = cmd.PrintLeaf(cmd.Stdout, buf)
+		case "branch":
+			err = cmd.PrintBranch(cmd.Stdout, buf)
+		case "freelist":
+			err = cmd.PrintFreelist(cmd.Stdout, buf)
+		}
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// PrintMeta prints the data from the meta page.
+func (cmd *PageCommand) PrintMeta(w io.Writer, buf []byte) error {
+	m := (*meta)(unsafe.Pointer(&buf[PageHeaderSize]))
+	fmt.Fprintf(w, "Version: %d\n", m.version)
+	fmt.Fprintf(w, "Page Size: %d bytes\n", m.pageSize)
+	fmt.Fprintf(w, "Flags: %08x\n", m.flags)
+	fmt.Fprintf(w, "Root: <pgid=%d>\n", m.root.root)
+	fmt.Fprintf(w, "Freelist: <pgid=%d>\n", m.freelist)
+	fmt.Fprintf(w, "HWM: <pgid=%d>\n", m.pgid)
+	fmt.Fprintf(w, "Txn ID: %d\n", m.txid)
+	fmt.Fprintf(w, "Checksum: %016x\n", m.checksum)
+	fmt.Fprintf(w, "\n")
+	return nil
+}
+
+// PrintLeaf prints the data for a leaf page.
+func (cmd *PageCommand) PrintLeaf(w io.Writer, buf []byte) error {
+	p := (*page)(unsafe.Pointer(&buf[0]))
+
+	// Print number of items.
+	fmt.Fprintf(w, "Item Count: %d\n", p.count)
+	fmt.Fprintf(w, "\n")
+
+	// Print each key/value.
+	for i := uint16(0); i < p.count; i++ {
+		e := p.leafPageElement(i)
+
+		// Format key as string.
+		var k string
+		if isPrintable(string(e.key())) {
+			k = fmt.Sprintf("%q", string(e.key()))
+		} else {
+			k = fmt.Sprintf("%x", string(e.key()))
+		}
+
+		// Format value as string.
+		var v string
+		if (e.flags & uint32(bucketLeafFlag)) != 0 {
+			b := (*bucket)(unsafe.Pointer(&e.value()[0]))
+			v = fmt.Sprintf("<pgid=%d,seq=%d>", b.root, b.sequence)
+		} else if isPrintable(string(e.value())) {
+			v = fmt.Sprintf("%q", string(e.value()))
+		} else {
+			v = fmt.Sprintf("%x", string(e.value()))
+		}
+
+		fmt.Fprintf(w, "%s: %s\n", k, v)
+	}
+	fmt.Fprintf(w, "\n")
+	return nil
+}
+
+// PrintBranch prints the data for a branch page.
+func (cmd *PageCommand) PrintBranch(w io.Writer, buf []byte) error {
+	p := (*page)(unsafe.Pointer(&buf[0]))
+
+	// Print number of items.
+	fmt.Fprintf(w, "Item Count: %d\n", p.count)
+	fmt.Fprintf(w, "\n")
+
+	// Print each key/value.
+	for i := uint16(0); i < p.count; i++ {
+		e := p.branchPageElement(i)
+
+		// Format key as string.
+		var k string
+		if isPrintable(string(e.key())) {
+			k = fmt.Sprintf("%q", string(e.key()))
+		} else {
+			k = fmt.Sprintf("%x", string(e.key()))
+		}
+
+		fmt.Fprintf(w, "%s: <pgid=%d>\n", k, e.pgid)
+	}
+	fmt.Fprintf(w, "\n")
+	return nil
+}
+
+// PrintFreelist prints the data for a freelist page.
+func (cmd *PageCommand) PrintFreelist(w io.Writer, buf []byte) error {
+	p := (*page)(unsafe.Pointer(&buf[0]))
+
+	// Print number of items.
+	fmt.Fprintf(w, "Item Count: %d\n", p.count)
+	fmt.Fprintf(w, "\n")
+
+	// Print each page in the freelist.
+	ids := (*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr))
+	for i := uint16(0); i < p.count; i++ {
+		fmt.Fprintf(w, "%d\n", ids[i])
+	}
+	fmt.Fprintf(w, "\n")
+	return nil
+}
+
+// PrintPage prints a given page as hexadecimal.
+func (cmd *PageCommand) PrintPage(w io.Writer, r io.ReaderAt, pageID int, pageSize int) error {
+	const bytesPerLineN = 16
+
+	// Read page into buffer.
+	buf := make([]byte, pageSize)
+	addr := pageID * pageSize
+	if n, err := r.ReadAt(buf, int64(addr)); err != nil {
+		return err
+	} else if n != pageSize {
+		return io.ErrUnexpectedEOF
+	}
+
+	// Write out to writer in 16-byte lines.
+	var prev []byte
+	var skipped bool
+	for offset := 0; offset < pageSize; offset += bytesPerLineN {
+		// Retrieve current 16-byte line.
+		line := buf[offset : offset+bytesPerLineN]
+		isLastLine := (offset == (pageSize - bytesPerLineN))
+
+		// If it's the same as the previous line then print a skip.
+		if bytes.Equal(line, prev) && !isLastLine {
+			if !skipped {
+				fmt.Fprintf(w, "%07x *\n", addr+offset)
+				skipped = true
+			}
+		} else {
+			// Print line as hexadecimal in 2-byte groups.
+			fmt.Fprintf(w, "%07x %04x %04x %04x %04x %04x %04x %04x %04x\n", addr+offset,
+				line[0:2], line[2:4], line[4:6], line[6:8],
+				line[8:10], line[10:12], line[12:14], line[14:16],
+			)
+
+			skipped = false
+		}
+
+		// Save the previous line.
+		prev = line
+	}
+	fmt.Fprint(w, "\n")
+
+	return nil
+}
+
+// Usage returns the help message.
+func (cmd *PageCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt page PATH pageid [pageid...]
+
+Page prints one or more pages in human readable format.
+`, "\n")
+}
+
+// PagesCommand represents the "pages" command execution.
+type PagesCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newPagesCommand returns a PagesCommand.
+func newPagesCommand(m *Main) *PagesCommand {
+	return &PagesCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the command.
+func (cmd *PagesCommand) Run(args ...string) error {
+	// Parse flags.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	help := fs.Bool("h", false, "")
+	if err := fs.Parse(args); err != nil {
+		return err
+	} else if *help {
+		fmt.Fprintln(cmd.Stderr, cmd.Usage())
+		return ErrUsage
+	}
+
+	// Require database path.
+	path := fs.Arg(0)
+	if path == "" {
+		return ErrPathRequired
+	} else if _, err := os.Stat(path); os.IsNotExist(err) {
+		return ErrFileNotFound
+	}
+
+	// Open database.
+	db, err := bolt.Open(path, 0666, nil)
+	if err != nil {
+		return err
+	}
+	defer func() { _ = db.Close() }()
+
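+	// The result is a fixed-width table like the following (illustrative,
+	// not taken from a real database):
+	//
+	//   ID       TYPE       ITEMS  OVRFLW
+	//   ======== ========== ====== ======
+	//   0        meta       0
+	//   1        meta       0
+	//   2        freelist   2
+	//   3        leaf       7
+
+	// Write header.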
+	fmt.Fprintln(cmd.Stdout, "ID       TYPE       ITEMS  OVRFLW")
+	fmt.Fprintln(cmd.Stdout, "======== ========== ====== ======")
+
+	return db.Update(func(tx *bolt.Tx) error {
+		var id int
+		for {
+			p, err := tx.Page(id)
+			if err != nil {
+				return &PageError{ID: id, Err: err}
+			} else if p == nil {
+				break
+			}
+
+			// Only display count and overflow if this is a non-free page.
+			var count, overflow string
+			if p.Type != "free" {
+				count = strconv.Itoa(p.Count)
+				if p.OverflowCount > 0 {
+					overflow = strconv.Itoa(p.OverflowCount)
+				}
+			}
+
+			// Print table row.
+			fmt.Fprintf(cmd.Stdout, "%-8d %-10s %-6s %-6s\n", p.ID, p.Type, count, overflow)
+
+			// Move to the next non-overflow page.
+			id++
+			if p.Type != "free" {
+				id += p.OverflowCount
+			}
+		}
+		return nil
+	})
+}
+
+// Usage returns the help message.
+func (cmd *PagesCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt pages PATH
+
+Pages prints a table of pages with their type (meta, leaf, branch, freelist).
+Leaf and branch pages will show a key count in the "items" column while the
+freelist will show the number of free pages in the "items" column.
+
+The "overflow" column shows the number of blocks that the page spills over
+into. Normally there is no overflow but large keys and values can cause
+a single page to take up multiple blocks.
+`, "\n")
+}
+
+// StatsCommand represents the "stats" command execution.
+type StatsCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newStatsCommand returns a StatsCommand.
+func newStatsCommand(m *Main) *StatsCommand {
+	return &StatsCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the command.
+func (cmd *StatsCommand) Run(args ...string) error {
+	// Parse flags.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	help := fs.Bool("h", false, "")
+	if err := fs.Parse(args); err != nil {
+		return err
+	} else if *help {
+		fmt.Fprintln(cmd.Stderr, cmd.Usage())
+		return ErrUsage
+	}
+
+	// Require database path. An optional bucket name prefix may follow.
+	path, prefix := fs.Arg(0), fs.Arg(1)
+	if path == "" {
+		return ErrPathRequired
+	} else if _, err := os.Stat(path); os.IsNotExist(err) {
+		return ErrFileNotFound
+	}
+
+	// Open database.
+	db, err := bolt.Open(path, 0666, nil)
+	if err != nil {
+		return err
+	}
+	defer db.Close()
+
+	return db.View(func(tx *bolt.Tx) error {
+		var s bolt.BucketStats
+		var count int
+		if err := tx.ForEach(func(name []byte, b *bolt.Bucket) error {
+			if bytes.HasPrefix(name, []byte(prefix)) {
+				s.Add(b.Stats())
+				count++
+			}
+			return nil
+		}); err != nil {
+			return err
+		}
+
+		fmt.Fprintf(cmd.Stdout, "Aggregate statistics for %d buckets\n\n", count)
+
+		fmt.Fprintln(cmd.Stdout, "Page count statistics")
+		fmt.Fprintf(cmd.Stdout, "\tNumber of logical branch pages: %d\n", s.BranchPageN)
+		fmt.Fprintf(cmd.Stdout, "\tNumber of physical branch overflow pages: %d\n", s.BranchOverflowN)
+		fmt.Fprintf(cmd.Stdout, "\tNumber of logical leaf pages: %d\n", s.LeafPageN)
+		fmt.Fprintf(cmd.Stdout, "\tNumber of physical leaf overflow pages: %d\n", s.LeafOverflowN)
+
+		fmt.Fprintln(cmd.Stdout, "Tree statistics")
+		fmt.Fprintf(cmd.Stdout, "\tNumber of key/value pairs: %d\n", s.KeyN)
+		fmt.Fprintf(cmd.Stdout, "\tNumber of levels in B+tree: %d\n", s.Depth)
+
+		fmt.Fprintln(cmd.Stdout, "Page size utilization")
+		fmt.Fprintf(cmd.Stdout, "\tBytes allocated for physical branch pages: %d\n", s.BranchAlloc)
+		var percentage int
+		if s.BranchAlloc != 0 {
+			percentage = int(float32(s.BranchInuse) * 100.0 / float32(s.BranchAlloc))
+		}
+		fmt.Fprintf(cmd.Stdout, "\tBytes actually used for branch data: %d (%d%%)\n", s.BranchInuse, percentage)
+		fmt.Fprintf(cmd.Stdout, "\tBytes allocated for physical leaf pages: %d\n", s.LeafAlloc)
+		percentage = 0
+		if s.LeafAlloc != 0 {
+			percentage = int(float32(s.LeafInuse) * 100.0 / float32(s.LeafAlloc))
+		}
+		fmt.Fprintf(cmd.Stdout, "\tBytes actually used for leaf data: %d (%d%%)\n", s.LeafInuse, percentage)
+
+		fmt.Fprintln(cmd.Stdout, "Bucket statistics")
+		fmt.Fprintf(cmd.Stdout, "\tTotal number of buckets: %d\n", s.BucketN)
+		percentage = 0
+		if s.BucketN != 0 {
+			percentage = int(float32(s.InlineBucketN) * 100.0 / float32(s.BucketN))
+		}
+		fmt.Fprintf(cmd.Stdout, "\tTotal number of inlined buckets: %d (%d%%)\n", s.InlineBucketN, percentage)
+		percentage = 0
+		if s.LeafInuse != 0 {
+			percentage = int(float32(s.InlineBucketInuse) * 100.0 / float32(s.LeafInuse))
+		}
+		fmt.Fprintf(cmd.Stdout, "\tBytes used for inlined buckets: %d (%d%%)\n", s.InlineBucketInuse, percentage)
+
+		return nil
+	})
+}
+
+// Usage returns the help message.
+func (cmd *StatsCommand) Usage() string {
+	return strings.TrimLeft(`
+usage: bolt stats PATH [PREFIX]
+
+Stats aggregates statistics across all buckets in the Bolt database at PATH
+and prints a report covering page counts, B+tree depth, page size
+utilization, and bucket statistics.
+
+If the optional PREFIX is given, only buckets whose names begin with PREFIX
+are included in the aggregate.
+`, "\n")
+}
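+var benchBucketName = []byte("bench")
+
+// An illustrative bench session (flag names match those parsed in ParseFlags
+// below; the timing numbers are made up):
+//
+//   $ bolt bench -count 10000 -write-mode rnd -value-size 64
+//   # Write	12.82ms	(1.282µs/op)	(780031 op/sec)
+//   # Read	1.000s	(97ns/op)	(10309278 op/sec)
+
+// BenchCommand represents the "bench" command execution.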
+type BenchCommand struct {
+	Stdin  io.Reader
+	Stdout io.Writer
+	Stderr io.Writer
+}
+
+// newBenchCommand returns a BenchCommand using the given Main's
+// input and output streams.
+func newBenchCommand(m *Main) *BenchCommand {
+	return &BenchCommand{
+		Stdin:  m.Stdin,
+		Stdout: m.Stdout,
+		Stderr: m.Stderr,
+	}
+}
+
+// Run executes the "bench" command.
+func (cmd *BenchCommand) Run(args ...string) error {
+	// Parse CLI arguments.
+	options, err := cmd.ParseFlags(args)
+	if err != nil {
+		return err
+	}
+
+	// Remove path if "-work" is not set. Otherwise keep path.
+	if options.Work {
+		fmt.Fprintf(cmd.Stdout, "work: %s\n", options.Path)
+	} else {
+		defer os.Remove(options.Path)
+	}
+
+	// Create database.
+	db, err := bolt.Open(options.Path, 0666, nil)
+	if err != nil {
+		return err
+	}
+	db.NoSync = options.NoSync
+	defer db.Close()
+
+	// Write to the database.
+	var results BenchResults
+	if err := cmd.runWrites(db, options, &results); err != nil {
+		return fmt.Errorf("bench: write: %s", err)
+	}
+
+	// Read from the database.
+	if err := cmd.runReads(db, options, &results); err != nil {
+		return fmt.Errorf("bench: read: %s", err)
+	}
+
+	// Print results.
+	fmt.Fprintf(cmd.Stderr, "# Write\t%v\t(%v/op)\t(%v op/sec)\n", results.WriteDuration, results.WriteOpDuration(), results.WriteOpsPerSecond())
+	fmt.Fprintf(cmd.Stderr, "# Read\t%v\t(%v/op)\t(%v op/sec)\n", results.ReadDuration, results.ReadOpDuration(), results.ReadOpsPerSecond())
+	fmt.Fprintln(cmd.Stderr, "")
+	return nil
+}
+
+// ParseFlags parses the command line flags.
+func (cmd *BenchCommand) ParseFlags(args []string) (*BenchOptions, error) {
+	var options BenchOptions
+
+	// Parse flagset.
+	fs := flag.NewFlagSet("", flag.ContinueOnError)
+	fs.StringVar(&options.ProfileMode, "profile-mode", "rw", "")
+	fs.StringVar(&options.WriteMode, "write-mode", "seq", "")
+	fs.StringVar(&options.ReadMode, "read-mode", "seq", "")
+	fs.IntVar(&options.Iterations, "count", 1000, "")
+	fs.IntVar(&options.BatchSize, "batch-size", 0, "")
+	fs.IntVar(&options.KeySize, "key-size", 8, "")
+	fs.IntVar(&options.ValueSize, "value-size", 32, "")
+	fs.StringVar(&options.CPUProfile, "cpuprofile", "", "")
+	fs.StringVar(&options.MemProfile, "memprofile", "", "")
+	fs.StringVar(&options.BlockProfile, "blockprofile", "", "")
+	fs.Float64Var(&options.FillPercent, "fill-percent", bolt.DefaultFillPercent, "")
+	fs.BoolVar(&options.NoSync, "no-sync", false, "")
+	fs.BoolVar(&options.Work, "work", false, "")
+	fs.StringVar(&options.Path, "path", "", "")
+	fs.SetOutput(cmd.Stderr)
+	if err := fs.Parse(args); err != nil {
+		return nil, err
+	}
+
+	// Set batch size to iteration size if not set.
+	// Require that the iteration count be evenly divisible by the batch size.
+	if options.BatchSize == 0 {
+		options.BatchSize = options.Iterations
+	} else if options.Iterations%options.BatchSize != 0 {
+		return nil, ErrNonDivisibleBatchSize
+	}
+
+	// Generate temp path if one is not passed in.
+	if options.Path == "" {
+		f, err := ioutil.TempFile("", "bolt-bench-")
+		if err != nil {
+			return nil, fmt.Errorf("temp file: %s", err)
+		}
+		f.Close()
+		os.Remove(f.Name())
+		options.Path = f.Name()
+	}
+
+	return &options, nil
+}
+
+// Writes to the database.
+func (cmd *BenchCommand) runWrites(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	// Start profiling for writes.
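+	// ProfileMode selects which phases get profiled: "rw" covers both writes
+	// and reads, "w" covers writes only (stopped below once the writes
+	// finish), and "r" covers reads only (started later in runReads).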
+	if options.ProfileMode == "rw" || options.ProfileMode == "w" {
+		cmd.startProfiling(options)
+	}
+
+	t := time.Now()
+
+	var err error
+	switch options.WriteMode {
+	case "seq":
+		err = cmd.runWritesSequential(db, options, results)
+	case "rnd":
+		err = cmd.runWritesRandom(db, options, results)
+	case "seq-nest":
+		err = cmd.runWritesSequentialNested(db, options, results)
+	case "rnd-nest":
+		err = cmd.runWritesRandomNested(db, options, results)
+	default:
+		return fmt.Errorf("invalid write mode: %s", options.WriteMode)
+	}
+
+	// Save time to write.
+	results.WriteDuration = time.Since(t)
+
+	// Stop profiling for writes only.
+	if options.ProfileMode == "w" {
+		cmd.stopProfiling()
+	}
+
+	return err
+}
+
+func (cmd *BenchCommand) runWritesSequential(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	var i = uint32(0)
+	return cmd.runWritesWithSource(db, options, results, func() uint32 { i++; return i })
+}
+
+func (cmd *BenchCommand) runWritesRandom(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	r := rand.New(rand.NewSource(time.Now().UnixNano()))
+	return cmd.runWritesWithSource(db, options, results, func() uint32 { return r.Uint32() })
+}
+
+func (cmd *BenchCommand) runWritesSequentialNested(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	var i = uint32(0)
+	return cmd.runWritesNestedWithSource(db, options, results, func() uint32 { i++; return i })
+}
+
+func (cmd *BenchCommand) runWritesRandomNested(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	r := rand.New(rand.NewSource(time.Now().UnixNano()))
+	return cmd.runWritesNestedWithSource(db, options, results, func() uint32 { return r.Uint32() })
+}
+
+func (cmd *BenchCommand) runWritesWithSource(db *bolt.DB, options *BenchOptions, results *BenchResults, keySource func() uint32) error {
+	results.WriteOps = options.Iterations
+
+	for i := 0; i < options.Iterations; i += options.BatchSize {
+		if err := db.Update(func(tx *bolt.Tx) error {
+			b, err := tx.CreateBucketIfNotExists(benchBucketName)
+			if err != nil {
+				return err
+			}
+			b.FillPercent = options.FillPercent
+
+			for j := 0; j < options.BatchSize; j++ {
+				key := make([]byte, options.KeySize)
+				value := make([]byte, options.ValueSize)
+
+				// Write key as uint32.
+				binary.BigEndian.PutUint32(key, keySource())
+
+				// Insert key/value.
+				if err := b.Put(key, value); err != nil {
+					return err
+				}
+			}
+
+			return nil
+		}); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (cmd *BenchCommand) runWritesNestedWithSource(db *bolt.DB, options *BenchOptions, results *BenchResults, keySource func() uint32) error {
+	results.WriteOps = options.Iterations
+
+	for i := 0; i < options.Iterations; i += options.BatchSize {
+		if err := db.Update(func(tx *bolt.Tx) error {
+			top, err := tx.CreateBucketIfNotExists(benchBucketName)
+			if err != nil {
+				return err
+			}
+			top.FillPercent = options.FillPercent
+
+			// Create bucket key.
+			name := make([]byte, options.KeySize)
+			binary.BigEndian.PutUint32(name, keySource())
+
+			// Create bucket.
+			b, err := top.CreateBucketIfNotExists(name)
+			if err != nil {
+				return err
+			}
+			b.FillPercent = options.FillPercent
+
+			for j := 0; j < options.BatchSize; j++ {
+				var key = make([]byte, options.KeySize)
+				var value = make([]byte, options.ValueSize)
+
+				// Generate key as uint32.
+				binary.BigEndian.PutUint32(key, keySource())
+
+				// Insert value into subbucket.
+				if err := b.Put(key, value); err != nil {
+					return err
+				}
+			}
+
+			return nil
+		}); err != nil {
+			return err
+		}
+	}
+	return nil
+}
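+// Keys are encoded as big-endian uint32s so byte order matches numeric
+// order; e.g. (hypothetical values):
+//
+//   binary.BigEndian.PutUint32(key, 1) // -> 00 00 00 01
+//   binary.BigEndian.PutUint32(key, 2) // -> 00 00 00 02
+//
+// which makes the "seq" modes insert keys in Bolt's sorted key order.
+
+// Reads from the database.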
+func (cmd *BenchCommand) runReads(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	// Start profiling for reads.
+	if options.ProfileMode == "r" {
+		cmd.startProfiling(options)
+	}
+
+	t := time.Now()
+
+	var err error
+	switch options.ReadMode {
+	case "seq":
+		switch options.WriteMode {
+		case "seq-nest", "rnd-nest":
+			err = cmd.runReadsSequentialNested(db, options, results)
+		default:
+			err = cmd.runReadsSequential(db, options, results)
+		}
+	default:
+		return fmt.Errorf("invalid read mode: %s", options.ReadMode)
+	}
+
+	// Save read time.
+	results.ReadDuration = time.Since(t)
+
+	// Stop profiling for reads.
+	if options.ProfileMode == "rw" || options.ProfileMode == "r" {
+		cmd.stopProfiling()
+	}
+
+	return err
+}
+
+func (cmd *BenchCommand) runReadsSequential(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	return db.View(func(tx *bolt.Tx) error {
+		t := time.Now()
+
+		for {
+			var count int
+
+			c := tx.Bucket(benchBucketName).Cursor()
+			for k, v := c.First(); k != nil; k, v = c.Next() {
+				if v == nil {
+					return errors.New("invalid value")
+				}
+				count++
+			}
+
+			if options.WriteMode == "seq" && count != options.Iterations {
+				return fmt.Errorf("read seq: iter mismatch: expected %d, got %d", options.Iterations, count)
+			}
+
+			results.ReadOps += count
+
+			// Make sure we do this for at least a second.
+			if time.Since(t) >= time.Second {
+				break
+			}
+		}
+
+		return nil
+	})
+}
+
+func (cmd *BenchCommand) runReadsSequentialNested(db *bolt.DB, options *BenchOptions, results *BenchResults) error {
+	return db.View(func(tx *bolt.Tx) error {
+		t := time.Now()
+
+		for {
+			var count int
+			var top = tx.Bucket(benchBucketName)
+			if err := top.ForEach(func(name, _ []byte) error {
+				c := top.Bucket(name).Cursor()
+				for k, v := c.First(); k != nil; k, v = c.Next() {
+					if v == nil {
+						return ErrInvalidValue
+					}
+					count++
+				}
+				return nil
+			}); err != nil {
+				return err
+			}
+
+			if options.WriteMode == "seq-nest" && count != options.Iterations {
+				return fmt.Errorf("read seq-nest: iter mismatch: expected %d, got %d", options.Iterations, count)
+			}
+
+			results.ReadOps += count
+
+			// Make sure we do this for at least a second.
+			if time.Since(t) >= time.Second {
+				break
+			}
+		}
+
+		return nil
+	})
+}
+
+// File handlers for the various profiles.
+var cpuprofile, memprofile, blockprofile *os.File
+
+// Starts all profiles set on the options.
+func (cmd *BenchCommand) startProfiling(options *BenchOptions) {
+	var err error
+
+	// Start CPU profiling.
+	if options.CPUProfile != "" {
+		cpuprofile, err = os.Create(options.CPUProfile)
+		if err != nil {
+			fmt.Fprintf(cmd.Stderr, "bench: could not create cpu profile %q: %v\n", options.CPUProfile, err)
+			os.Exit(1)
+		}
+		pprof.StartCPUProfile(cpuprofile)
+	}
+
+	// Start memory profiling.
+	if options.MemProfile != "" {
+		memprofile, err = os.Create(options.MemProfile)
+		if err != nil {
+			fmt.Fprintf(cmd.Stderr, "bench: could not create memory profile %q: %v\n", options.MemProfile, err)
+			os.Exit(1)
+		}
+		runtime.MemProfileRate = 4096
+	}
+
+	// Start block profiling.
+	if options.BlockProfile != "" {
+		blockprofile, err = os.Create(options.BlockProfile)
+		if err != nil {
+			fmt.Fprintf(cmd.Stderr, "bench: could not create block profile %q: %v\n", options.BlockProfile, err)
+			os.Exit(1)
+		}
+		runtime.SetBlockProfileRate(1)
+	}
+}
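+// The resulting profile files can be inspected with the standard pprof
+// tooling, e.g. (paths are illustrative):
+//
+//   $ bolt bench -count 100000 -cpuprofile cpu.out
+//   $ go tool pprof bin/bolt cpu.out
+
+// Stops all profiles.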
+func (cmd *BenchCommand) stopProfiling() {
+	if cpuprofile != nil {
+		pprof.StopCPUProfile()
+		cpuprofile.Close()
+		cpuprofile = nil
+	}
+
+	if memprofile != nil {
+		pprof.Lookup("heap").WriteTo(memprofile, 0)
+		memprofile.Close()
+		memprofile = nil
+	}
+
+	if blockprofile != nil {
+		pprof.Lookup("block").WriteTo(blockprofile, 0)
+		blockprofile.Close()
+		blockprofile = nil
+		runtime.SetBlockProfileRate(0)
+	}
+}
+
+// BenchOptions represents the set of options that can be passed to "bolt bench".
+type BenchOptions struct {
+	ProfileMode   string
+	WriteMode     string
+	ReadMode      string
+	Iterations    int
+	BatchSize     int
+	KeySize       int
+	ValueSize     int
+	CPUProfile    string
+	MemProfile    string
+	BlockProfile  string
+	StatsInterval time.Duration
+	FillPercent   float64
+	NoSync        bool
+	Work          bool
+	Path          string
+}
+
+// BenchResults represents the performance results of the benchmark.
+type BenchResults struct {
+	WriteOps      int
+	WriteDuration time.Duration
+	ReadOps       int
+	ReadDuration  time.Duration
+}
+
+// WriteOpDuration returns the duration for a single write operation.
+func (r *BenchResults) WriteOpDuration() time.Duration {
+	if r.WriteOps == 0 {
+		return 0
+	}
+	return r.WriteDuration / time.Duration(r.WriteOps)
+}
+
+// WriteOpsPerSecond returns the average number of write operations that can be performed per second.
+func (r *BenchResults) WriteOpsPerSecond() int {
+	var op = r.WriteOpDuration()
+	if op == 0 {
+		return 0
+	}
+	return int(time.Second) / int(op)
+}
+
+// ReadOpDuration returns the duration for a single read operation.
+func (r *BenchResults) ReadOpDuration() time.Duration {
+	if r.ReadOps == 0 {
+		return 0
+	}
+	return r.ReadDuration / time.Duration(r.ReadOps)
+}
+
+// ReadOpsPerSecond returns the average number of read operations that can be performed per second.
+func (r *BenchResults) ReadOpsPerSecond() int {
+	var op = r.ReadOpDuration()
+	if op == 0 {
+		return 0
+	}
+	return int(time.Second) / int(op)
+}
+
+// PageError represents an error accessing a specific page.
+type PageError struct {
+	ID  int
+	Err error
+}
+
+func (e *PageError) Error() string {
+	return fmt.Sprintf("page error: id=%d, err=%s", e.ID, e.Err)
+}
+
+// isPrintable returns true if the string is valid UTF-8 and contains only printable runes.
+func isPrintable(s string) bool {
+	if !utf8.ValidString(s) {
+		return false
+	}
+	for _, ch := range s {
+		if !unicode.IsPrint(ch) {
+			return false
+		}
+	}
+	return true
+}
+
+// ReadPage reads page info & full page data from a path.
+// This is not transactionally safe.
+func ReadPage(path string, pageID int) (*page, []byte, error) {
+	// Find page size.
+	pageSize, err := ReadPageSize(path)
+	if err != nil {
+		return nil, nil, fmt.Errorf("read page size: %s", err)
+	}
+
+	// Open database file.
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, nil, err
+	}
+	defer f.Close()
+
+	// Read one block into buffer.
+	buf := make([]byte, pageSize)
+	if n, err := f.ReadAt(buf, int64(pageID*pageSize)); err != nil {
+		return nil, nil, err
+	} else if n != len(buf) {
+		return nil, nil, io.ErrUnexpectedEOF
+	}
+
+	// Determine total number of blocks.
+	p := (*page)(unsafe.Pointer(&buf[0]))
+	overflowN := p.overflow
+
+	// Re-read entire page (with overflow) into buffer.
+	buf = make([]byte, (int(overflowN)+1)*pageSize)
+	if n, err := f.ReadAt(buf, int64(pageID*pageSize)); err != nil {
+		return nil, nil, err
+	} else if n != len(buf) {
+		return nil, nil, io.ErrUnexpectedEOF
+	}
+	p = (*page)(unsafe.Pointer(&buf[0]))
+
+	return p, buf, nil
+}
+
+// ReadPageSize reads the page size from the database file at the given path.
+// This is not transactionally safe.
+func ReadPageSize(path string) (int, error) {
+	// Open database file.
+ f, err := os.Open(path) + if err != nil { + return 0, err + } + defer f.Close() + + // Read 4KB chunk. + buf := make([]byte, 4096) + if _, err := io.ReadFull(f, buf); err != nil { + return 0, err + } + + // Read page size from metadata. + m := (*meta)(unsafe.Pointer(&buf[PageHeaderSize])) + return int(m.pageSize), nil +} + +// atois parses a slice of strings into integers. +func atois(strs []string) ([]int, error) { + var a []int + for _, str := range strs { + i, err := strconv.Atoi(str) + if err != nil { + return nil, err + } + a = append(a, i) + } + return a, nil +} + +// DO NOT EDIT. Copied from the "bolt" package. +const maxAllocSize = 0xFFFFFFF + +// DO NOT EDIT. Copied from the "bolt" package. +const ( + branchPageFlag = 0x01 + leafPageFlag = 0x02 + metaPageFlag = 0x04 + freelistPageFlag = 0x10 +) + +// DO NOT EDIT. Copied from the "bolt" package. +const bucketLeafFlag = 0x01 + +// DO NOT EDIT. Copied from the "bolt" package. +type pgid uint64 + +// DO NOT EDIT. Copied from the "bolt" package. +type txid uint64 + +// DO NOT EDIT. Copied from the "bolt" package. +type meta struct { + magic uint32 + version uint32 + pageSize uint32 + flags uint32 + root bucket + freelist pgid + pgid pgid + txid txid + checksum uint64 +} + +// DO NOT EDIT. Copied from the "bolt" package. +type bucket struct { + root pgid + sequence uint64 +} + +// DO NOT EDIT. Copied from the "bolt" package. +type page struct { + id pgid + flags uint16 + count uint16 + overflow uint32 + ptr uintptr +} + +// DO NOT EDIT. Copied from the "bolt" package. +func (p *page) Type() string { + if (p.flags & branchPageFlag) != 0 { + return "branch" + } else if (p.flags & leafPageFlag) != 0 { + return "leaf" + } else if (p.flags & metaPageFlag) != 0 { + return "meta" + } else if (p.flags & freelistPageFlag) != 0 { + return "freelist" + } + return fmt.Sprintf("unknown<%02x>", p.flags) +} + +// DO NOT EDIT. Copied from the "bolt" package. +func (p *page) leafPageElement(index uint16) *leafPageElement { + n := &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index] + return n +} + +// DO NOT EDIT. Copied from the "bolt" package. +func (p *page) branchPageElement(index uint16) *branchPageElement { + return &((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[index] +} + +// DO NOT EDIT. Copied from the "bolt" package. +type branchPageElement struct { + pos uint32 + ksize uint32 + pgid pgid +} + +// DO NOT EDIT. Copied from the "bolt" package. +func (n *branchPageElement) key() []byte { + buf := (*[maxAllocSize]byte)(unsafe.Pointer(n)) + return buf[n.pos : n.pos+n.ksize] +} + +// DO NOT EDIT. Copied from the "bolt" package. +type leafPageElement struct { + flags uint32 + pos uint32 + ksize uint32 + vsize uint32 +} + +// DO NOT EDIT. Copied from the "bolt" package. +func (n *leafPageElement) key() []byte { + buf := (*[maxAllocSize]byte)(unsafe.Pointer(n)) + return buf[n.pos : n.pos+n.ksize] +} + +// DO NOT EDIT. Copied from the "bolt" package. 
+func (n *leafPageElement) value() []byte {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
+	return buf[n.pos+n.ksize : n.pos+n.ksize+n.vsize]
+}
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/main_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/main_test.go
new file mode 100644
index 000000000..6b05ac23d
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/main_test.go
@@ -0,0 +1,145 @@
+package main_test
+
+import (
+	"bytes"
+	"io/ioutil"
+	"os"
+	"strconv"
+	"testing"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt"
+)
+
+// Ensure the "info" command can print information about a database.
+func TestInfoCommand_Run(t *testing.T) {
+	db := MustOpen(0666, nil)
+	db.DB.Close()
+	defer db.Close()
+
+	// Run the info command.
+	m := NewMain()
+	if err := m.Run("info", db.Path); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Ensure the "stats" command can execute correctly.
+func TestStatsCommand_Run(t *testing.T) {
+	// Skip if the system page size is not 4KB since the expected output
+	// below depends on it.
+	if os.Getpagesize() != 4096 {
+		t.Skip("system does not use 4KB page size")
+	}
+
+	db := MustOpen(0666, nil)
+	defer db.Close()
+
+	if err := db.Update(func(tx *bolt.Tx) error {
+		// Create "foo" bucket.
+		b, err := tx.CreateBucket([]byte("foo"))
+		if err != nil {
+			return err
+		}
+		for i := 0; i < 10; i++ {
+			if err := b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))); err != nil {
+				return err
+			}
+		}
+
+		// Create "bar" bucket.
+		b, err = tx.CreateBucket([]byte("bar"))
+		if err != nil {
+			return err
+		}
+		for i := 0; i < 100; i++ {
+			if err := b.Put([]byte(strconv.Itoa(i)), []byte(strconv.Itoa(i))); err != nil {
+				return err
+			}
+		}
+
+		// Create "baz" bucket.
+		b, err = tx.CreateBucket([]byte("baz"))
+		if err != nil {
+			return err
+		}
+		if err := b.Put([]byte("key"), []byte("value")); err != nil {
+			return err
+		}
+
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+	db.DB.Close()
+
+	// Generate expected result.
+	exp := "Aggregate statistics for 3 buckets\n\n" +
+		"Page count statistics\n" +
+		"\tNumber of logical branch pages: 0\n" +
+		"\tNumber of physical branch overflow pages: 0\n" +
+		"\tNumber of logical leaf pages: 1\n" +
+		"\tNumber of physical leaf overflow pages: 0\n" +
+		"Tree statistics\n" +
+		"\tNumber of key/value pairs: 111\n" +
+		"\tNumber of levels in B+tree: 1\n" +
+		"Page size utilization\n" +
+		"\tBytes allocated for physical branch pages: 0\n" +
+		"\tBytes actually used for branch data: 0 (0%)\n" +
+		"\tBytes allocated for physical leaf pages: 4096\n" +
+		"\tBytes actually used for leaf data: 1996 (48%)\n" +
+		"Bucket statistics\n" +
+		"\tTotal number of buckets: 3\n" +
+		"\tTotal number of inlined buckets: 2 (66%)\n" +
+		"\tBytes used for inlined buckets: 236 (11%)\n"
+
+	// Run the command.
+	m := NewMain()
+	if err := m.Run("stats", db.Path); err != nil {
+		t.Fatal(err)
+	} else if m.Stdout.String() != exp {
+		t.Fatalf("unexpected stdout:\n\n%s", m.Stdout.String())
+	}
+}
+
+// Main represents a test wrapper for main.Main that records output.
+type Main struct {
+	*main.Main
+	Stdin  bytes.Buffer
+	Stdout bytes.Buffer
+	Stderr bytes.Buffer
+}
+
+// NewMain returns a new instance of Main.
+func NewMain() *Main {
+	m := &Main{Main: main.NewMain()}
+	m.Main.Stdin = &m.Stdin
+	m.Main.Stdout = &m.Stdout
+	m.Main.Stderr = &m.Stderr
+	return m
+}
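+// A typical flow with these wrappers (mirroring the tests above):
+//
+//   m := NewMain()
+//   db := MustOpen(0666, nil)
+//   defer db.Close()
+//   if err := m.Run("info", db.Path); err != nil {
+//   	t.Fatal(err)
+//   }
+//   out := m.Stdout.String()
+
+// MustOpen creates a Bolt database in a temporary location.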
+func MustOpen(mode os.FileMode, options *bolt.Options) *DB { + // Create temporary path. + f, _ := ioutil.TempFile("", "bolt-") + f.Close() + os.Remove(f.Name()) + + db, err := bolt.Open(f.Name(), mode, options) + if err != nil { + panic(err.Error()) + } + return &DB{DB: db, Path: f.Name()} +} + +// DB is a test wrapper for bolt.DB. +type DB struct { + *bolt.DB + Path string +} + +// Close closes and removes the database. +func (db *DB) Close() error { + defer os.Remove(db.Path) + return db.DB.Close() +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/p.out b/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/p.out new file mode 100644 index 000000000..ace9346e7 Binary files /dev/null and b/Godeps/_workspace/src/github.com/boltdb/bolt/cmd/bolt/p.out differ diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/cursor.go b/Godeps/_workspace/src/github.com/boltdb/bolt/cursor.go new file mode 100644 index 000000000..006c54889 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/cursor.go @@ -0,0 +1,384 @@ +package bolt + +import ( + "bytes" + "fmt" + "sort" +) + +// Cursor represents an iterator that can traverse over all key/value pairs in a bucket in sorted order. +// Cursors see nested buckets with value == nil. +// Cursors can be obtained from a transaction and are valid as long as the transaction is open. +// +// Keys and values returned from the cursor are only valid for the life of the transaction. +// +// Changing data while traversing with a cursor may cause it to be invalidated +// and return unexpected keys and/or values. You must reposition your cursor +// after mutating data. +type Cursor struct { + bucket *Bucket + stack []elemRef +} + +// Bucket returns the bucket that this cursor was created from. +func (c *Cursor) Bucket() *Bucket { + return c.bucket +} + +// First moves the cursor to the first item in the bucket and returns its key and value. +// If the bucket is empty then a nil key and value are returned. +// The returned key and value are only valid for the life of the transaction. +func (c *Cursor) First() (key []byte, value []byte) { + _assert(c.bucket.tx.db != nil, "tx closed") + c.stack = c.stack[:0] + p, n := c.bucket.pageNode(c.bucket.root) + c.stack = append(c.stack, elemRef{page: p, node: n, index: 0}) + c.first() + k, v, flags := c.keyValue() + if (flags & uint32(bucketLeafFlag)) != 0 { + return k, nil + } + return k, v + +} + +// Last moves the cursor to the last item in the bucket and returns its key and value. +// If the bucket is empty then a nil key and value are returned. +// The returned key and value are only valid for the life of the transaction. +func (c *Cursor) Last() (key []byte, value []byte) { + _assert(c.bucket.tx.db != nil, "tx closed") + c.stack = c.stack[:0] + p, n := c.bucket.pageNode(c.bucket.root) + ref := elemRef{page: p, node: n} + ref.index = ref.count() - 1 + c.stack = append(c.stack, ref) + c.last() + k, v, flags := c.keyValue() + if (flags & uint32(bucketLeafFlag)) != 0 { + return k, nil + } + return k, v +} + +// Next moves the cursor to the next item in the bucket and returns its key and value. +// If the cursor is at the end of the bucket then a nil key and value are returned. +// The returned key and value are only valid for the life of the transaction. 
+func (c *Cursor) Next() (key []byte, value []byte) { + _assert(c.bucket.tx.db != nil, "tx closed") + k, v, flags := c.next() + if (flags & uint32(bucketLeafFlag)) != 0 { + return k, nil + } + return k, v +} + +// Prev moves the cursor to the previous item in the bucket and returns its key and value. +// If the cursor is at the beginning of the bucket then a nil key and value are returned. +// The returned key and value are only valid for the life of the transaction. +func (c *Cursor) Prev() (key []byte, value []byte) { + _assert(c.bucket.tx.db != nil, "tx closed") + + // Attempt to move back one element until we're successful. + // Move up the stack as we hit the beginning of each page in our stack. + for i := len(c.stack) - 1; i >= 0; i-- { + elem := &c.stack[i] + if elem.index > 0 { + elem.index-- + break + } + c.stack = c.stack[:i] + } + + // If we've hit the end then return nil. + if len(c.stack) == 0 { + return nil, nil + } + + // Move down the stack to find the last element of the last leaf under this branch. + c.last() + k, v, flags := c.keyValue() + if (flags & uint32(bucketLeafFlag)) != 0 { + return k, nil + } + return k, v +} + +// Seek moves the cursor to a given key and returns it. +// If the key does not exist then the next key is used. If no keys +// follow, a nil key is returned. +// The returned key and value are only valid for the life of the transaction. +func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) { + k, v, flags := c.seek(seek) + + // If we ended up after the last element of a page then move to the next one. + if ref := &c.stack[len(c.stack)-1]; ref.index >= ref.count() { + k, v, flags = c.next() + } + + if k == nil { + return nil, nil + } else if (flags & uint32(bucketLeafFlag)) != 0 { + return k, nil + } + return k, v +} + +// Delete removes the current key/value under the cursor from the bucket. +// Delete fails if current key/value is a bucket or if the transaction is not writable. +func (c *Cursor) Delete() error { + if c.bucket.tx.db == nil { + return ErrTxClosed + } else if !c.bucket.Writable() { + return ErrTxNotWritable + } + + key, _, flags := c.keyValue() + // Return an error if current value is a bucket. + if (flags & bucketLeafFlag) != 0 { + return ErrIncompatibleValue + } + c.node().del(key) + + return nil +} + +// seek moves the cursor to a given key and returns it. +// If the key does not exist then the next key is used. +func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) { + _assert(c.bucket.tx.db != nil, "tx closed") + + // Start from root page/node and traverse to correct page. + c.stack = c.stack[:0] + c.search(seek, c.bucket.root) + ref := &c.stack[len(c.stack)-1] + + // If the cursor is pointing to the end of page/node then return nil. + if ref.index >= ref.count() { + return nil, nil, 0 + } + + // If this is a bucket then return a nil value. + return c.keyValue() +} + +// first moves the cursor to the first leaf element under the last page in the stack. +func (c *Cursor) first() { + for { + // Exit when we hit a leaf page. + var ref = &c.stack[len(c.stack)-1] + if ref.isLeaf() { + break + } + + // Keep adding pages pointing to the first element to the stack. + var pgid pgid + if ref.node != nil { + pgid = ref.node.inodes[ref.index].pgid + } else { + pgid = ref.page.branchPageElement(uint16(ref.index)).pgid + } + p, n := c.bucket.pageNode(pgid) + c.stack = append(c.stack, elemRef{page: p, node: n, index: 0}) + } +} + +// last moves the cursor to the last leaf element under the last page in the stack. 
+func (c *Cursor) last() { + for { + // Exit when we hit a leaf page. + ref := &c.stack[len(c.stack)-1] + if ref.isLeaf() { + break + } + + // Keep adding pages pointing to the last element in the stack. + var pgid pgid + if ref.node != nil { + pgid = ref.node.inodes[ref.index].pgid + } else { + pgid = ref.page.branchPageElement(uint16(ref.index)).pgid + } + p, n := c.bucket.pageNode(pgid) + + var nextRef = elemRef{page: p, node: n} + nextRef.index = nextRef.count() - 1 + c.stack = append(c.stack, nextRef) + } +} + +// next moves to the next leaf element and returns the key and value. +// If the cursor is at the last leaf element then it stays there and returns nil. +func (c *Cursor) next() (key []byte, value []byte, flags uint32) { + // Attempt to move over one element until we're successful. + // Move up the stack as we hit the end of each page in our stack. + var i int + for i = len(c.stack) - 1; i >= 0; i-- { + elem := &c.stack[i] + if elem.index < elem.count()-1 { + elem.index++ + break + } + } + + // If we've hit the root page then stop and return. This will leave the + // cursor on the last element of the last page. + if i == -1 { + return nil, nil, 0 + } + + // Otherwise start from where we left off in the stack and find the + // first element of the first leaf page. + c.stack = c.stack[:i+1] + c.first() + return c.keyValue() +} + +// search recursively performs a binary search against a given page/node until it finds a given key. +func (c *Cursor) search(key []byte, pgid pgid) { + p, n := c.bucket.pageNode(pgid) + if p != nil && (p.flags&(branchPageFlag|leafPageFlag)) == 0 { + panic(fmt.Sprintf("invalid page type: %d: %x", p.id, p.flags)) + } + e := elemRef{page: p, node: n} + c.stack = append(c.stack, e) + + // If we're on a leaf page/node then find the specific node. + if e.isLeaf() { + c.nsearch(key) + return + } + + if n != nil { + c.searchNode(key, n) + return + } + c.searchPage(key, p) +} + +func (c *Cursor) searchNode(key []byte, n *node) { + var exact bool + index := sort.Search(len(n.inodes), func(i int) bool { + // TODO(benbjohnson): Optimize this range search. It's a bit hacky right now. + // sort.Search() finds the lowest index where f() != -1 but we need the highest index. + ret := bytes.Compare(n.inodes[i].key, key) + if ret == 0 { + exact = true + } + return ret != -1 + }) + if !exact && index > 0 { + index-- + } + c.stack[len(c.stack)-1].index = index + + // Recursively search to the next page. + c.search(key, n.inodes[index].pgid) +} + +func (c *Cursor) searchPage(key []byte, p *page) { + // Binary search for the correct range. + inodes := p.branchPageElements() + + var exact bool + index := sort.Search(int(p.count), func(i int) bool { + // TODO(benbjohnson): Optimize this range search. It's a bit hacky right now. + // sort.Search() finds the lowest index where f() != -1 but we need the highest index. + ret := bytes.Compare(inodes[i].key(), key) + if ret == 0 { + exact = true + } + return ret != -1 + }) + if !exact && index > 0 { + index-- + } + c.stack[len(c.stack)-1].index = index + + // Recursively search to the next page. + c.search(key, inodes[index].pgid) +} + +// nsearch searches the leaf node on the top of the stack for a key. +func (c *Cursor) nsearch(key []byte) { + e := &c.stack[len(c.stack)-1] + p, n := e.page, e.node + + // If we have a node then search its inodes. 
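+	// sort.Search returns the first index at which the key is >= the search
+	// key, so the cursor lands on an exact match when one exists and on the
+	// next key otherwise.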
+ if n != nil { + index := sort.Search(len(n.inodes), func(i int) bool { + return bytes.Compare(n.inodes[i].key, key) != -1 + }) + e.index = index + return + } + + // If we have a page then search its leaf elements. + inodes := p.leafPageElements() + index := sort.Search(int(p.count), func(i int) bool { + return bytes.Compare(inodes[i].key(), key) != -1 + }) + e.index = index +} + +// keyValue returns the key and value of the current leaf element. +func (c *Cursor) keyValue() ([]byte, []byte, uint32) { + ref := &c.stack[len(c.stack)-1] + if ref.count() == 0 || ref.index >= ref.count() { + return nil, nil, 0 + } + + // Retrieve value from node. + if ref.node != nil { + inode := &ref.node.inodes[ref.index] + return inode.key, inode.value, inode.flags + } + + // Or retrieve value from page. + elem := ref.page.leafPageElement(uint16(ref.index)) + return elem.key(), elem.value(), elem.flags +} + +// node returns the node that the cursor is currently positioned on. +func (c *Cursor) node() *node { + _assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack") + + // If the top of the stack is a leaf node then just return it. + if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() { + return ref.node + } + + // Start from root and traverse down the hierarchy. + var n = c.stack[0].node + if n == nil { + n = c.bucket.node(c.stack[0].page.id, nil) + } + for _, ref := range c.stack[:len(c.stack)-1] { + _assert(!n.isLeaf, "expected branch node") + n = n.childAt(int(ref.index)) + } + _assert(n.isLeaf, "expected leaf node") + return n +} + +// elemRef represents a reference to an element on a given page/node. +type elemRef struct { + page *page + node *node + index int +} + +// isLeaf returns whether the ref is pointing at a leaf page/node. +func (r *elemRef) isLeaf() bool { + if r.node != nil { + return r.node.isLeaf + } + return (r.page.flags & leafPageFlag) != 0 +} + +// count returns the number of inodes or page elements. +func (r *elemRef) count() int { + if r.node != nil { + return len(r.node.inodes) + } + return int(r.page.count) +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/cursor_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/cursor_test.go new file mode 100644 index 000000000..5f8ca5b5a --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/cursor_test.go @@ -0,0 +1,511 @@ +package bolt_test + +import ( + "bytes" + "encoding/binary" + "fmt" + "os" + "sort" + "testing" + "testing/quick" + + "github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt" +) + +// Ensure that a cursor can return a reference to the bucket that created it. +func TestCursor_Bucket(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucket([]byte("widgets")) + c := b.Cursor() + equals(t, b, c.Bucket()) + return nil + }) +} + +// Ensure that a Tx cursor can seek to the appropriate keys. +func TestCursor_Seek(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + ok(t, err) + ok(t, b.Put([]byte("foo"), []byte("0001"))) + ok(t, b.Put([]byte("bar"), []byte("0002"))) + ok(t, b.Put([]byte("baz"), []byte("0003"))) + _, err = b.CreateBucket([]byte("bkt")) + ok(t, err) + return nil + }) + db.View(func(tx *bolt.Tx) error { + c := tx.Bucket([]byte("widgets")).Cursor() + + // Exact match should go to the key. 
+		k, v := c.Seek([]byte("bar"))
+		equals(t, []byte("bar"), k)
+		equals(t, []byte("0002"), v)
+
+		// Inexact match should go to the next key.
+		k, v = c.Seek([]byte("bas"))
+		equals(t, []byte("baz"), k)
+		equals(t, []byte("0003"), v)
+
+		// Low key should go to the first key.
+		k, v = c.Seek([]byte(""))
+		equals(t, []byte("bar"), k)
+		equals(t, []byte("0002"), v)
+
+		// High key should return no key.
+		k, v = c.Seek([]byte("zzz"))
+		assert(t, k == nil, "")
+		assert(t, v == nil, "")
+
+		// Buckets should return their key but no value.
+		k, v = c.Seek([]byte("bkt"))
+		equals(t, []byte("bkt"), k)
+		assert(t, v == nil, "")
+
+		return nil
+	})
+}
+
+func TestCursor_Delete(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	var count = 1000
+
+	// Insert keys 0 through count-1, plus a subbucket.
+	db.Update(func(tx *bolt.Tx) error {
+		b, _ := tx.CreateBucket([]byte("widgets"))
+		for i := 0; i < count; i++ {
+			k := make([]byte, 8)
+			binary.BigEndian.PutUint64(k, uint64(i))
+			b.Put(k, make([]byte, 100))
+		}
+		b.CreateBucket([]byte("sub"))
+		return nil
+	})
+
+	db.Update(func(tx *bolt.Tx) error {
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		bound := make([]byte, 8)
+		binary.BigEndian.PutUint64(bound, uint64(count/2))
+		for key, _ := c.First(); bytes.Compare(key, bound) < 0; key, _ = c.Next() {
+			if err := c.Delete(); err != nil {
+				return err
+			}
+		}
+		c.Seek([]byte("sub"))
+		err := c.Delete()
+		equals(t, err, bolt.ErrIncompatibleValue)
+		return nil
+	})
+
+	db.View(func(tx *bolt.Tx) error {
+		b := tx.Bucket([]byte("widgets"))
+		equals(t, b.Stats().KeyN, count/2+1)
+		return nil
+	})
+}
+
+// Ensure that a Tx cursor can seek to the appropriate keys when there are a
+// large number of keys. This test also checks that seek will always move
+// forward to the next key.
+//
+// Related: https://github.com/boltdb/bolt/pull/187
+func TestCursor_Seek_Large(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+
+	var count = 10000
+
+	// Insert every other key between 0 and $count.
+	db.Update(func(tx *bolt.Tx) error {
+		b, _ := tx.CreateBucket([]byte("widgets"))
+		for i := 0; i < count; i += 100 {
+			for j := i; j < i+100; j += 2 {
+				k := make([]byte, 8)
+				binary.BigEndian.PutUint64(k, uint64(j))
+				b.Put(k, make([]byte, 100))
+			}
+		}
+		return nil
+	})
+
+	db.View(func(tx *bolt.Tx) error {
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		for i := 0; i < count; i++ {
+			seek := make([]byte, 8)
+			binary.BigEndian.PutUint64(seek, uint64(i))
+
+			k, _ := c.Seek(seek)
+
+			// The last seek is beyond the end of the range so
+			// it should return nil.
+			if i == count-1 {
+				assert(t, k == nil, "")
+				continue
+			}
+
+			// Otherwise we should seek to the exact key or the next key.
+			num := binary.BigEndian.Uint64(k)
+			if i%2 == 0 {
+				equals(t, uint64(i), num)
+			} else {
+				equals(t, uint64(i+1), num)
+			}
+		}
+
+		return nil
+	})
+}
+
+// Ensure that a cursor can iterate over an empty bucket without error.
+func TestCursor_EmptyBucket(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucket([]byte("widgets"))
+		return err
+	})
+	db.View(func(tx *bolt.Tx) error {
+		c := tx.Bucket([]byte("widgets")).Cursor()
+		k, v := c.First()
+		assert(t, k == nil, "")
+		assert(t, v == nil, "")
+		return nil
+	})
+}
+
+// Ensure that a Tx cursor can reverse iterate over an empty bucket without error.
+func TestCursor_EmptyBucketReverse(t *testing.T) { + db := NewTestDB() + defer db.Close() + + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }) + db.View(func(tx *bolt.Tx) error { + c := tx.Bucket([]byte("widgets")).Cursor() + k, v := c.Last() + assert(t, k == nil, "") + assert(t, v == nil, "") + return nil + }) +} + +// Ensure that a Tx cursor can iterate over a single root with a couple elements. +func TestCursor_Iterate_Leaf(t *testing.T) { + db := NewTestDB() + defer db.Close() + + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte{}) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte{0}) + tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte{1}) + return nil + }) + tx, _ := db.Begin(false) + c := tx.Bucket([]byte("widgets")).Cursor() + + k, v := c.First() + equals(t, string(k), "bar") + equals(t, v, []byte{1}) + + k, v = c.Next() + equals(t, string(k), "baz") + equals(t, v, []byte{}) + + k, v = c.Next() + equals(t, string(k), "foo") + equals(t, v, []byte{0}) + + k, v = c.Next() + assert(t, k == nil, "") + assert(t, v == nil, "") + + k, v = c.Next() + assert(t, k == nil, "") + assert(t, v == nil, "") + + tx.Rollback() +} + +// Ensure that a Tx cursor can iterate in reverse over a single root with a couple elements. +func TestCursor_LeafRootReverse(t *testing.T) { + db := NewTestDB() + defer db.Close() + + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte{}) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte{0}) + tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte{1}) + return nil + }) + tx, _ := db.Begin(false) + c := tx.Bucket([]byte("widgets")).Cursor() + + k, v := c.Last() + equals(t, string(k), "foo") + equals(t, v, []byte{0}) + + k, v = c.Prev() + equals(t, string(k), "baz") + equals(t, v, []byte{}) + + k, v = c.Prev() + equals(t, string(k), "bar") + equals(t, v, []byte{1}) + + k, v = c.Prev() + assert(t, k == nil, "") + assert(t, v == nil, "") + + k, v = c.Prev() + assert(t, k == nil, "") + assert(t, v == nil, "") + + tx.Rollback() +} + +// Ensure that a Tx cursor can restart from the beginning. +func TestCursor_Restart(t *testing.T) { + db := NewTestDB() + defer db.Close() + + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("bar"), []byte{}) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte{}) + return nil + }) + + tx, _ := db.Begin(false) + c := tx.Bucket([]byte("widgets")).Cursor() + + k, _ := c.First() + equals(t, string(k), "bar") + + k, _ = c.Next() + equals(t, string(k), "foo") + + k, _ = c.First() + equals(t, string(k), "bar") + + k, _ = c.Next() + equals(t, string(k), "foo") + + tx.Rollback() +} + +// Ensure that a Tx can iterate over all elements in a bucket. +func TestCursor_QuickCheck(t *testing.T) { + f := func(items testdata) bool { + db := NewTestDB() + defer db.Close() + + // Bulk insert all values. + tx, _ := db.Begin(true) + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + for _, item := range items { + ok(t, b.Put(item.Key, item.Value)) + } + ok(t, tx.Commit()) + + // Sort test data. + sort.Sort(items) + + // Iterate over all items and check consistency. 
+ var index = 0 + tx, _ = db.Begin(false) + c := tx.Bucket([]byte("widgets")).Cursor() + for k, v := c.First(); k != nil && index < len(items); k, v = c.Next() { + equals(t, k, items[index].Key) + equals(t, v, items[index].Value) + index++ + } + equals(t, len(items), index) + tx.Rollback() + + return true + } + if err := quick.Check(f, qconfig()); err != nil { + t.Error(err) + } +} + +// Ensure that a transaction can iterate over all elements in a bucket in reverse. +func TestCursor_QuickCheck_Reverse(t *testing.T) { + f := func(items testdata) bool { + db := NewTestDB() + defer db.Close() + + // Bulk insert all values. + tx, _ := db.Begin(true) + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + for _, item := range items { + ok(t, b.Put(item.Key, item.Value)) + } + ok(t, tx.Commit()) + + // Sort test data. + sort.Sort(revtestdata(items)) + + // Iterate over all items and check consistency. + var index = 0 + tx, _ = db.Begin(false) + c := tx.Bucket([]byte("widgets")).Cursor() + for k, v := c.Last(); k != nil && index < len(items); k, v = c.Prev() { + equals(t, k, items[index].Key) + equals(t, v, items[index].Value) + index++ + } + equals(t, len(items), index) + tx.Rollback() + + return true + } + if err := quick.Check(f, qconfig()); err != nil { + t.Error(err) + } +} + +// Ensure that a Tx cursor can iterate over subbuckets. +func TestCursor_QuickCheck_BucketsOnly(t *testing.T) { + db := NewTestDB() + defer db.Close() + + db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + ok(t, err) + _, err = b.CreateBucket([]byte("foo")) + ok(t, err) + _, err = b.CreateBucket([]byte("bar")) + ok(t, err) + _, err = b.CreateBucket([]byte("baz")) + ok(t, err) + return nil + }) + db.View(func(tx *bolt.Tx) error { + var names []string + c := tx.Bucket([]byte("widgets")).Cursor() + for k, v := c.First(); k != nil; k, v = c.Next() { + names = append(names, string(k)) + assert(t, v == nil, "") + } + equals(t, names, []string{"bar", "baz", "foo"}) + return nil + }) +} + +// Ensure that a Tx cursor can reverse iterate over subbuckets. +func TestCursor_QuickCheck_BucketsOnly_Reverse(t *testing.T) { + db := NewTestDB() + defer db.Close() + + db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + ok(t, err) + _, err = b.CreateBucket([]byte("foo")) + ok(t, err) + _, err = b.CreateBucket([]byte("bar")) + ok(t, err) + _, err = b.CreateBucket([]byte("baz")) + ok(t, err) + return nil + }) + db.View(func(tx *bolt.Tx) error { + var names []string + c := tx.Bucket([]byte("widgets")).Cursor() + for k, v := c.Last(); k != nil; k, v = c.Prev() { + names = append(names, string(k)) + assert(t, v == nil, "") + } + equals(t, names, []string{"foo", "baz", "bar"}) + return nil + }) +} + +func ExampleCursor() { + // Open the database. + db, _ := bolt.Open(tempfile(), 0666, nil) + defer os.Remove(db.Path()) + defer db.Close() + + // Start a read-write transaction. + db.Update(func(tx *bolt.Tx) error { + // Create a new bucket. + tx.CreateBucket([]byte("animals")) + + // Insert data into a bucket. + b := tx.Bucket([]byte("animals")) + b.Put([]byte("dog"), []byte("fun")) + b.Put([]byte("cat"), []byte("lame")) + b.Put([]byte("liger"), []byte("awesome")) + + // Create a cursor for iteration. + c := b.Cursor() + + // Iterate over items in sorted key order. This starts from the + // first key/value pair and updates the k/v variables to the + // next key/value on each iteration. 
+		//
+		// The loop finishes at the end of the cursor when a nil key is returned.
+		for k, v := c.First(); k != nil; k, v = c.Next() {
+			fmt.Printf("A %s is %s.\n", k, v)
+		}
+
+		return nil
+	})
+
+	// Output:
+	// A cat is lame.
+	// A dog is fun.
+	// A liger is awesome.
+}
+
+func ExampleCursor_reverse() {
+	// Open the database.
+	db, _ := bolt.Open(tempfile(), 0666, nil)
+	defer os.Remove(db.Path())
+	defer db.Close()
+
+	// Start a read-write transaction.
+	db.Update(func(tx *bolt.Tx) error {
+		// Create a new bucket.
+		tx.CreateBucket([]byte("animals"))
+
+		// Insert data into a bucket.
+		b := tx.Bucket([]byte("animals"))
+		b.Put([]byte("dog"), []byte("fun"))
+		b.Put([]byte("cat"), []byte("lame"))
+		b.Put([]byte("liger"), []byte("awesome"))
+
+		// Create a cursor for iteration.
+		c := b.Cursor()
+
+		// Iterate over items in reverse sorted key order. This starts
+		// from the last key/value pair and updates the k/v variables to
+		// the previous key/value on each iteration.
+		//
+		// The loop finishes at the beginning of the cursor when a nil key
+		// is returned.
+		for k, v := c.Last(); k != nil; k, v = c.Prev() {
+			fmt.Printf("A %s is %s.\n", k, v)
+		}
+
+		return nil
+	})
+
+	// Output:
+	// A liger is awesome.
+	// A dog is fun.
+	// A cat is lame.
+}
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/db.go b/Godeps/_workspace/src/github.com/boltdb/bolt/db.go
new file mode 100644
index 000000000..8f0e90b55
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/db.go
@@ -0,0 +1,732 @@
+package bolt
+
+import (
+	"fmt"
+	"hash/fnv"
+	"os"
+	"runtime"
+	"runtime/debug"
+	"strings"
+	"sync"
+	"time"
+	"unsafe"
+)
+
+// The largest step that can be taken when remapping the mmap.
+const maxMmapStep = 1 << 30 // 1GB
+
+// The data file format version.
+const version = 2
+
+// Represents a marker value to indicate that a file is a Bolt DB.
+const magic uint32 = 0xED0CDAED
+
+// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
+// syncing changes to a file. This is required as some operating systems,
+// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
+// must be synchronized using the msync(2) syscall.
+const IgnoreNoSync = runtime.GOOS == "openbsd"
+
+// Default values if not set in a DB instance.
+const (
+	DefaultMaxBatchSize int = 1000
+	DefaultMaxBatchDelay = 10 * time.Millisecond
+)
+
+// DB represents a collection of buckets persisted to a file on disk.
+// All data access is performed through transactions which can be obtained through the DB.
+// All the functions on DB will return an ErrDatabaseNotOpen if accessed before Open() is called.
+type DB struct {
+	// When enabled, the database will perform a Check() after every commit.
+	// A panic is issued if the database is in an inconsistent state. This
+	// flag has a large performance impact so it should only be used for
+	// debugging purposes.
+	StrictMode bool
+
+	// Setting the NoSync flag will cause the database to skip fsync()
+	// calls after each commit. This can be useful when bulk loading data
+	// into a database and you can restart the bulk load in the event of
+	// a system failure or database corruption. Do not set this flag for
+	// normal use.
+	//
+	// If the package global IgnoreNoSync constant is true, this value is
+	// ignored. See the comment on that constant for more details.
+	//
+	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
+	NoSync bool
+
+	// MaxBatchSize is the maximum size of a batch. Default value is
+	// copied from DefaultMaxBatchSize in Open.
+	//
+	// If <=0, disables batching.
+	//
+	// Do not change concurrently with calls to Batch.
+	MaxBatchSize int
+
+	// MaxBatchDelay is the maximum delay before a batch starts.
+	// Default value is copied from DefaultMaxBatchDelay in Open.
+	//
+	// If <=0, effectively disables batching.
+	//
+	// Do not change concurrently with calls to Batch.
+	MaxBatchDelay time.Duration
+
+	path     string
+	file     *os.File
+	dataref  []byte // mmap'ed readonly, write throws SEGV
+	data     *[maxMapSize]byte
+	datasz   int
+	meta0    *meta
+	meta1    *meta
+	pageSize int
+	opened   bool
+	rwtx     *Tx
+	txs      []*Tx
+	freelist *freelist
+	stats    Stats
+
+	batchMu sync.Mutex
+	batch   *batch
+
+	rwlock   sync.Mutex   // Allows only one writer at a time.
+	metalock sync.Mutex   // Protects meta page access.
+	mmaplock sync.RWMutex // Protects mmap access during remapping.
+	statlock sync.RWMutex // Protects stats access.
+
+	ops struct {
+		writeAt func(b []byte, off int64) (n int, err error)
+	}
+}
+
+// Path returns the path to currently open database file.
+func (db *DB) Path() string {
+	return db.path
+}
+
+// GoString returns the Go string representation of the database.
+func (db *DB) GoString() string {
+	return fmt.Sprintf("bolt.DB{path:%q}", db.path)
+}
+
+// String returns the string representation of the database.
+func (db *DB) String() string {
+	return fmt.Sprintf("DB<%q>", db.path)
+}
+
+// Open creates and opens a database at the given path.
+// If the file does not exist then it will be created automatically.
+// Passing in nil options will cause Bolt to open the database with the default options.
+func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
+	var db = &DB{opened: true}
+
+	// Set default options if no options are provided.
+	if options == nil {
+		options = DefaultOptions
+	}
+
+	// Set default values for later DB operations.
+	db.MaxBatchSize = DefaultMaxBatchSize
+	db.MaxBatchDelay = DefaultMaxBatchDelay
+
+	// Open data file and separate sync handler for metadata writes.
+	db.path = path
+
+	var err error
+	if db.file, err = os.OpenFile(db.path, os.O_RDWR|os.O_CREATE, mode); err != nil {
+		_ = db.close()
+		return nil, err
+	}
+
+	// Lock file so that other processes using Bolt cannot use the database
+	// at the same time. This would cause corruption since the two processes
+	// would write meta pages and free pages separately.
+	if err := flock(db.file, options.Timeout); err != nil {
+		_ = db.close()
+		return nil, err
+	}
+
+	// Default values for test hooks
+	db.ops.writeAt = db.file.WriteAt
+
+	// Initialize the database if it doesn't exist.
+	if info, err := db.file.Stat(); err != nil {
+		return nil, fmt.Errorf("stat error: %s", err)
+	} else if info.Size() == 0 {
+		// Initialize new files with meta pages.
+		if err := db.init(); err != nil {
+			return nil, err
+		}
+	} else {
+		// Read the first meta page to determine the page size.
+		var buf [0x1000]byte
+		if _, err := db.file.ReadAt(buf[:], 0); err == nil {
+			m := db.pageInBuffer(buf[:], 0).meta()
+			if err := m.validate(); err != nil {
+				return nil, fmt.Errorf("meta0 error: %s", err)
+			}
+			db.pageSize = int(m.pageSize)
+		}
+	}
+
+	// Memory map the data file.
+	if err := db.mmap(0); err != nil {
+		_ = db.close()
+		return nil, err
+	}
+
+	// Read in the freelist.
+	db.freelist = newFreelist()
+	db.freelist.read(db.page(db.meta().freelist))
+
+	// Mark the database as opened and return.
+ return db, nil
+}
+
+// mmap opens the underlying memory-mapped file and initializes the meta references.
+// minsz is the minimum size that the new mmap can be.
+func (db *DB) mmap(minsz int) error {
+ db.mmaplock.Lock()
+ defer db.mmaplock.Unlock()
+
+ info, err := db.file.Stat()
+ if err != nil {
+ return fmt.Errorf("mmap stat error: %s", err)
+ } else if int(info.Size()) < db.pageSize*2 {
+ return fmt.Errorf("file size too small")
+ }
+
+ // Ensure the size is at least the minimum size.
+ var size = int(info.Size())
+ if size < minsz {
+ size = minsz
+ }
+ size, err = db.mmapSize(size)
+ if err != nil {
+ return err
+ }
+
+ // Dereference all mmap references before unmapping.
+ if db.rwtx != nil {
+ db.rwtx.root.dereference()
+ }
+
+ // Unmap existing data before continuing.
+ if err := db.munmap(); err != nil {
+ return err
+ }
+
+ // Memory-map the data file as a byte slice.
+ if err := mmap(db, size); err != nil {
+ return err
+ }
+
+ // Save references to the meta pages.
+ db.meta0 = db.page(0).meta()
+ db.meta1 = db.page(1).meta()
+
+ // Validate the meta pages.
+ if err := db.meta0.validate(); err != nil {
+ return fmt.Errorf("meta0 error: %s", err)
+ }
+ if err := db.meta1.validate(); err != nil {
+ return fmt.Errorf("meta1 error: %s", err)
+ }
+
+ return nil
+}
+
+// munmap unmaps the data file from memory.
+func (db *DB) munmap() error {
+ if err := munmap(db); err != nil {
+ return fmt.Errorf("unmap error: " + err.Error())
+ }
+ return nil
+}
+
+// mmapSize determines the appropriate size for the mmap given the current size
+// of the database. The minimum size is 1MB and doubles until it reaches 1GB.
+// Returns an error if the new mmap size is greater than the max allowed.
+func (db *DB) mmapSize(size int) (int, error) {
+ // Double the size from 1MB until 1GB.
+ for i := uint(20); i <= 30; i++ {
+ if size <= 1<<i {
+ return 1 << i, nil
+ }
+ }
+
+ // Verify the requested size is not above the maximum allowed.
+ if size > maxMapSize {
+ return 0, fmt.Errorf("mmap too large")
+ }
+
+ // If larger than 1GB then grow by 1GB at a time.
+ sz := int64(size)
+ if remainder := sz % int64(maxMmapStep); remainder > 0 {
+ sz += int64(maxMmapStep) - remainder
+ }
+
+ // Ensure that the mmap size is a multiple of the page size.
+ // This should always be true since we're incrementing in MBs.
+ pageSize := int64(db.pageSize)
+ if (sz % pageSize) != 0 {
+ sz = ((sz / pageSize) + 1) * pageSize
+ }
+
+ // If we've exceeded the max size then only grow up to the max size.
+ if sz > maxMapSize {
+ sz = maxMapSize
+ }
+
+ return int(sz), nil
+}
+
+// init creates a new database file and initializes its meta pages.
+func (db *DB) init() error {
+ // Set the page size to the OS page size.
+ db.pageSize = os.Getpagesize()
+
+ // Create two meta pages on a buffer.
+ buf := make([]byte, db.pageSize*4)
+ for i := 0; i < 2; i++ {
+ p := db.pageInBuffer(buf[:], pgid(i))
+ p.id = pgid(i)
+ p.flags = metaPageFlag
+
+ // Initialize the meta page.
+ m := p.meta()
+ m.magic = magic
+ m.version = version
+ m.pageSize = uint32(db.pageSize)
+ m.freelist = 2
+ m.root = bucket{root: 3}
+ m.pgid = 4
+ m.txid = txid(i)
+ }
+
+ // Write an empty freelist at page 3.
+ p := db.pageInBuffer(buf[:], pgid(2))
+ p.id = pgid(2)
+ p.flags = freelistPageFlag
+ p.count = 0
+
+ // Write an empty leaf page at page 4.
+ p = db.pageInBuffer(buf[:], pgid(3))
+ p.id = pgid(3)
+ p.flags = leafPageFlag
+ p.count = 0
+
+ // Write the buffer to our data file.
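+ //
+ // The new file therefore holds four pages: two alternating meta pages
+ // (page ids 0 and 1), an empty freelist at page id 2, and an empty
+ // root leaf at page id 3, with pgid = 4 marking the end-of-file high
+ // water mark.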
+ if _, err := db.ops.writeAt(buf, 0); err != nil { + return err + } + if err := fdatasync(db); err != nil { + return err + } + + return nil +} + +// Close releases all database resources. +// All transactions must be closed before closing the database. +func (db *DB) Close() error { + db.metalock.Lock() + defer db.metalock.Unlock() + return db.close() +} + +func (db *DB) close() error { + db.opened = false + + db.freelist = nil + db.path = "" + + // Clear ops. + db.ops.writeAt = nil + + // Close the mmap. + if err := db.munmap(); err != nil { + return err + } + + // Close file handles. + if db.file != nil { + // Unlock the file. + _ = funlock(db.file) + + // Close the file descriptor. + if err := db.file.Close(); err != nil { + return fmt.Errorf("db file close: %s", err) + } + db.file = nil + } + + return nil +} + +// Begin starts a new transaction. +// Multiple read-only transactions can be used concurrently but only one +// write transaction can be used at a time. Starting multiple write transactions +// will cause the calls to block and be serialized until the current write +// transaction finishes. +// +// IMPORTANT: You must close read-only transactions after you are finished or +// else the database will not reclaim old pages. +func (db *DB) Begin(writable bool) (*Tx, error) { + if writable { + return db.beginRWTx() + } + return db.beginTx() +} + +func (db *DB) beginTx() (*Tx, error) { + // Lock the meta pages while we initialize the transaction. We obtain + // the meta lock before the mmap lock because that's the order that the + // write transaction will obtain them. + db.metalock.Lock() + + // Obtain a read-only lock on the mmap. When the mmap is remapped it will + // obtain a write lock so all transactions must finish before it can be + // remapped. + db.mmaplock.RLock() + + // Exit if the database is not open yet. + if !db.opened { + db.mmaplock.RUnlock() + db.metalock.Unlock() + return nil, ErrDatabaseNotOpen + } + + // Create a transaction associated with the database. + t := &Tx{} + t.init(db) + + // Keep track of transaction until it closes. + db.txs = append(db.txs, t) + n := len(db.txs) + + // Unlock the meta pages. + db.metalock.Unlock() + + // Update the transaction stats. + db.statlock.Lock() + db.stats.TxN++ + db.stats.OpenTxN = n + db.statlock.Unlock() + + return t, nil +} + +func (db *DB) beginRWTx() (*Tx, error) { + // Obtain writer lock. This is released by the transaction when it closes. + // This enforces only one writer transaction at a time. + db.rwlock.Lock() + + // Once we have the writer lock then we can lock the meta pages so that + // we can set up the transaction. + db.metalock.Lock() + defer db.metalock.Unlock() + + // Exit if the database is not open yet. + if !db.opened { + db.rwlock.Unlock() + return nil, ErrDatabaseNotOpen + } + + // Create a transaction associated with the database. + t := &Tx{writable: true} + t.init(db) + db.rwtx = t + + // Free any pages associated with closed read-only transactions. + var minid txid = 0xFFFFFFFFFFFFFFFF + for _, t := range db.txs { + if t.meta.txid < minid { + minid = t.meta.txid + } + } + if minid > 0 { + db.freelist.release(minid - 1) + } + + return t, nil +} + +// removeTx removes a transaction from the database. +func (db *DB) removeTx(tx *Tx) { + // Release the read lock on the mmap. + db.mmaplock.RUnlock() + + // Use the meta lock to restrict access to the DB object. + db.metalock.Lock() + + // Remove the transaction. 
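+ // The list of open transactions is expected to stay small, so a
+ // linear scan is sufficient here.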
+ for i, t := range db.txs { + if t == tx { + db.txs = append(db.txs[:i], db.txs[i+1:]...) + break + } + } + n := len(db.txs) + + // Unlock the meta pages. + db.metalock.Unlock() + + // Merge statistics. + db.statlock.Lock() + db.stats.OpenTxN = n + db.stats.TxStats.add(&tx.stats) + db.statlock.Unlock() +} + +// Update executes a function within the context of a read-write managed transaction. +// If no error is returned from the function then the transaction is committed. +// If an error is returned then the entire transaction is rolled back. +// Any error that is returned from the function or returned from the commit is +// returned from the Update() method. +// +// Attempting to manually commit or rollback within the function will cause a panic. +func (db *DB) Update(fn func(*Tx) error) error { + t, err := db.Begin(true) + if err != nil { + return err + } + + // Make sure the transaction rolls back in the event of a panic. + defer func() { + if t.db != nil { + t.rollback() + } + }() + + // Mark as a managed tx so that the inner function cannot manually commit. + t.managed = true + + // If an error is returned from the function then rollback and return error. + err = fn(t) + t.managed = false + if err != nil { + _ = t.Rollback() + return err + } + + return t.Commit() +} + +// View executes a function within the context of a managed read-only transaction. +// Any error that is returned from the function is returned from the View() method. +// +// Attempting to manually rollback within the function will cause a panic. +func (db *DB) View(fn func(*Tx) error) error { + t, err := db.Begin(false) + if err != nil { + return err + } + + // Make sure the transaction rolls back in the event of a panic. + defer func() { + if t.db != nil { + t.rollback() + } + }() + + // Mark as a managed tx so that the inner function cannot manually rollback. + t.managed = true + + // If an error is returned from the function then pass it through. + err = fn(t) + t.managed = false + if err != nil { + _ = t.Rollback() + return err + } + + if err := t.Rollback(); err != nil { + return err + } + + return nil +} + +// Stats retrieves ongoing performance stats for the database. +// This is only updated when a transaction closes. +func (db *DB) Stats() Stats { + db.statlock.RLock() + defer db.statlock.RUnlock() + return db.stats +} + +// This is for internal access to the raw data bytes from the C cursor, use +// carefully, or not at all. +func (db *DB) Info() *Info { + return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize} +} + +// page retrieves a page reference from the mmap based on the current page size. +func (db *DB) page(id pgid) *page { + pos := id * pgid(db.pageSize) + return (*page)(unsafe.Pointer(&db.data[pos])) +} + +// pageInBuffer retrieves a page reference from a given byte array based on the current page size. +func (db *DB) pageInBuffer(b []byte, id pgid) *page { + return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)])) +} + +// meta retrieves the current meta page reference. +func (db *DB) meta() *meta { + if db.meta0.txid > db.meta1.txid { + return db.meta0 + } + return db.meta1 +} + +// allocate returns a contiguous block of memory starting at a given page. +func (db *DB) allocate(count int) (*page, error) { + // Allocate a temporary buffer for the page. + buf := make([]byte, count*db.pageSize) + p := (*page)(unsafe.Pointer(&buf[0])) + p.overflow = uint32(count - 1) + + // Use pages from the freelist if they are available. 
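+ // allocate returns 0 when the freelist has no contiguous run of
+ // `count` pages, in which case we fall through and grow the file at
+ // the high water mark instead.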
+ if p.id = db.freelist.allocate(count); p.id != 0 { + return p, nil + } + + // Resize mmap() if we're at the end. + p.id = db.rwtx.meta.pgid + var minsz = int((p.id+pgid(count))+1) * db.pageSize + if minsz >= db.datasz { + if err := db.mmap(minsz); err != nil { + return nil, fmt.Errorf("mmap allocate error: %s", err) + } + } + + // Move the page id high water mark. + db.rwtx.meta.pgid += pgid(count) + + return p, nil +} + +// Options represents the options that can be set when opening a database. +type Options struct { + // Timeout is the amount of time to wait to obtain a file lock. + // When set to zero it will wait indefinitely. This option is only + // available on Darwin and Linux. + Timeout time.Duration +} + +// DefaultOptions represent the options used if nil options are passed into Open(). +// No timeout is used which will cause Bolt to wait indefinitely for a lock. +var DefaultOptions = &Options{ + Timeout: 0, +} + +// Stats represents statistics about the database. +type Stats struct { + // Freelist stats + FreePageN int // total number of free pages on the freelist + PendingPageN int // total number of pending pages on the freelist + FreeAlloc int // total bytes allocated in free pages + FreelistInuse int // total bytes used by the freelist + + // Transaction stats + TxN int // total number of started read transactions + OpenTxN int // number of currently open read transactions + + TxStats TxStats // global, ongoing stats. +} + +// Sub calculates and returns the difference between two sets of database stats. +// This is useful when obtaining stats at two different points and time and +// you need the performance counters that occurred within that time span. +func (s *Stats) Sub(other *Stats) Stats { + if other == nil { + return *s + } + var diff Stats + diff.FreePageN = s.FreePageN + diff.PendingPageN = s.PendingPageN + diff.FreeAlloc = s.FreeAlloc + diff.FreelistInuse = s.FreelistInuse + diff.TxN = other.TxN - s.TxN + diff.TxStats = s.TxStats.Sub(&other.TxStats) + return diff +} + +func (s *Stats) add(other *Stats) { + s.TxStats.add(&other.TxStats) +} + +type Info struct { + Data uintptr + PageSize int +} + +type meta struct { + magic uint32 + version uint32 + pageSize uint32 + flags uint32 + root bucket + freelist pgid + pgid pgid + txid txid + checksum uint64 +} + +// validate checks the marker bytes and version of the meta page to ensure it matches this binary. +func (m *meta) validate() error { + if m.checksum != 0 && m.checksum != m.sum64() { + return ErrChecksum + } else if m.magic != magic { + return ErrInvalid + } else if m.version != version { + return ErrVersionMismatch + } + return nil +} + +// copy copies one meta object to another. +func (m *meta) copy(dest *meta) { + *dest = *m +} + +// write writes the meta onto a page. +func (m *meta) write(p *page) { + if m.root.root >= m.pgid { + panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid)) + } else if m.freelist >= m.pgid { + panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid)) + } + + // Page id is either going to be 0 or 1 which we can determine by the transaction ID. + p.id = pgid(m.txid % 2) + p.flags |= metaPageFlag + + // Calculate the checksum. + m.checksum = m.sum64() + + m.copy(p.meta()) +} + +// generates the checksum for the meta. 
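+// It hashes every meta field up to, but not including, the checksum
+// field itself, using 64-bit FNV-1a.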
+func (m *meta) sum64() uint64 { + var h = fnv.New64a() + _, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:]) + return h.Sum64() +} + +// _assert will panic with a given formatted message if the given condition is false. +func _assert(condition bool, msg string, v ...interface{}) { + if !condition { + panic(fmt.Sprintf("assertion failed: "+msg, v...)) + } +} + +func warn(v ...interface{}) { fmt.Fprintln(os.Stderr, v...) } +func warnf(msg string, v ...interface{}) { fmt.Fprintf(os.Stderr, msg+"\n", v...) } + +func printstack() { + stack := strings.Join(strings.Split(string(debug.Stack()), "\n")[2:], "\n") + fmt.Fprintln(os.Stderr, stack) +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/db_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/db_test.go new file mode 100644 index 000000000..08ca9fbab --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/db_test.go @@ -0,0 +1,790 @@ +package bolt_test + +import ( + "encoding/binary" + "errors" + "flag" + "fmt" + "io/ioutil" + "os" + "regexp" + "runtime" + "sort" + "strings" + "testing" + "time" + + "github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt" +) + +var statsFlag = flag.Bool("stats", false, "show performance stats") + +// Ensure that opening a database with a bad path returns an error. +func TestOpen_BadPath(t *testing.T) { + db, err := bolt.Open("", 0666, nil) + assert(t, err != nil, "err: %s", err) + assert(t, db == nil, "") +} + +// Ensure that a database can be opened without error. +func TestOpen(t *testing.T) { + path := tempfile() + defer os.Remove(path) + db, err := bolt.Open(path, 0666, nil) + assert(t, db != nil, "") + ok(t, err) + equals(t, db.Path(), path) + ok(t, db.Close()) +} + +// Ensure that opening an already open database file will timeout. +func TestOpen_Timeout(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("timeout not supported on windows") + } + + path := tempfile() + defer os.Remove(path) + + // Open a data file. + db0, err := bolt.Open(path, 0666, nil) + assert(t, db0 != nil, "") + ok(t, err) + + // Attempt to open the database again. + start := time.Now() + db1, err := bolt.Open(path, 0666, &bolt.Options{Timeout: 100 * time.Millisecond}) + assert(t, db1 == nil, "") + equals(t, bolt.ErrTimeout, err) + assert(t, time.Since(start) > 100*time.Millisecond, "") + + db0.Close() +} + +// Ensure that opening an already open database file will wait until its closed. +func TestOpen_Wait(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("timeout not supported on windows") + } + + path := tempfile() + defer os.Remove(path) + + // Open a data file. + db0, err := bolt.Open(path, 0666, nil) + assert(t, db0 != nil, "") + ok(t, err) + + // Close it in just a bit. + time.AfterFunc(100*time.Millisecond, func() { db0.Close() }) + + // Attempt to open the database again. + start := time.Now() + db1, err := bolt.Open(path, 0666, &bolt.Options{Timeout: 200 * time.Millisecond}) + assert(t, db1 != nil, "") + ok(t, err) + assert(t, time.Since(start) > 100*time.Millisecond, "") +} + +// Ensure that opening a database does not increase its size. +// https://github.com/boltdb/bolt/issues/291 +func TestOpen_Size(t *testing.T) { + // Open a data file. + db := NewTestDB() + path := db.Path() + defer db.Close() + + // Insert until we get above the minimum 4MB size. 
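+ // 10,000 keys with 1,000-byte values comes to roughly 10MB of raw data.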
+ ok(t, db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists([]byte("data")) + for i := 0; i < 10000; i++ { + ok(t, b.Put([]byte(fmt.Sprintf("%04d", i)), make([]byte, 1000))) + } + return nil + })) + + // Close database and grab the size. + db.DB.Close() + sz := fileSize(path) + if sz == 0 { + t.Fatalf("unexpected new file size: %d", sz) + } + + // Reopen database, update, and check size again. + db0, err := bolt.Open(path, 0666, nil) + ok(t, err) + ok(t, db0.Update(func(tx *bolt.Tx) error { return tx.Bucket([]byte("data")).Put([]byte{0}, []byte{0}) })) + ok(t, db0.Close()) + newSz := fileSize(path) + if newSz == 0 { + t.Fatalf("unexpected new file size: %d", newSz) + } + + // Compare the original size with the new size. + if sz != newSz { + t.Fatalf("unexpected file growth: %d => %d", sz, newSz) + } +} + +// Ensure that opening a database beyond the max step size does not increase its size. +// https://github.com/boltdb/bolt/issues/303 +func TestOpen_Size_Large(t *testing.T) { + if testing.Short() { + t.Skip("short mode") + } + + // Open a data file. + db := NewTestDB() + path := db.Path() + defer db.Close() + + // Insert until we get above the minimum 4MB size. + var index uint64 + for i := 0; i < 10000; i++ { + ok(t, db.Update(func(tx *bolt.Tx) error { + b, _ := tx.CreateBucketIfNotExists([]byte("data")) + for j := 0; j < 1000; j++ { + ok(t, b.Put(u64tob(index), make([]byte, 50))) + index++ + } + return nil + })) + } + + // Close database and grab the size. + db.DB.Close() + sz := fileSize(path) + if sz == 0 { + t.Fatalf("unexpected new file size: %d", sz) + } else if sz < (1 << 30) { + t.Fatalf("expected larger initial size: %d", sz) + } + + // Reopen database, update, and check size again. + db0, err := bolt.Open(path, 0666, nil) + ok(t, err) + ok(t, db0.Update(func(tx *bolt.Tx) error { return tx.Bucket([]byte("data")).Put([]byte{0}, []byte{0}) })) + ok(t, db0.Close()) + newSz := fileSize(path) + if newSz == 0 { + t.Fatalf("unexpected new file size: %d", newSz) + } + + // Compare the original size with the new size. + if sz != newSz { + t.Fatalf("unexpected file growth: %d => %d", sz, newSz) + } +} + +// Ensure that a re-opened database is consistent. +func TestOpen_Check(t *testing.T) { + path := tempfile() + defer os.Remove(path) + + db, err := bolt.Open(path, 0666, nil) + ok(t, err) + ok(t, db.View(func(tx *bolt.Tx) error { return <-tx.Check() })) + db.Close() + + db, err = bolt.Open(path, 0666, nil) + ok(t, err) + ok(t, db.View(func(tx *bolt.Tx) error { return <-tx.Check() })) + db.Close() +} + +// Ensure that the database returns an error if the file handle cannot be open. +func TestDB_Open_FileError(t *testing.T) { + path := tempfile() + defer os.Remove(path) + + _, err := bolt.Open(path+"/youre-not-my-real-parent", 0666, nil) + assert(t, err.(*os.PathError) != nil, "") + equals(t, path+"/youre-not-my-real-parent", err.(*os.PathError).Path) + equals(t, "open", err.(*os.PathError).Op) +} + +// Ensure that write errors to the meta file handler during initialization are returned. +func TestDB_Open_MetaInitWriteError(t *testing.T) { + t.Skip("pending") +} + +// Ensure that a database that is too small returns an error. 
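+// Bolt needs at least two meta pages (see the size check in db.mmap),
+// so a file truncated to a single page must fail to open.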
+func TestDB_Open_FileTooSmall(t *testing.T) { + path := tempfile() + defer os.Remove(path) + + db, err := bolt.Open(path, 0666, nil) + ok(t, err) + db.Close() + + // corrupt the database + ok(t, os.Truncate(path, int64(os.Getpagesize()))) + + db, err = bolt.Open(path, 0666, nil) + equals(t, errors.New("file size too small"), err) +} + +// TODO(benbjohnson): Test corruption at every byte of the first two pages. + +// Ensure that a database cannot open a transaction when it's not open. +func TestDB_Begin_DatabaseNotOpen(t *testing.T) { + var db bolt.DB + tx, err := db.Begin(false) + assert(t, tx == nil, "") + equals(t, err, bolt.ErrDatabaseNotOpen) +} + +// Ensure that a read-write transaction can be retrieved. +func TestDB_BeginRW(t *testing.T) { + db := NewTestDB() + defer db.Close() + tx, err := db.Begin(true) + assert(t, tx != nil, "") + ok(t, err) + assert(t, tx.DB() == db.DB, "") + equals(t, tx.Writable(), true) + ok(t, tx.Commit()) +} + +// Ensure that opening a transaction while the DB is closed returns an error. +func TestDB_BeginRW_Closed(t *testing.T) { + var db bolt.DB + tx, err := db.Begin(true) + equals(t, err, bolt.ErrDatabaseNotOpen) + assert(t, tx == nil, "") +} + +// Ensure a database can provide a transactional block. +func TestDB_Update(t *testing.T) { + db := NewTestDB() + defer db.Close() + err := db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + b.Put([]byte("foo"), []byte("bar")) + b.Put([]byte("baz"), []byte("bat")) + b.Delete([]byte("foo")) + return nil + }) + ok(t, err) + err = db.View(func(tx *bolt.Tx) error { + assert(t, tx.Bucket([]byte("widgets")).Get([]byte("foo")) == nil, "") + equals(t, []byte("bat"), tx.Bucket([]byte("widgets")).Get([]byte("baz"))) + return nil + }) + ok(t, err) +} + +// Ensure a closed database returns an error while running a transaction block +func TestDB_Update_Closed(t *testing.T) { + var db bolt.DB + err := db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + return nil + }) + equals(t, err, bolt.ErrDatabaseNotOpen) +} + +// Ensure a panic occurs while trying to commit a managed transaction. +func TestDB_Update_ManualCommit(t *testing.T) { + db := NewTestDB() + defer db.Close() + + var ok bool + db.Update(func(tx *bolt.Tx) error { + func() { + defer func() { + if r := recover(); r != nil { + ok = true + } + }() + tx.Commit() + }() + return nil + }) + assert(t, ok, "expected panic") +} + +// Ensure a panic occurs while trying to rollback a managed transaction. +func TestDB_Update_ManualRollback(t *testing.T) { + db := NewTestDB() + defer db.Close() + + var ok bool + db.Update(func(tx *bolt.Tx) error { + func() { + defer func() { + if r := recover(); r != nil { + ok = true + } + }() + tx.Rollback() + }() + return nil + }) + assert(t, ok, "expected panic") +} + +// Ensure a panic occurs while trying to commit a managed transaction. +func TestDB_View_ManualCommit(t *testing.T) { + db := NewTestDB() + defer db.Close() + + var ok bool + db.Update(func(tx *bolt.Tx) error { + func() { + defer func() { + if r := recover(); r != nil { + ok = true + } + }() + tx.Commit() + }() + return nil + }) + assert(t, ok, "expected panic") +} + +// Ensure a panic occurs while trying to rollback a managed transaction. 
+func TestDB_View_ManualRollback(t *testing.T) { + db := NewTestDB() + defer db.Close() + + var ok bool + db.Update(func(tx *bolt.Tx) error { + func() { + defer func() { + if r := recover(); r != nil { + ok = true + } + }() + tx.Rollback() + }() + return nil + }) + assert(t, ok, "expected panic") +} + +// Ensure a write transaction that panics does not hold open locks. +func TestDB_Update_Panic(t *testing.T) { + db := NewTestDB() + defer db.Close() + + func() { + defer func() { + if r := recover(); r != nil { + t.Log("recover: update", r) + } + }() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + panic("omg") + }) + }() + + // Verify we can update again. + err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }) + ok(t, err) + + // Verify that our change persisted. + err = db.Update(func(tx *bolt.Tx) error { + assert(t, tx.Bucket([]byte("widgets")) != nil, "") + return nil + }) +} + +// Ensure a database can return an error through a read-only transactional block. +func TestDB_View_Error(t *testing.T) { + db := NewTestDB() + defer db.Close() + err := db.View(func(tx *bolt.Tx) error { + return errors.New("xxx") + }) + equals(t, errors.New("xxx"), err) +} + +// Ensure a read transaction that panics does not hold open locks. +func TestDB_View_Panic(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + return nil + }) + + func() { + defer func() { + if r := recover(); r != nil { + t.Log("recover: view", r) + } + }() + db.View(func(tx *bolt.Tx) error { + assert(t, tx.Bucket([]byte("widgets")) != nil, "") + panic("omg") + }) + }() + + // Verify that we can still use read transactions. + db.View(func(tx *bolt.Tx) error { + assert(t, tx.Bucket([]byte("widgets")) != nil, "") + return nil + }) +} + +// Ensure that an error is returned when a database write fails. +func TestDB_Commit_WriteFail(t *testing.T) { + t.Skip("pending") // TODO(benbjohnson) +} + +// Ensure that DB stats can be returned. +func TestDB_Stats(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }) + stats := db.Stats() + equals(t, 2, stats.TxStats.PageCount) + equals(t, 0, stats.FreePageN) + equals(t, 2, stats.PendingPageN) +} + +// Ensure that database pages are in expected order and type. +func TestDB_Consistency(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }) + + for i := 0; i < 10; i++ { + db.Update(func(tx *bolt.Tx) error { + ok(t, tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar"))) + return nil + }) + } + db.Update(func(tx *bolt.Tx) error { + p, _ := tx.Page(0) + assert(t, p != nil, "") + equals(t, "meta", p.Type) + + p, _ = tx.Page(1) + assert(t, p != nil, "") + equals(t, "meta", p.Type) + + p, _ = tx.Page(2) + assert(t, p != nil, "") + equals(t, "free", p.Type) + + p, _ = tx.Page(3) + assert(t, p != nil, "") + equals(t, "free", p.Type) + + p, _ = tx.Page(4) + assert(t, p != nil, "") + equals(t, "leaf", p.Type) + + p, _ = tx.Page(5) + assert(t, p != nil, "") + equals(t, "freelist", p.Type) + + p, _ = tx.Page(6) + assert(t, p == nil, "") + return nil + }) +} + +// Ensure that DB stats can be substracted from one another. 
+func TestDBStats_Sub(t *testing.T) {
+ var a, b bolt.Stats
+ a.TxStats.PageCount = 3
+ a.FreePageN = 4
+ b.TxStats.PageCount = 10
+ b.FreePageN = 14
+ diff := b.Sub(&a)
+ equals(t, 7, diff.TxStats.PageCount)
+ // free page stats are copied from the receiver and not subtracted
+ equals(t, 14, diff.FreePageN)
+}
+
+func ExampleDB_Update() {
+ // Open the database.
+ db, _ := bolt.Open(tempfile(), 0666, nil)
+ defer os.Remove(db.Path())
+ defer db.Close()
+
+ // Execute several commands within a write transaction.
+ err := db.Update(func(tx *bolt.Tx) error {
+ b, err := tx.CreateBucket([]byte("widgets"))
+ if err != nil {
+ return err
+ }
+ if err := b.Put([]byte("foo"), []byte("bar")); err != nil {
+ return err
+ }
+ return nil
+ })
+
+ // If our transactional block didn't return an error then our data is saved.
+ if err == nil {
+ db.View(func(tx *bolt.Tx) error {
+ value := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
+ fmt.Printf("The value of 'foo' is: %s\n", value)
+ return nil
+ })
+ }
+
+ // Output:
+ // The value of 'foo' is: bar
+}
+
+func ExampleDB_View() {
+ // Open the database.
+ db, _ := bolt.Open(tempfile(), 0666, nil)
+ defer os.Remove(db.Path())
+ defer db.Close()
+
+ // Insert data into a bucket.
+ db.Update(func(tx *bolt.Tx) error {
+ tx.CreateBucket([]byte("people"))
+ b := tx.Bucket([]byte("people"))
+ b.Put([]byte("john"), []byte("doe"))
+ b.Put([]byte("susy"), []byte("que"))
+ return nil
+ })
+
+ // Access data from within a read-only transactional block.
+ db.View(func(tx *bolt.Tx) error {
+ v := tx.Bucket([]byte("people")).Get([]byte("john"))
+ fmt.Printf("John's last name is %s.\n", v)
+ return nil
+ })
+
+ // Output:
+ // John's last name is doe.
+}
+
+func ExampleDB_Begin_ReadOnly() {
+ // Open the database.
+ db, _ := bolt.Open(tempfile(), 0666, nil)
+ defer os.Remove(db.Path())
+ defer db.Close()
+
+ // Create a bucket.
+ db.Update(func(tx *bolt.Tx) error {
+ _, err := tx.CreateBucket([]byte("widgets"))
+ return err
+ })
+
+ // Create several keys in a transaction.
+ tx, _ := db.Begin(true)
+ b := tx.Bucket([]byte("widgets"))
+ b.Put([]byte("john"), []byte("blue"))
+ b.Put([]byte("abby"), []byte("red"))
+ b.Put([]byte("zephyr"), []byte("purple"))
+ tx.Commit()
+
+ // Iterate over the values in sorted key order.
+ tx, _ = db.Begin(false)
+ c := tx.Bucket([]byte("widgets")).Cursor()
+ for k, v := c.First(); k != nil; k, v = c.Next() {
+ fmt.Printf("%s likes %s\n", k, v)
+ }
+ tx.Rollback()
+
+ // Output:
+ // abby likes red
+ // john likes blue
+ // zephyr likes purple
+}
+
+// TestDB represents a wrapper around a Bolt DB to handle temporary file
+// creation and automatic cleanup on close.
+type TestDB struct {
+ *bolt.DB
+}
+
+// NewTestDB returns a new instance of TestDB.
+func NewTestDB() *TestDB {
+ db, err := bolt.Open(tempfile(), 0666, nil)
+ if err != nil {
+ panic("cannot open db: " + err.Error())
+ }
+ return &TestDB{db}
+}
+
+// MustView executes a read-only function. Panic on error.
+func (db *TestDB) MustView(fn func(tx *bolt.Tx) error) {
+ if err := db.DB.View(func(tx *bolt.Tx) error {
+ return fn(tx)
+ }); err != nil {
+ panic(err.Error())
+ }
+}
+
+// MustUpdate executes a read-write function. Panic on error.
+func (db *TestDB) MustUpdate(fn func(tx *bolt.Tx) error) {
+ if err := db.DB.Update(func(tx *bolt.Tx) error {
+ return fn(tx)
+ }); err != nil {
+ panic(err.Error())
+ }
+}
+
+// MustCreateBucket creates a new bucket. Panic on error.
+func (db *TestDB) MustCreateBucket(name []byte) { + if err := db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte(name)) + return err + }); err != nil { + panic(err.Error()) + } +} + +// Close closes the database and deletes the underlying file. +func (db *TestDB) Close() { + // Log statistics. + if *statsFlag { + db.PrintStats() + } + + // Check database consistency after every test. + db.MustCheck() + + // Close database and remove file. + defer os.Remove(db.Path()) + db.DB.Close() +} + +// PrintStats prints the database stats +func (db *TestDB) PrintStats() { + var stats = db.Stats() + fmt.Printf("[db] %-20s %-20s %-20s\n", + fmt.Sprintf("pg(%d/%d)", stats.TxStats.PageCount, stats.TxStats.PageAlloc), + fmt.Sprintf("cur(%d)", stats.TxStats.CursorCount), + fmt.Sprintf("node(%d/%d)", stats.TxStats.NodeCount, stats.TxStats.NodeDeref), + ) + fmt.Printf(" %-20s %-20s %-20s\n", + fmt.Sprintf("rebal(%d/%v)", stats.TxStats.Rebalance, truncDuration(stats.TxStats.RebalanceTime)), + fmt.Sprintf("spill(%d/%v)", stats.TxStats.Spill, truncDuration(stats.TxStats.SpillTime)), + fmt.Sprintf("w(%d/%v)", stats.TxStats.Write, truncDuration(stats.TxStats.WriteTime)), + ) +} + +// MustCheck runs a consistency check on the database and panics if any errors are found. +func (db *TestDB) MustCheck() { + db.View(func(tx *bolt.Tx) error { + // Collect all the errors. + var errors []error + for err := range tx.Check() { + errors = append(errors, err) + if len(errors) > 10 { + break + } + } + + // If errors occurred, copy the DB and print the errors. + if len(errors) > 0 { + var path = tempfile() + tx.CopyFile(path, 0600) + + // Print errors. + fmt.Print("\n\n") + fmt.Printf("consistency check failed (%d errors)\n", len(errors)) + for _, err := range errors { + fmt.Println(err) + } + fmt.Println("") + fmt.Println("db saved to:") + fmt.Println(path) + fmt.Print("\n\n") + os.Exit(-1) + } + + return nil + }) +} + +// CopyTempFile copies a database to a temporary file. +func (db *TestDB) CopyTempFile() { + path := tempfile() + db.View(func(tx *bolt.Tx) error { return tx.CopyFile(path, 0600) }) + fmt.Println("db copied to: ", path) +} + +// tempfile returns a temporary file path. +func tempfile() string { + f, _ := ioutil.TempFile("", "bolt-") + f.Close() + os.Remove(f.Name()) + return f.Name() +} + +// mustContainKeys checks that a bucket contains a given set of keys. +func mustContainKeys(b *bolt.Bucket, m map[string]string) { + found := make(map[string]string) + b.ForEach(func(k, _ []byte) error { + found[string(k)] = "" + return nil + }) + + // Check for keys found in bucket that shouldn't be there. + var keys []string + for k, _ := range found { + if _, ok := m[string(k)]; !ok { + keys = append(keys, k) + } + } + if len(keys) > 0 { + sort.Strings(keys) + panic(fmt.Sprintf("keys found(%d): %s", len(keys), strings.Join(keys, ","))) + } + + // Check for keys not found in bucket that should be there. 
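+ // (keys is necessarily empty here: the extra-key check above panics
+ // before reaching this point otherwise, so the slice can be reused.)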
+ for k, _ := range m { + if _, ok := found[string(k)]; !ok { + keys = append(keys, k) + } + } + if len(keys) > 0 { + sort.Strings(keys) + panic(fmt.Sprintf("keys not found(%d): %s", len(keys), strings.Join(keys, ","))) + } +} + +func trunc(b []byte, length int) []byte { + if length < len(b) { + return b[:length] + } + return b +} + +func truncDuration(d time.Duration) string { + return regexp.MustCompile(`^(\d+)(\.\d+)`).ReplaceAllString(d.String(), "$1") +} + +func fileSize(path string) int64 { + fi, err := os.Stat(path) + if err != nil { + return 0 + } + return fi.Size() +} + +func warn(v ...interface{}) { fmt.Fprintln(os.Stderr, v...) } +func warnf(msg string, v ...interface{}) { fmt.Fprintf(os.Stderr, msg+"\n", v...) } + +// u64tob converts a uint64 into an 8-byte slice. +func u64tob(v uint64) []byte { + b := make([]byte, 8) + binary.BigEndian.PutUint64(b, v) + return b +} + +// btou64 converts an 8-byte slice into an uint64. +func btou64(b []byte) uint64 { return binary.BigEndian.Uint64(b) } diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/doc.go b/Godeps/_workspace/src/github.com/boltdb/bolt/doc.go new file mode 100644 index 000000000..cc937845d --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/doc.go @@ -0,0 +1,44 @@ +/* +Package bolt implements a low-level key/value store in pure Go. It supports +fully serializable transactions, ACID semantics, and lock-free MVCC with +multiple readers and a single writer. Bolt can be used for projects that +want a simple data store without the need to add large dependencies such as +Postgres or MySQL. + +Bolt is a single-level, zero-copy, B+tree data store. This means that Bolt is +optimized for fast read access and does not require recovery in the event of a +system crash. Transactions which have not finished committing will simply be +rolled back in the event of a crash. + +The design of Bolt is based on Howard Chu's LMDB database project. + +Bolt currently works on Windows, Mac OS X, and Linux. + + +Basics + +There are only a few types in Bolt: DB, Bucket, Tx, and Cursor. The DB is +a collection of buckets and is represented by a single file on disk. A bucket is +a collection of unique keys that are associated with values. + +Transactions provide either read-only or read-write access to the database. +Read-only transactions can retrieve key/value pairs and can use Cursors to +iterate over the dataset sequentially. Read-write transactions can create and +delete buckets and can insert and remove keys. Only one read-write transaction +is allowed at a time. + + +Caveats + +The database uses a read-only, memory-mapped data file to ensure that +applications cannot corrupt the database, however, this means that keys and +values returned from Bolt cannot be changed. Writing to a read-only byte slice +will cause Go to panic. + +Keys and values retrieved from the database are only valid for the life of +the transaction. When used outside the transaction, these byte slices can +point to different data or can point to invalid memory which will cause a panic. + + +*/ +package bolt diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/errors.go b/Godeps/_workspace/src/github.com/boltdb/bolt/errors.go new file mode 100644 index 000000000..aa504f138 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/errors.go @@ -0,0 +1,66 @@ +package bolt + +import "errors" + +// These errors can be returned when opening or calling methods on a DB. 
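+// They are sentinel values, so callers compare them with ==, e.g.:
+//
+// if err == bolt.ErrTimeout {
+// // another process still holds the file lock
+// }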
+var (
+ // ErrDatabaseNotOpen is returned when a DB instance is accessed before it
+ // is opened or after it is closed.
+ ErrDatabaseNotOpen = errors.New("database not open")
+
+ // ErrDatabaseOpen is returned when opening a database that is
+ // already open.
+ ErrDatabaseOpen = errors.New("database already open")
+
+ // ErrInvalid is returned when a data file is not a Bolt-formatted database.
+ ErrInvalid = errors.New("invalid database")
+
+ // ErrVersionMismatch is returned when the data file was created with a
+ // different version of Bolt.
+ ErrVersionMismatch = errors.New("version mismatch")
+
+ // ErrChecksum is returned when either meta page checksum does not match.
+ ErrChecksum = errors.New("checksum error")
+
+ // ErrTimeout is returned when a database cannot obtain an exclusive lock
+ // on the data file after the timeout passed to Open().
+ ErrTimeout = errors.New("timeout")
+)
+
+// These errors can occur when beginning or committing a Tx.
+var (
+ // ErrTxNotWritable is returned when performing a write operation on a
+ // read-only transaction.
+ ErrTxNotWritable = errors.New("tx not writable")
+
+ // ErrTxClosed is returned when committing or rolling back a transaction
+ // that has already been committed or rolled back.
+ ErrTxClosed = errors.New("tx closed")
+)
+
+// These errors can occur when putting or deleting a value or a bucket.
+var (
+ // ErrBucketNotFound is returned when trying to access a bucket that has
+ // not been created yet.
+ ErrBucketNotFound = errors.New("bucket not found")
+
+ // ErrBucketExists is returned when creating a bucket that already exists.
+ ErrBucketExists = errors.New("bucket already exists")
+
+ // ErrBucketNameRequired is returned when creating a bucket with a blank name.
+ ErrBucketNameRequired = errors.New("bucket name required")
+
+ // ErrKeyRequired is returned when inserting a zero-length key.
+ ErrKeyRequired = errors.New("key required")
+
+ // ErrKeyTooLarge is returned when inserting a key that is larger than MaxKeySize.
+ ErrKeyTooLarge = errors.New("key too large")
+
+ // ErrValueTooLarge is returned when inserting a value that is larger than MaxValueSize.
+ ErrValueTooLarge = errors.New("value too large")
+
+ // ErrIncompatibleValue is returned when trying to create or delete a bucket
+ // on an existing non-bucket key or when trying to create or delete a
+ // non-bucket key on an existing bucket key.
+ ErrIncompatibleValue = errors.New("incompatible value")
+)
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/freelist.go b/Godeps/_workspace/src/github.com/boltdb/bolt/freelist.go
new file mode 100644
index 000000000..1346e82e5
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/freelist.go
@@ -0,0 +1,241 @@
+package bolt
+
+import (
+ "fmt"
+ "sort"
+ "unsafe"
+)
+
+// freelist represents a list of all pages that are available for allocation.
+// It also tracks pages that have been freed but are still in use by open transactions.
+type freelist struct {
+ ids []pgid // all free and available free page ids.
+ pending map[txid][]pgid // mapping of soon-to-be free page ids by tx.
+ cache map[pgid]bool // fast lookup of all free and pending page ids.
+}
+
+// newFreelist returns an empty, initialized freelist.
+func newFreelist() *freelist {
+ return &freelist{
+ pending: make(map[txid][]pgid),
+ cache: make(map[pgid]bool),
+ }
+}
+
+// size returns the size of the page after serialization.
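+// That is one page header plus unsafe.Sizeof(pgid(0)) bytes per free or
+// pending page.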
+func (f *freelist) size() int { + return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * f.count()) +} + +// count returns count of pages on the freelist +func (f *freelist) count() int { + return f.free_count() + f.pending_count() +} + +// free_count returns count of free pages +func (f *freelist) free_count() int { + return len(f.ids) +} + +// pending_count returns count of pending pages +func (f *freelist) pending_count() int { + var count int + for _, list := range f.pending { + count += len(list) + } + return count +} + +// all returns a list of all free ids and all pending ids in one sorted list. +func (f *freelist) all() []pgid { + ids := make([]pgid, len(f.ids)) + copy(ids, f.ids) + + for _, list := range f.pending { + ids = append(ids, list...) + } + + sort.Sort(pgids(ids)) + return ids +} + +// allocate returns the starting page id of a contiguous list of pages of a given size. +// If a contiguous block cannot be found then 0 is returned. +func (f *freelist) allocate(n int) pgid { + if len(f.ids) == 0 { + return 0 + } + + var initial, previd pgid + for i, id := range f.ids { + if id <= 1 { + panic(fmt.Sprintf("invalid page allocation: %d", id)) + } + + // Reset initial page if this is not contiguous. + if previd == 0 || id-previd != 1 { + initial = id + } + + // If we found a contiguous block then remove it and return it. + if (id-initial)+1 == pgid(n) { + // If we're allocating off the beginning then take the fast path + // and just adjust the existing slice. This will use extra memory + // temporarily but the append() in free() will realloc the slice + // as is necessary. + if (i + 1) == n { + f.ids = f.ids[i+1:] + } else { + copy(f.ids[i-n+1:], f.ids[i+1:]) + f.ids = f.ids[:len(f.ids)-n] + } + + // Remove from the free cache. + for i := pgid(0); i < pgid(n); i++ { + delete(f.cache, initial+i) + } + + return initial + } + + previd = id + } + return 0 +} + +// free releases a page and its overflow for a given transaction id. +// If the page is already free then a panic will occur. +func (f *freelist) free(txid txid, p *page) { + if p.id <= 1 { + panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.id)) + } + + // Free page and all its overflow pages. + var ids = f.pending[txid] + for id := p.id; id <= p.id+pgid(p.overflow); id++ { + // Verify that page is not already free. + if f.cache[id] { + panic(fmt.Sprintf("page %d already freed", id)) + } + + // Add to the freelist and cache. + ids = append(ids, id) + f.cache[id] = true + } + f.pending[txid] = ids +} + +// release moves all page ids for a transaction id (or older) to the freelist. +func (f *freelist) release(txid txid) { + for tid, ids := range f.pending { + if tid <= txid { + // Move transaction's pending pages to the available freelist. + // Don't remove from the cache since the page is still free. + f.ids = append(f.ids, ids...) + delete(f.pending, tid) + } + } + sort.Sort(pgids(f.ids)) +} + +// rollback removes the pages from a given pending tx. +func (f *freelist) rollback(txid txid) { + // Remove page ids from cache. + for _, id := range f.pending[txid] { + delete(f.cache, id) + } + + // Remove pages from pending list. + delete(f.pending, txid) +} + +// freed returns whether a given page is in the free list. +func (f *freelist) freed(pgid pgid) bool { + return f.cache[pgid] +} + +// read initializes the freelist from a freelist page. 
+func (f *freelist) read(p *page) { + // If the page.count is at the max uint16 value (64k) then it's considered + // an overflow and the size of the freelist is stored as the first element. + idx, count := 0, int(p.count) + if count == 0xFFFF { + idx = 1 + count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0]) + } + + // Copy the list of page ids from the freelist. + ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx:count] + f.ids = make([]pgid, len(ids)) + copy(f.ids, ids) + + // Make sure they're sorted. + sort.Sort(pgids(f.ids)) + + // Rebuild the page cache. + f.reindex() +} + +// write writes the page ids onto a freelist page. All free and pending ids are +// saved to disk since in the event of a program crash, all pending ids will +// become free. +func (f *freelist) write(p *page) error { + // Combine the old free pgids and pgids waiting on an open transaction. + ids := f.all() + + // Update the header flag. + p.flags |= freelistPageFlag + + // The page.count can only hold up to 64k elements so if we overflow that + // number then we handle it by putting the size in the first element. + if len(ids) < 0xFFFF { + p.count = uint16(len(ids)) + copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:], ids) + } else { + p.count = 0xFFFF + ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(len(ids)) + copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:], ids) + } + + return nil +} + +// reload reads the freelist from a page and filters out pending items. +func (f *freelist) reload(p *page) { + f.read(p) + + // Build a cache of only pending pages. + pcache := make(map[pgid]bool) + for _, pendingIDs := range f.pending { + for _, pendingID := range pendingIDs { + pcache[pendingID] = true + } + } + + // Check each page in the freelist and build a new available freelist + // with any pages not in the pending lists. + var a []pgid + for _, id := range f.ids { + if !pcache[id] { + a = append(a, id) + } + } + f.ids = a + + // Once the available list is rebuilt then rebuild the free cache so that + // it includes the available and pending free pages. + f.reindex() +} + +// reindex rebuilds the free cache based on available and pending free lists. +func (f *freelist) reindex() { + f.cache = make(map[pgid]bool) + for _, id := range f.ids { + f.cache[id] = true + } + for _, pendingIDs := range f.pending { + for _, pendingID := range pendingIDs { + f.cache[pendingID] = true + } + } +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/freelist_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/freelist_test.go new file mode 100644 index 000000000..792ca9222 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/freelist_test.go @@ -0,0 +1,129 @@ +package bolt + +import ( + "reflect" + "testing" + "unsafe" +) + +// Ensure that a page is added to a transaction's freelist. +func TestFreelist_free(t *testing.T) { + f := newFreelist() + f.free(100, &page{id: 12}) + if !reflect.DeepEqual([]pgid{12}, f.pending[100]) { + t.Fatalf("exp=%v; got=%v", []pgid{12}, f.pending[100]) + } +} + +// Ensure that a page and its overflow is added to a transaction's freelist. +func TestFreelist_free_overflow(t *testing.T) { + f := newFreelist() + f.free(100, &page{id: 12, overflow: 3}) + if exp := []pgid{12, 13, 14, 15}; !reflect.DeepEqual(exp, f.pending[100]) { + t.Fatalf("exp=%v; got=%v", exp, f.pending[100]) + } +} + +// Ensure that a transaction's free pages can be released. 
+func TestFreelist_release(t *testing.T) { + f := newFreelist() + f.free(100, &page{id: 12, overflow: 1}) + f.free(100, &page{id: 9}) + f.free(102, &page{id: 39}) + f.release(100) + f.release(101) + if exp := []pgid{9, 12, 13}; !reflect.DeepEqual(exp, f.ids) { + t.Fatalf("exp=%v; got=%v", exp, f.ids) + } + + f.release(102) + if exp := []pgid{9, 12, 13, 39}; !reflect.DeepEqual(exp, f.ids) { + t.Fatalf("exp=%v; got=%v", exp, f.ids) + } +} + +// Ensure that a freelist can find contiguous blocks of pages. +func TestFreelist_allocate(t *testing.T) { + f := &freelist{ids: []pgid{3, 4, 5, 6, 7, 9, 12, 13, 18}} + if id := int(f.allocate(3)); id != 3 { + t.Fatalf("exp=3; got=%v", id) + } + if id := int(f.allocate(1)); id != 6 { + t.Fatalf("exp=6; got=%v", id) + } + if id := int(f.allocate(3)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if id := int(f.allocate(2)); id != 12 { + t.Fatalf("exp=12; got=%v", id) + } + if id := int(f.allocate(1)); id != 7 { + t.Fatalf("exp=7; got=%v", id) + } + if id := int(f.allocate(0)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if id := int(f.allocate(0)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if exp := []pgid{9, 18}; !reflect.DeepEqual(exp, f.ids) { + t.Fatalf("exp=%v; got=%v", exp, f.ids) + } + + if id := int(f.allocate(1)); id != 9 { + t.Fatalf("exp=9; got=%v", id) + } + if id := int(f.allocate(1)); id != 18 { + t.Fatalf("exp=18; got=%v", id) + } + if id := int(f.allocate(1)); id != 0 { + t.Fatalf("exp=0; got=%v", id) + } + if exp := []pgid{}; !reflect.DeepEqual(exp, f.ids) { + t.Fatalf("exp=%v; got=%v", exp, f.ids) + } +} + +// Ensure that a freelist can deserialize from a freelist page. +func TestFreelist_read(t *testing.T) { + // Create a page. + var buf [4096]byte + page := (*page)(unsafe.Pointer(&buf[0])) + page.flags = freelistPageFlag + page.count = 2 + + // Insert 2 page ids. + ids := (*[3]pgid)(unsafe.Pointer(&page.ptr)) + ids[0] = 23 + ids[1] = 50 + + // Deserialize page into a freelist. + f := newFreelist() + f.read(page) + + // Ensure that there are two page ids in the freelist. + if exp := []pgid{23, 50}; !reflect.DeepEqual(exp, f.ids) { + t.Fatalf("exp=%v; got=%v", exp, f.ids) + } +} + +// Ensure that a freelist can serialize into a freelist page. +func TestFreelist_write(t *testing.T) { + // Create a freelist and write it to a page. + var buf [4096]byte + f := &freelist{ids: []pgid{12, 39}, pending: make(map[txid][]pgid)} + f.pending[100] = []pgid{28, 11} + f.pending[101] = []pgid{3} + p := (*page)(unsafe.Pointer(&buf[0])) + f.write(p) + + // Read the page back out. + f2 := newFreelist() + f2.read(p) + + // Ensure that the freelist is correct. + // All pages should be present and in reverse order. + if exp := []pgid{3, 11, 12, 28, 39}; !reflect.DeepEqual(exp, f2.ids) { + t.Fatalf("exp=%v; got=%v", exp, f2.ids) + } +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/node.go b/Godeps/_workspace/src/github.com/boltdb/bolt/node.go new file mode 100644 index 000000000..05aefb8ad --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/node.go @@ -0,0 +1,627 @@ +package bolt + +import ( + "bytes" + "fmt" + "sort" + "unsafe" +) + +// node represents an in-memory, deserialized page. +type node struct { + bucket *Bucket + isLeaf bool + unbalanced bool + spilled bool + key []byte + pgid pgid + parent *node + children nodes + inodes inodes +} + +// root returns the top-level node this node is attached to. 
+func (n *node) root() *node { + if n.parent == nil { + return n + } + return n.parent.root() +} + +// minKeys returns the minimum number of inodes this node should have. +func (n *node) minKeys() int { + if n.isLeaf { + return 1 + } + return 2 +} + +// size returns the size of the node after serialization. +func (n *node) size() int { + sz, elsz := pageHeaderSize, n.pageElementSize() + for i := 0; i < len(n.inodes); i++ { + item := &n.inodes[i] + sz += elsz + len(item.key) + len(item.value) + } + return sz +} + +// sizeLessThan returns true if the node is less than a given size. +// This is an optimization to avoid calculating a large node when we only need +// to know if it fits inside a certain page size. +func (n *node) sizeLessThan(v int) bool { + sz, elsz := pageHeaderSize, n.pageElementSize() + for i := 0; i < len(n.inodes); i++ { + item := &n.inodes[i] + sz += elsz + len(item.key) + len(item.value) + if sz >= v { + return false + } + } + return true +} + +// pageElementSize returns the size of each page element based on the type of node. +func (n *node) pageElementSize() int { + if n.isLeaf { + return leafPageElementSize + } + return branchPageElementSize +} + +// childAt returns the child node at a given index. +func (n *node) childAt(index int) *node { + if n.isLeaf { + panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index)) + } + return n.bucket.node(n.inodes[index].pgid, n) +} + +// childIndex returns the index of a given child node. +func (n *node) childIndex(child *node) int { + index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 }) + return index +} + +// numChildren returns the number of children. +func (n *node) numChildren() int { + return len(n.inodes) +} + +// nextSibling returns the next node with the same parent. +func (n *node) nextSibling() *node { + if n.parent == nil { + return nil + } + index := n.parent.childIndex(n) + if index >= n.parent.numChildren()-1 { + return nil + } + return n.parent.childAt(index + 1) +} + +// prevSibling returns the previous node with the same parent. +func (n *node) prevSibling() *node { + if n.parent == nil { + return nil + } + index := n.parent.childIndex(n) + if index == 0 { + return nil + } + return n.parent.childAt(index - 1) +} + +// put inserts a key/value. +func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) { + if pgid >= n.bucket.tx.meta.pgid { + panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid)) + } else if len(oldKey) <= 0 { + panic("put: zero-length old key") + } else if len(newKey) <= 0 { + panic("put: zero-length new key") + } + + // Find insertion index. + index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 }) + + // Add capacity and shift nodes if we don't have an exact match and need to insert. + exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)) + if !exact { + n.inodes = append(n.inodes, inode{}) + copy(n.inodes[index+1:], n.inodes[index:]) + } + + inode := &n.inodes[index] + inode.flags = flags + inode.key = newKey + inode.value = value + inode.pgid = pgid + _assert(len(inode.key) > 0, "put: zero-length inode key") +} + +// del removes a key from the node. +func (n *node) del(key []byte) { + // Find index of key. + index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 }) + + // Exit if the key isn't found. 
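+ // sort.Search returns len(n.inodes) when every existing key is less
+ // than the target, so both the bounds and the equality check below
+ // are required before deleting.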
+ if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) { + return + } + + // Delete inode from the node. + n.inodes = append(n.inodes[:index], n.inodes[index+1:]...) + + // Mark the node as needing rebalancing. + n.unbalanced = true +} + +// read initializes the node from a page. +func (n *node) read(p *page) { + n.pgid = p.id + n.isLeaf = ((p.flags & leafPageFlag) != 0) + n.inodes = make(inodes, int(p.count)) + + for i := 0; i < int(p.count); i++ { + inode := &n.inodes[i] + if n.isLeaf { + elem := p.leafPageElement(uint16(i)) + inode.flags = elem.flags + inode.key = elem.key() + inode.value = elem.value() + } else { + elem := p.branchPageElement(uint16(i)) + inode.pgid = elem.pgid + inode.key = elem.key() + } + _assert(len(inode.key) > 0, "read: zero-length inode key") + } + + // Save first key so we can find the node in the parent when we spill. + if len(n.inodes) > 0 { + n.key = n.inodes[0].key + _assert(len(n.key) > 0, "read: zero-length node key") + } else { + n.key = nil + } +} + +// write writes the items onto one or more pages. +func (n *node) write(p *page) { + // Initialize page. + if n.isLeaf { + p.flags |= leafPageFlag + } else { + p.flags |= branchPageFlag + } + + if len(n.inodes) >= 0xFFFF { + panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id)) + } + p.count = uint16(len(n.inodes)) + + // Loop over each item and write it to the page. + b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):] + for i, item := range n.inodes { + _assert(len(item.key) > 0, "write: zero-length inode key") + + // Write the page element. + if n.isLeaf { + elem := p.leafPageElement(uint16(i)) + elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) + elem.flags = item.flags + elem.ksize = uint32(len(item.key)) + elem.vsize = uint32(len(item.value)) + } else { + elem := p.branchPageElement(uint16(i)) + elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) + elem.ksize = uint32(len(item.key)) + elem.pgid = item.pgid + _assert(elem.pgid != p.id, "write: circular dependency occurred") + } + + // Write data for the element to the end of the page. + copy(b[0:], item.key) + b = b[len(item.key):] + copy(b[0:], item.value) + b = b[len(item.value):] + } + + // DEBUG ONLY: n.dump() +} + +// split breaks up a node into multiple smaller nodes, if appropriate. +// This should only be called from the spill() function. +func (n *node) split(pageSize int) []*node { + var nodes []*node + + node := n + for { + // Split node into two. + a, b := node.splitTwo(pageSize) + nodes = append(nodes, a) + + // If we can't split then exit the loop. + if b == nil { + break + } + + // Set node to b so it gets split on the next iteration. + node = b + } + + return nodes +} + +// splitTwo breaks up a node into two smaller nodes, if appropriate. +// This should only be called from the split() function. +func (n *node) splitTwo(pageSize int) (*node, *node) { + // Ignore the split if the page doesn't have at least enough nodes for + // two pages or if the nodes can fit in a single page. + if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) { + return n, nil + } + + // Determine the threshold before starting a new node. 
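+ // For example, with a 4096-byte page and a fill percent of 0.5,
+ // nodes split once they hold roughly 2048 bytes, leaving headroom
+ // for future inserts before the next split.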
+	var fillPercent = n.bucket.FillPercent
+	if fillPercent < minFillPercent {
+		fillPercent = minFillPercent
+	} else if fillPercent > maxFillPercent {
+		fillPercent = maxFillPercent
+	}
+	threshold := int(float64(pageSize) * fillPercent)
+
+	// Determine split position and sizes of the two pages.
+	splitIndex, _ := n.splitIndex(threshold)
+
+	// Split node into two separate nodes.
+	// If there's no parent then we'll need to create one.
+	if n.parent == nil {
+		n.parent = &node{bucket: n.bucket, children: []*node{n}}
+	}
+
+	// Create a new node and add it to the parent.
+	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
+	n.parent.children = append(n.parent.children, next)
+
+	// Split inodes across two nodes.
+	next.inodes = n.inodes[splitIndex:]
+	n.inodes = n.inodes[:splitIndex]
+
+	// Update the statistics.
+	n.bucket.tx.stats.Split++
+
+	return n, next
+}
+
+// splitIndex finds the position where a page will fill a given threshold.
+// It returns the index as well as the size of the first page.
+// This is only called from split().
+func (n *node) splitIndex(threshold int) (index, sz int) {
+	sz = pageHeaderSize
+
+	// Loop until we only have the minimum number of keys required for the second page.
+	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
+		index = i
+		inode := n.inodes[i]
+		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
+
+		// If we have at least the minimum number of keys and adding another
+		// node would put us over the threshold then exit and return.
+		if i >= minKeysPerPage && sz+elsize > threshold {
+			break
+		}
+
+		// Add the element size to the total size.
+		sz += elsize
+	}
+
+	return
+}
+
+// spill writes the nodes to dirty pages and splits nodes as it goes.
+// Returns an error if dirty pages cannot be allocated.
+func (n *node) spill() error {
+	var tx = n.bucket.tx
+	if n.spilled {
+		return nil
+	}
+
+	// Spill child nodes first. Child nodes can materialize sibling nodes in
+	// the case of split-merge so we cannot use a range loop. We have to check
+	// the children size on every loop iteration.
+	sort.Sort(n.children)
+	for i := 0; i < len(n.children); i++ {
+		if err := n.children[i].spill(); err != nil {
+			return err
+		}
+	}
+
+	// We no longer need the child list because it's only used for spill tracking.
+	n.children = nil
+
+	// Split nodes into appropriate sizes. The first node will always be n.
+	var nodes = n.split(tx.db.pageSize)
+	for _, node := range nodes {
+		// Add node's page to the freelist if it's not new.
+		if node.pgid > 0 {
+			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
+			node.pgid = 0
+		}
+
+		// Allocate contiguous space for the node.
+		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
+		if err != nil {
+			return err
+		}
+
+		// Write the node.
+		if p.id >= tx.meta.pgid {
+			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
+		}
+		node.pgid = p.id
+		node.write(p)
+		node.spilled = true
+
+		// Insert into parent inodes.
+		if node.parent != nil {
+			var key = node.key
+			if key == nil {
+				key = node.inodes[0].key
+			}
+
+			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
+			node.key = node.inodes[0].key
+			_assert(len(node.key) > 0, "spill: zero-length node key")
+		}
+
+		// Update the statistics.
+		tx.stats.Spill++
+	}
+
+	// If the root node split and created a new root then we need to spill that
+	// as well. We'll clear out the children to make sure it doesn't try to respill.
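+	// (A pgid of zero marks a node that has never been written to a page,
+	// which is how a freshly created root is detected here.)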
+ if n.parent != nil && n.parent.pgid == 0 { + n.children = nil + return n.parent.spill() + } + + return nil +} + +// rebalance attempts to combine the node with sibling nodes if the node fill +// size is below a threshold or if there are not enough keys. +func (n *node) rebalance() { + if !n.unbalanced { + return + } + n.unbalanced = false + + // Update statistics. + n.bucket.tx.stats.Rebalance++ + + // Ignore if node is above threshold (25%) and has enough keys. + var threshold = n.bucket.tx.db.pageSize / 4 + if n.size() > threshold && len(n.inodes) > n.minKeys() { + return + } + + // Root node has special handling. + if n.parent == nil { + // If root node is a branch and only has one node then collapse it. + if !n.isLeaf && len(n.inodes) == 1 { + // Move root's child up. + child := n.bucket.node(n.inodes[0].pgid, n) + n.isLeaf = child.isLeaf + n.inodes = child.inodes[:] + n.children = child.children + + // Reparent all child nodes being moved. + for _, inode := range n.inodes { + if child, ok := n.bucket.nodes[inode.pgid]; ok { + child.parent = n + } + } + + // Remove old child. + child.parent = nil + delete(n.bucket.nodes, child.pgid) + child.free() + } + + return + } + + // If node has no keys then just remove it. + if n.numChildren() == 0 { + n.parent.del(n.key) + n.parent.removeChild(n) + delete(n.bucket.nodes, n.pgid) + n.free() + n.parent.rebalance() + return + } + + _assert(n.parent.numChildren() > 1, "parent must have at least 2 children") + + // Destination node is right sibling if idx == 0, otherwise left sibling. + var target *node + var useNextSibling = (n.parent.childIndex(n) == 0) + if useNextSibling { + target = n.nextSibling() + } else { + target = n.prevSibling() + } + + // If target node has extra nodes then just move one over. + if target.numChildren() > target.minKeys() { + if useNextSibling { + // Reparent and move node. + if child, ok := n.bucket.nodes[target.inodes[0].pgid]; ok { + child.parent.removeChild(child) + child.parent = n + child.parent.children = append(child.parent.children, child) + } + n.inodes = append(n.inodes, target.inodes[0]) + target.inodes = target.inodes[1:] + + // Update target key on parent. + target.parent.put(target.key, target.inodes[0].key, nil, target.pgid, 0) + target.key = target.inodes[0].key + _assert(len(target.key) > 0, "rebalance(1): zero-length node key") + } else { + // Reparent and move node. + if child, ok := n.bucket.nodes[target.inodes[len(target.inodes)-1].pgid]; ok { + child.parent.removeChild(child) + child.parent = n + child.parent.children = append(child.parent.children, child) + } + n.inodes = append(n.inodes, inode{}) + copy(n.inodes[1:], n.inodes) + n.inodes[0] = target.inodes[len(target.inodes)-1] + target.inodes = target.inodes[:len(target.inodes)-1] + } + + // Update parent key for node. + n.parent.put(n.key, n.inodes[0].key, nil, n.pgid, 0) + n.key = n.inodes[0].key + _assert(len(n.key) > 0, "rebalance(2): zero-length node key") + + return + } + + // If both this node and the target node are too small then merge them. + if useNextSibling { + // Reparent all child nodes being moved. + for _, inode := range target.inodes { + if child, ok := n.bucket.nodes[inode.pgid]; ok { + child.parent.removeChild(child) + child.parent = n + child.parent.children = append(child.parent.children, child) + } + } + + // Copy over inodes from target and remove target. + n.inodes = append(n.inodes, target.inodes...) 
+		n.parent.del(target.key)
+		n.parent.removeChild(target)
+		delete(n.bucket.nodes, target.pgid)
+		target.free()
+	} else {
+		// Reparent all child nodes being moved.
+		for _, inode := range n.inodes {
+			if child, ok := n.bucket.nodes[inode.pgid]; ok {
+				child.parent.removeChild(child)
+				child.parent = target
+				child.parent.children = append(child.parent.children, child)
+			}
+		}
+
+		// Copy over inodes to target and remove node.
+		target.inodes = append(target.inodes, n.inodes...)
+		n.parent.del(n.key)
+		n.parent.removeChild(n)
+		delete(n.bucket.nodes, n.pgid)
+		n.free()
+	}
+
+	// Either this node or the target node was deleted from the parent so rebalance it.
+	n.parent.rebalance()
+}
+
+// removeChild removes a node from the list of in-memory children.
+// This does not affect the inodes.
+func (n *node) removeChild(target *node) {
+	for i, child := range n.children {
+		if child == target {
+			n.children = append(n.children[:i], n.children[i+1:]...)
+			return
+		}
+	}
+}
+
+// dereference causes the node to copy all its inode key/value references to heap memory.
+// This is required when the mmap is reallocated so inodes are not pointing to stale data.
+func (n *node) dereference() {
+	if n.key != nil {
+		key := make([]byte, len(n.key))
+		copy(key, n.key)
+		n.key = key
+		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
+	}
+
+	for i := range n.inodes {
+		inode := &n.inodes[i]
+
+		key := make([]byte, len(inode.key))
+		copy(key, inode.key)
+		inode.key = key
+		_assert(len(inode.key) > 0, "dereference: zero-length inode key")
+
+		value := make([]byte, len(inode.value))
+		copy(value, inode.value)
+		inode.value = value
+	}
+
+	// Recursively dereference children.
+	for _, child := range n.children {
+		child.dereference()
+	}
+
+	// Update statistics.
+	n.bucket.tx.stats.NodeDeref++
+}
+
+// free adds the node's underlying page to the freelist.
+func (n *node) free() {
+	if n.pgid != 0 {
+		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
+		n.pgid = 0
+	}
+}
+
+// dump writes the contents of the node to STDERR for debugging purposes.
+/*
+func (n *node) dump() {
+	// Write node header.
+	var typ = "branch"
+	if n.isLeaf {
+		typ = "leaf"
+	}
+	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
+
+	// Write out abbreviated version of each item.
+	for _, item := range n.inodes {
+		if n.isLeaf {
+			if item.flags&bucketLeafFlag != 0 {
+				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
+				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
+			} else {
+				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
+			}
+		} else {
+			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
+		}
+	}
+	warn("")
+}
+*/
+
+type nodes []*node
+
+func (s nodes) Len() int           { return len(s) }
+func (s nodes) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
+
+// inode represents an internal node inside of a node.
+// It can be used to point to elements in a page or point
+// to an element which hasn't been added to a page yet.
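+//
+// As an editorial sketch (not part of the original source): a leaf-level
+// inode for key "foo" and value "bar" would look like
+//
+//	inode{flags: 0, key: []byte("foo"), value: []byte("bar")}
+//
+// while a branch-level inode instead records the child page in pgid and
+// leaves value nil, matching how node.read() fills each case.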
+type inode struct {
+	flags uint32
+	pgid  pgid
+	key   []byte
+	value []byte
+}
+
+type inodes []inode
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/node_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/node_test.go
new file mode 100644
index 000000000..fa5d10f99
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/node_test.go
@@ -0,0 +1,156 @@
+package bolt
+
+import (
+	"testing"
+	"unsafe"
+)
+
+// Ensure that a node can insert a key/value.
+func TestNode_put(t *testing.T) {
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{meta: &meta{pgid: 1}}}}
+	n.put([]byte("baz"), []byte("baz"), []byte("2"), 0, 0)
+	n.put([]byte("foo"), []byte("foo"), []byte("0"), 0, 0)
+	n.put([]byte("bar"), []byte("bar"), []byte("1"), 0, 0)
+	n.put([]byte("foo"), []byte("foo"), []byte("3"), 0, leafPageFlag)
+
+	if len(n.inodes) != 3 {
+		t.Fatalf("exp=3; got=%d", len(n.inodes))
+	}
+	if k, v := n.inodes[0].key, n.inodes[0].value; string(k) != "bar" || string(v) != "1" {
+		t.Fatalf("exp=<bar,1>; got=<%s,%s>", k, v)
+	}
+	if k, v := n.inodes[1].key, n.inodes[1].value; string(k) != "baz" || string(v) != "2" {
+		t.Fatalf("exp=<baz,2>; got=<%s,%s>", k, v)
+	}
+	if k, v := n.inodes[2].key, n.inodes[2].value; string(k) != "foo" || string(v) != "3" {
+		t.Fatalf("exp=<foo,3>; got=<%s,%s>", k, v)
+	}
+	if n.inodes[2].flags != uint32(leafPageFlag) {
+		t.Fatalf("not a leaf: %d", n.inodes[2].flags)
+	}
+}
+
+// Ensure that a node can deserialize from a leaf page.
+func TestNode_read_LeafPage(t *testing.T) {
+	// Create a page.
+	var buf [4096]byte
+	page := (*page)(unsafe.Pointer(&buf[0]))
+	page.flags = leafPageFlag
+	page.count = 2
+
+	// Insert 2 elements at the beginning. sizeof(leafPageElement) == 16
+	nodes := (*[3]leafPageElement)(unsafe.Pointer(&page.ptr))
+	nodes[0] = leafPageElement{flags: 0, pos: 32, ksize: 3, vsize: 4}  // pos = sizeof(leafPageElement) * 2
+	nodes[1] = leafPageElement{flags: 0, pos: 23, ksize: 10, vsize: 3} // pos = sizeof(leafPageElement) + 3 + 4
+
+	// Write data for the nodes at the end.
+	data := (*[4096]byte)(unsafe.Pointer(&nodes[2]))
+	copy(data[:], []byte("barfooz"))
+	copy(data[7:], []byte("helloworldbye"))
+
+	// Deserialize page into a leaf.
+	n := &node{}
+	n.read(page)
+
+	// Check that there are two inodes with correct data.
+	if !n.isLeaf {
+		t.Fatal("expected leaf")
+	}
+	if len(n.inodes) != 2 {
+		t.Fatalf("exp=2; got=%d", len(n.inodes))
+	}
+	if k, v := n.inodes[0].key, n.inodes[0].value; string(k) != "bar" || string(v) != "fooz" {
+		t.Fatalf("exp=<bar,fooz>; got=<%s,%s>", k, v)
+	}
+	if k, v := n.inodes[1].key, n.inodes[1].value; string(k) != "helloworld" || string(v) != "bye" {
+		t.Fatalf("exp=<helloworld,bye>; got=<%s,%s>", k, v)
+	}
+}
+
+// Ensure that a node can serialize into a leaf page.
+func TestNode_write_LeafPage(t *testing.T) {
+	// Create a node.
+	n := &node{isLeaf: true, inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}}
+	n.put([]byte("susy"), []byte("susy"), []byte("que"), 0, 0)
+	n.put([]byte("ricki"), []byte("ricki"), []byte("lake"), 0, 0)
+	n.put([]byte("john"), []byte("john"), []byte("johnson"), 0, 0)
+
+	// Write it to a page.
+	var buf [4096]byte
+	p := (*page)(unsafe.Pointer(&buf[0]))
+	n.write(p)
+
+	// Read the page back in.
+	n2 := &node{}
+	n2.read(p)
+
+	// Check that the two pages are the same.
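+	// (put keeps inodes sorted by key, so the round-tripped entries come
+	// back in the order john, ricki, susy rather than insertion order.)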
+	if len(n2.inodes) != 3 {
+		t.Fatalf("exp=3; got=%d", len(n2.inodes))
+	}
+	if k, v := n2.inodes[0].key, n2.inodes[0].value; string(k) != "john" || string(v) != "johnson" {
+		t.Fatalf("exp=<john,johnson>; got=<%s,%s>", k, v)
+	}
+	if k, v := n2.inodes[1].key, n2.inodes[1].value; string(k) != "ricki" || string(v) != "lake" {
+		t.Fatalf("exp=<ricki,lake>; got=<%s,%s>", k, v)
+	}
+	if k, v := n2.inodes[2].key, n2.inodes[2].value; string(k) != "susy" || string(v) != "que" {
+		t.Fatalf("exp=<susy,que>; got=<%s,%s>", k, v)
+	}
+}
+
+// Ensure that a node can split into appropriate subgroups.
+func TestNode_split(t *testing.T) {
+	// Create a node.
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}}
+	n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000004"), []byte("00000004"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000005"), []byte("00000005"), []byte("0123456701234567"), 0, 0)
+
+	// Split between 2 & 3.
+	n.split(100)
+
+	var parent = n.parent
+	if len(parent.children) != 2 {
+		t.Fatalf("exp=2; got=%d", len(parent.children))
+	}
+	if len(parent.children[0].inodes) != 2 {
+		t.Fatalf("exp=2; got=%d", len(parent.children[0].inodes))
+	}
+	if len(parent.children[1].inodes) != 3 {
+		t.Fatalf("exp=3; got=%d", len(parent.children[1].inodes))
+	}
+}
+
+// Ensure that a page with the minimum number of inodes just returns a single node.
+func TestNode_split_MinKeys(t *testing.T) {
+	// Create a node.
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}}
+	n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
+
+	// Split.
+	n.split(20)
+	if n.parent != nil {
+		t.Fatalf("expected nil parent")
+	}
+}
+
+// Ensure that a node that has keys that all fit on a page just returns one leaf.
+func TestNode_split_SinglePage(t *testing.T) {
+	// Create a node.
+	n := &node{inodes: make(inodes, 0), bucket: &Bucket{tx: &Tx{db: &DB{}, meta: &meta{pgid: 1}}}}
+	n.put([]byte("00000001"), []byte("00000001"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000002"), []byte("00000002"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000003"), []byte("00000003"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000004"), []byte("00000004"), []byte("0123456701234567"), 0, 0)
+	n.put([]byte("00000005"), []byte("00000005"), []byte("0123456701234567"), 0, 0)
+
+	// Split.
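+	// All five inodes fit within a single 4096-byte page, so no split
+	// should occur and no parent node should be created.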
+	n.split(4096)
+	if n.parent != nil {
+		t.Fatalf("expected nil parent")
+	}
+}
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/page.go b/Godeps/_workspace/src/github.com/boltdb/bolt/page.go
new file mode 100644
index 000000000..58e43c4b6
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/page.go
@@ -0,0 +1,134 @@
+package bolt
+
+import (
+	"fmt"
+	"os"
+	"unsafe"
+)
+
+const pageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).ptr))
+
+const minKeysPerPage = 2
+
+const branchPageElementSize = int(unsafe.Sizeof(branchPageElement{}))
+const leafPageElementSize = int(unsafe.Sizeof(leafPageElement{}))
+
+const (
+	branchPageFlag   = 0x01
+	leafPageFlag     = 0x02
+	metaPageFlag     = 0x04
+	freelistPageFlag = 0x10
+)
+
+const (
+	bucketLeafFlag = 0x01
+)
+
+type pgid uint64
+
+type page struct {
+	id       pgid
+	flags    uint16
+	count    uint16
+	overflow uint32
+	ptr      uintptr
+}
+
+// typ returns a human readable page type string used for debugging.
+func (p *page) typ() string {
+	if (p.flags & branchPageFlag) != 0 {
+		return "branch"
+	} else if (p.flags & leafPageFlag) != 0 {
+		return "leaf"
+	} else if (p.flags & metaPageFlag) != 0 {
+		return "meta"
+	} else if (p.flags & freelistPageFlag) != 0 {
+		return "freelist"
+	}
+	return fmt.Sprintf("unknown<%02x>", p.flags)
+}
+
+// meta returns a pointer to the metadata section of the page.
+func (p *page) meta() *meta {
+	return (*meta)(unsafe.Pointer(&p.ptr))
+}
+
+// leafPageElement retrieves the leaf node by index.
+func (p *page) leafPageElement(index uint16) *leafPageElement {
+	n := &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index]
+	return n
+}
+
+// leafPageElements retrieves a list of leaf nodes.
+func (p *page) leafPageElements() []leafPageElement {
+	return ((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[:]
+}
+
+// branchPageElement retrieves the branch node by index.
+func (p *page) branchPageElement(index uint16) *branchPageElement {
+	return &((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[index]
+}
+
+// branchPageElements retrieves a list of branch nodes.
+func (p *page) branchPageElements() []branchPageElement {
+	return ((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[:]
+}
+
+// hexdump writes n bytes of the page to STDERR as hex output.
+func (p *page) hexdump(n int) {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:n]
+	fmt.Fprintf(os.Stderr, "%x\n", buf)
+}
+
+type pages []*page
+
+func (s pages) Len() int           { return len(s) }
+func (s pages) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+func (s pages) Less(i, j int) bool { return s[i].id < s[j].id }
+
+// branchPageElement represents a node on a branch page.
+type branchPageElement struct {
+	pos   uint32
+	ksize uint32
+	pgid  pgid
+}
+
+// key returns a byte slice of the node key.
+func (n *branchPageElement) key() []byte {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
+	return buf[n.pos : n.pos+n.ksize]
+}
+
+// leafPageElement represents a node on a leaf page.
+type leafPageElement struct {
+	flags uint32
+	pos   uint32
+	ksize uint32
+	vsize uint32
+}
+
+// key returns a byte slice of the node key.
+func (n *leafPageElement) key() []byte {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
+	return buf[n.pos : n.pos+n.ksize]
+}
+
+// value returns a byte slice of the node value.
+func (n *leafPageElement) value() []byte {
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
+	return buf[n.pos+n.ksize : n.pos+n.ksize+n.vsize]
+}
+
+// PageInfo represents human readable information about a page.
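+//
+// A PageInfo is obtained through Tx.Page(); for example (illustrative
+// usage, error handling elided):
+//
+//	info, _ := tx.Page(3)
+//	fmt.Println(info.Type, info.Count, info.OverflowCount)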
+type PageInfo struct { + ID int + Type string + Count int + OverflowCount int +} + +type pgids []pgid + +func (s pgids) Len() int { return len(s) } +func (s pgids) Swap(i, j int) { s[i], s[j] = s[j], s[i] } +func (s pgids) Less(i, j int) bool { return s[i] < s[j] } diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/page_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/page_test.go new file mode 100644 index 000000000..7a4d327fe --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/page_test.go @@ -0,0 +1,29 @@ +package bolt + +import ( + "testing" +) + +// Ensure that the page type can be returned in human readable format. +func TestPage_typ(t *testing.T) { + if typ := (&page{flags: branchPageFlag}).typ(); typ != "branch" { + t.Fatalf("exp=branch; got=%v", typ) + } + if typ := (&page{flags: leafPageFlag}).typ(); typ != "leaf" { + t.Fatalf("exp=leaf; got=%v", typ) + } + if typ := (&page{flags: metaPageFlag}).typ(); typ != "meta" { + t.Fatalf("exp=meta; got=%v", typ) + } + if typ := (&page{flags: freelistPageFlag}).typ(); typ != "freelist" { + t.Fatalf("exp=freelist; got=%v", typ) + } + if typ := (&page{flags: 20000}).typ(); typ != "unknown<4e20>" { + t.Fatalf("exp=unknown<4e20>; got=%v", typ) + } +} + +// Ensure that the hexdump debugging function doesn't blow up. +func TestPage_dump(t *testing.T) { + (&page{id: 256}).hexdump(16) +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/quick_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/quick_test.go new file mode 100644 index 000000000..4da581775 --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/quick_test.go @@ -0,0 +1,79 @@ +package bolt_test + +import ( + "bytes" + "flag" + "fmt" + "math/rand" + "os" + "reflect" + "testing/quick" + "time" +) + +// testing/quick defaults to 5 iterations and a random seed. +// You can override these settings from the command line: +// +// -quick.count The number of iterations to perform. +// -quick.seed The seed to use for randomizing. +// -quick.maxitems The maximum number of items to insert into a DB. +// -quick.maxksize The maximum size of a key. +// -quick.maxvsize The maximum size of a value. 
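+//
+// For example (an illustrative invocation; any of the flags above can be
+// combined):
+//
+//	go test -quick.count=100 -quick.seed=42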
+//
+
+var qcount, qseed, qmaxitems, qmaxksize, qmaxvsize int

+func init() {
+	flag.IntVar(&qcount, "quick.count", 5, "")
+	flag.IntVar(&qseed, "quick.seed", int(time.Now().UnixNano())%100000, "")
+	flag.IntVar(&qmaxitems, "quick.maxitems", 1000, "")
+	flag.IntVar(&qmaxksize, "quick.maxksize", 1024, "")
+	flag.IntVar(&qmaxvsize, "quick.maxvsize", 1024, "")
+	flag.Parse()
+	fmt.Fprintln(os.Stderr, "seed:", qseed)
+	fmt.Fprintf(os.Stderr, "quick settings: count=%v, items=%v, ksize=%v, vsize=%v\n", qcount, qmaxitems, qmaxksize, qmaxvsize)
+}
+
+func qconfig() *quick.Config {
+	return &quick.Config{
+		MaxCount: qcount,
+		Rand:     rand.New(rand.NewSource(int64(qseed))),
+	}
+}
+
+type testdata []testdataitem
+
+func (t testdata) Len() int           { return len(t) }
+func (t testdata) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
+func (t testdata) Less(i, j int) bool { return bytes.Compare(t[i].Key, t[j].Key) == -1 }
+
+func (t testdata) Generate(rand *rand.Rand, size int) reflect.Value {
+	n := rand.Intn(qmaxitems-1) + 1
+	items := make(testdata, n)
+	for i := 0; i < n; i++ {
+		item := &items[i]
+		item.Key = randByteSlice(rand, 1, qmaxksize)
+		item.Value = randByteSlice(rand, 0, qmaxvsize)
+	}
+	return reflect.ValueOf(items)
+}
+
+type revtestdata []testdataitem
+
+func (t revtestdata) Len() int           { return len(t) }
+func (t revtestdata) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
+func (t revtestdata) Less(i, j int) bool { return bytes.Compare(t[i].Key, t[j].Key) == 1 }
+
+type testdataitem struct {
+	Key   []byte
+	Value []byte
+}
+
+func randByteSlice(rand *rand.Rand, minSize, maxSize int) []byte {
+	n := rand.Intn(maxSize-minSize) + minSize
+	b := make([]byte, n)
+	for i := 0; i < n; i++ {
+		b[i] = byte(rand.Intn(255))
+	}
+	return b
+}
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/simulation_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/simulation_test.go
new file mode 100644
index 000000000..8a49a381b
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/simulation_test.go
@@ -0,0 +1,327 @@
+package bolt_test
+
+import (
+	"bytes"
+	"fmt"
+	"math/rand"
+	"sync"
+	"testing"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+func TestSimulate_1op_1p(t *testing.T)     { testSimulate(t, 1, 1) }
+func TestSimulate_10op_1p(t *testing.T)    { testSimulate(t, 10, 1) }
+func TestSimulate_100op_1p(t *testing.T)   { testSimulate(t, 100, 1) }
+func TestSimulate_1000op_1p(t *testing.T)  { testSimulate(t, 1000, 1) }
+func TestSimulate_10000op_1p(t *testing.T) { testSimulate(t, 10000, 1) }
+
+func TestSimulate_10op_10p(t *testing.T)    { testSimulate(t, 10, 10) }
+func TestSimulate_100op_10p(t *testing.T)   { testSimulate(t, 100, 10) }
+func TestSimulate_1000op_10p(t *testing.T)  { testSimulate(t, 1000, 10) }
+func TestSimulate_10000op_10p(t *testing.T) { testSimulate(t, 10000, 10) }
+
+func TestSimulate_100op_100p(t *testing.T)   { testSimulate(t, 100, 100) }
+func TestSimulate_1000op_100p(t *testing.T)  { testSimulate(t, 1000, 100) }
+func TestSimulate_10000op_100p(t *testing.T) { testSimulate(t, 10000, 100) }
+
+func TestSimulate_10000op_1000p(t *testing.T) { testSimulate(t, 10000, 1000) }
+
+// Randomly generate operations on a given database with multiple clients to ensure consistency and thread safety.
+func testSimulate(t *testing.T, threadCount, parallelism int) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	rand.Seed(int64(qseed))
+
+	// A list of operations that readers and writers can perform.
+	var readerHandlers = []simulateHandler{simulateGetHandler}
+	var writerHandlers = []simulateHandler{simulateGetHandler, simulatePutHandler}
+
+	var versions = make(map[int]*QuickDB)
+	versions[1] = NewQuickDB()
+
+	db := NewTestDB()
+	defer db.Close()
+
+	var mutex sync.Mutex
+
+	// Run n threads in parallel, each with their own operation.
+	var wg sync.WaitGroup
+	var threads = make(chan bool, parallelism)
+	var i int
+	for {
+		threads <- true
+		wg.Add(1)
+		writable := ((rand.Int() % 100) < 20) // 20% writers
+
+		// Choose an operation to execute.
+		var handler simulateHandler
+		if writable {
+			handler = writerHandlers[rand.Intn(len(writerHandlers))]
+		} else {
+			handler = readerHandlers[rand.Intn(len(readerHandlers))]
+		}
+
+		// Execute a thread for the given operation.
+		go func(writable bool, handler simulateHandler) {
+			defer wg.Done()
+
+			// Start transaction.
+			tx, err := db.Begin(writable)
+			if err != nil {
+				t.Fatal("tx begin: ", err)
+			}
+
+			// Obtain current state of the dataset.
+			mutex.Lock()
+			var qdb = versions[tx.ID()]
+			if writable {
+				qdb = versions[tx.ID()-1].Copy()
+			}
+			mutex.Unlock()
+
+			// Make sure we commit/rollback the tx at the end and update the state.
+			if writable {
+				defer func() {
+					mutex.Lock()
+					versions[tx.ID()] = qdb
+					mutex.Unlock()
+
+					ok(t, tx.Commit())
+				}()
+			} else {
+				defer tx.Rollback()
+			}
+
+			// Ignore operation if we don't have data yet.
+			if qdb == nil {
+				return
+			}
+
+			// Execute handler.
+			handler(tx, qdb)
+
+			// Release a thread back to the scheduling loop.
+			<-threads
+		}(writable, handler)
+
+		i++
+		if i > threadCount {
+			break
+		}
+	}
+
+	// Wait until all threads are done.
+	wg.Wait()
+}
+
+type simulateHandler func(tx *bolt.Tx, qdb *QuickDB)
+
+// Retrieves a key from the database and verifies that it is what is expected.
+func simulateGetHandler(tx *bolt.Tx, qdb *QuickDB) {
+	// Randomly retrieve an existing key.
+	keys := qdb.Rand()
+	if len(keys) == 0 {
+		return
+	}
+
+	// Retrieve root bucket.
+	b := tx.Bucket(keys[0])
+	if b == nil {
+		panic(fmt.Sprintf("bucket[0] expected: %08x\n", trunc(keys[0], 4)))
+	}
+
+	// Drill into nested buckets.
+	for _, key := range keys[1 : len(keys)-1] {
+		b = b.Bucket(key)
+		if b == nil {
+			panic(fmt.Sprintf("bucket[n] expected: %v -> %v\n", keys, key))
+		}
+	}
+
+	// Verify key/value on the final bucket.
+	expected := qdb.Get(keys)
+	actual := b.Get(keys[len(keys)-1])
+	if !bytes.Equal(actual, expected) {
+		fmt.Println("=== EXPECTED ===")
+		fmt.Println(expected)
+		fmt.Println("=== ACTUAL ===")
+		fmt.Println(actual)
+		fmt.Println("=== END ===")
+		panic("value mismatch")
+	}
+}
+
+// Inserts a key into the database.
+func simulatePutHandler(tx *bolt.Tx, qdb *QuickDB) {
+	var err error
+	keys, value := randKeys(), randValue()
+
+	// Retrieve root bucket.
+	b := tx.Bucket(keys[0])
+	if b == nil {
+		b, err = tx.CreateBucket(keys[0])
+		if err != nil {
+			panic("create bucket: " + err.Error())
+		}
+	}
+
+	// Create nested buckets, if necessary.
+	for _, key := range keys[1 : len(keys)-1] {
+		child := b.Bucket(key)
+		if child != nil {
+			b = child
+		} else {
+			b, err = b.CreateBucket(key)
+			if err != nil {
+				panic("create bucket: " + err.Error())
+			}
+		}
+	}
+
+	// Insert into database.
+	if err := b.Put(keys[len(keys)-1], value); err != nil {
+		panic("put: " + err.Error())
+	}
+
+	// Insert into in-memory database.
+	qdb.Put(keys, value)
+}
+
+// QuickDB is an in-memory database that replicates the functionality of the
+// Bolt DB type except that it is entirely in-memory.
It is meant for testing +// that the Bolt database is consistent. +type QuickDB struct { + sync.RWMutex + m map[string]interface{} +} + +// NewQuickDB returns an instance of QuickDB. +func NewQuickDB() *QuickDB { + return &QuickDB{m: make(map[string]interface{})} +} + +// Get retrieves the value at a key path. +func (db *QuickDB) Get(keys [][]byte) []byte { + db.RLock() + defer db.RUnlock() + + m := db.m + for _, key := range keys[:len(keys)-1] { + value := m[string(key)] + if value == nil { + return nil + } + switch value := value.(type) { + case map[string]interface{}: + m = value + case []byte: + return nil + } + } + + // Only return if it's a simple value. + if value, ok := m[string(keys[len(keys)-1])].([]byte); ok { + return value + } + return nil +} + +// Put inserts a value into a key path. +func (db *QuickDB) Put(keys [][]byte, value []byte) { + db.Lock() + defer db.Unlock() + + // Build buckets all the way down the key path. + m := db.m + for _, key := range keys[:len(keys)-1] { + if _, ok := m[string(key)].([]byte); ok { + return // Keypath intersects with a simple value. Do nothing. + } + + if m[string(key)] == nil { + m[string(key)] = make(map[string]interface{}) + } + m = m[string(key)].(map[string]interface{}) + } + + // Insert value into the last key. + m[string(keys[len(keys)-1])] = value +} + +// Rand returns a random key path that points to a simple value. +func (db *QuickDB) Rand() [][]byte { + db.RLock() + defer db.RUnlock() + if len(db.m) == 0 { + return nil + } + var keys [][]byte + db.rand(db.m, &keys) + return keys +} + +func (db *QuickDB) rand(m map[string]interface{}, keys *[][]byte) { + i, index := 0, rand.Intn(len(m)) + for k, v := range m { + if i == index { + *keys = append(*keys, []byte(k)) + if v, ok := v.(map[string]interface{}); ok { + db.rand(v, keys) + } + return + } + i++ + } + panic("quickdb rand: out-of-range") +} + +// Copy copies the entire database. +func (db *QuickDB) Copy() *QuickDB { + db.RLock() + defer db.RUnlock() + return &QuickDB{m: db.copy(db.m)} +} + +func (db *QuickDB) copy(m map[string]interface{}) map[string]interface{} { + clone := make(map[string]interface{}, len(m)) + for k, v := range m { + switch v := v.(type) { + case map[string]interface{}: + clone[k] = db.copy(v) + default: + clone[k] = v + } + } + return clone +} + +func randKey() []byte { + var min, max = 1, 1024 + n := rand.Intn(max-min) + min + b := make([]byte, n) + for i := 0; i < n; i++ { + b[i] = byte(rand.Intn(255)) + } + return b +} + +func randKeys() [][]byte { + var keys [][]byte + var count = rand.Intn(2) + 2 + for i := 0; i < count; i++ { + keys = append(keys, randKey()) + } + return keys +} + +func randValue() []byte { + n := rand.Intn(8192) + b := make([]byte, n) + for i := 0; i < n; i++ { + b[i] = byte(rand.Intn(255)) + } + return b +} diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/tx.go b/Godeps/_workspace/src/github.com/boltdb/bolt/tx.go new file mode 100644 index 000000000..fda6a210a --- /dev/null +++ b/Godeps/_workspace/src/github.com/boltdb/bolt/tx.go @@ -0,0 +1,585 @@ +package bolt + +import ( + "fmt" + "io" + "os" + "sort" + "time" + "unsafe" +) + +// txid represents the internal transaction identifier. +type txid uint64 + +// Tx represents a read-only or read/write transaction on the database. +// Read-only transactions can be used for retrieving values for keys and creating cursors. +// Read/write transactions can create and remove buckets and create and remove keys. 
+// +// IMPORTANT: You must commit or rollback transactions when you are done with +// them. Pages can not be reclaimed by the writer until no more transactions +// are using them. A long running read transaction can cause the database to +// quickly grow. +type Tx struct { + writable bool + managed bool + db *DB + meta *meta + root Bucket + pages map[pgid]*page + stats TxStats + commitHandlers []func() +} + +// init initializes the transaction. +func (tx *Tx) init(db *DB) { + tx.db = db + tx.pages = nil + + // Copy the meta page since it can be changed by the writer. + tx.meta = &meta{} + db.meta().copy(tx.meta) + + // Copy over the root bucket. + tx.root = newBucket(tx) + tx.root.bucket = &bucket{} + *tx.root.bucket = tx.meta.root + + // Increment the transaction id and add a page cache for writable transactions. + if tx.writable { + tx.pages = make(map[pgid]*page) + tx.meta.txid += txid(1) + } +} + +// ID returns the transaction id. +func (tx *Tx) ID() int { + return int(tx.meta.txid) +} + +// DB returns a reference to the database that created the transaction. +func (tx *Tx) DB() *DB { + return tx.db +} + +// Size returns current database size in bytes as seen by this transaction. +func (tx *Tx) Size() int64 { + return int64(tx.meta.pgid) * int64(tx.db.pageSize) +} + +// Writable returns whether the transaction can perform write operations. +func (tx *Tx) Writable() bool { + return tx.writable +} + +// Cursor creates a cursor associated with the root bucket. +// All items in the cursor will return a nil value because all root bucket keys point to buckets. +// The cursor is only valid as long as the transaction is open. +// Do not use a cursor after the transaction is closed. +func (tx *Tx) Cursor() *Cursor { + return tx.root.Cursor() +} + +// Stats retrieves a copy of the current transaction statistics. +func (tx *Tx) Stats() TxStats { + return tx.stats +} + +// Bucket retrieves a bucket by name. +// Returns nil if the bucket does not exist. +func (tx *Tx) Bucket(name []byte) *Bucket { + return tx.root.Bucket(name) +} + +// CreateBucket creates a new bucket. +// Returns an error if the bucket already exists, if the bucket name is blank, or if the bucket name is too long. +func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) { + return tx.root.CreateBucket(name) +} + +// CreateBucketIfNotExists creates a new bucket if it doesn't already exist. +// Returns an error if the bucket name is blank, or if the bucket name is too long. +func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) { + return tx.root.CreateBucketIfNotExists(name) +} + +// DeleteBucket deletes a bucket. +// Returns an error if the bucket cannot be found or if the key represents a non-bucket value. +func (tx *Tx) DeleteBucket(name []byte) error { + return tx.root.DeleteBucket(name) +} + +// ForEach executes a function for each bucket in the root. +// If the provided function returns an error then the iteration is stopped and +// the error is returned to the caller. +func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error { + return tx.root.ForEach(func(k, v []byte) error { + if err := fn(k, tx.root.Bucket(k)); err != nil { + return err + } + return nil + }) +} + +// OnCommit adds a handler function to be executed after the transaction successfully commits. +func (tx *Tx) OnCommit(fn func()) { + tx.commitHandlers = append(tx.commitHandlers, fn) +} + +// Commit writes all changes to disk and updates the meta page. +// Returns an error if a disk write error occurs. 
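+//
+// A minimal manual-transaction sketch (editorial illustration; the deferred
+// Rollback returns ErrTxClosed and does nothing once Commit has succeeded):
+//
+//	tx, err := db.Begin(true)
+//	if err != nil {
+//		return err
+//	}
+//	defer tx.Rollback()
+//	if _, err := tx.CreateBucketIfNotExists([]byte("widgets")); err != nil {
+//		return err
+//	}
+//	return tx.Commit()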
+func (tx *Tx) Commit() error {
+	_assert(!tx.managed, "managed tx commit not allowed")
+	if tx.db == nil {
+		return ErrTxClosed
+	} else if !tx.writable {
+		return ErrTxNotWritable
+	}
+
+	// TODO(benbjohnson): Use vectorized I/O to write out dirty pages.
+
+	// Rebalance nodes which have had deletions.
+	var startTime = time.Now()
+	tx.root.rebalance()
+	if tx.stats.Rebalance > 0 {
+		tx.stats.RebalanceTime += time.Since(startTime)
+	}
+
+	// Spill data onto dirty pages.
+	startTime = time.Now()
+	if err := tx.root.spill(); err != nil {
+		tx.rollback()
+		return err
+	}
+	tx.stats.SpillTime += time.Since(startTime)
+
+	// Free the old root bucket.
+	tx.meta.root.root = tx.root.root
+
+	// Free the freelist and allocate new pages for it. This will overestimate
+	// the size of the freelist but not underestimate the size (which would be bad).
+	tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist))
+	p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1)
+	if err != nil {
+		tx.rollback()
+		return err
+	}
+	if err := tx.db.freelist.write(p); err != nil {
+		tx.rollback()
+		return err
+	}
+	tx.meta.freelist = p.id
+
+	// Write dirty pages to disk.
+	startTime = time.Now()
+	if err := tx.write(); err != nil {
+		tx.rollback()
+		return err
+	}
+
+	// If strict mode is enabled then perform a consistency check.
+	// Only the first consistency error is reported in the panic.
+	if tx.db.StrictMode {
+		if err, ok := <-tx.Check(); ok {
+			panic("check fail: " + err.Error())
+		}
+	}
+
+	// Write meta to disk.
+	if err := tx.writeMeta(); err != nil {
+		tx.rollback()
+		return err
+	}
+	tx.stats.WriteTime += time.Since(startTime)
+
+	// Finalize the transaction.
+	tx.close()
+
+	// Execute commit handlers now that the locks have been removed.
+	for _, fn := range tx.commitHandlers {
+		fn()
+	}
+
+	return nil
+}
+
+// Rollback closes the transaction and ignores all previous updates.
+func (tx *Tx) Rollback() error {
+	_assert(!tx.managed, "managed tx rollback not allowed")
+	if tx.db == nil {
+		return ErrTxClosed
+	}
+	tx.rollback()
+	return nil
+}
+
+func (tx *Tx) rollback() {
+	if tx.db == nil {
+		return
+	}
+	if tx.writable {
+		tx.db.freelist.rollback(tx.meta.txid)
+		tx.db.freelist.reload(tx.db.page(tx.db.meta().freelist))
+	}
+	tx.close()
+}
+
+func (tx *Tx) close() {
+	if tx.db == nil {
+		return
+	}
+	if tx.writable {
+		// Grab freelist stats.
+		var freelistFreeN = tx.db.freelist.free_count()
+		var freelistPendingN = tx.db.freelist.pending_count()
+		var freelistAlloc = tx.db.freelist.size()
+
+		// Remove writer lock.
+		tx.db.rwlock.Unlock()
+
+		// Merge statistics.
+		tx.db.statlock.Lock()
+		tx.db.stats.FreePageN = freelistFreeN
+		tx.db.stats.PendingPageN = freelistPendingN
+		tx.db.stats.FreeAlloc = (freelistFreeN + freelistPendingN) * tx.db.pageSize
+		tx.db.stats.FreelistInuse = freelistAlloc
+		tx.db.stats.TxStats.add(&tx.stats)
+		tx.db.statlock.Unlock()
+	} else {
+		tx.db.removeTx(tx)
+	}
+	tx.db = nil
+}
+
+// Copy writes the entire database to a writer.
+// This function exists for backwards compatibility. Use WriteTo() instead.
+func (tx *Tx) Copy(w io.Writer) error {
+	_, err := tx.WriteTo(w)
+	return err
+}
+
+// WriteTo writes the entire database to a writer.
+// If err == nil then exactly tx.Size() bytes will be written into the writer.
+func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
+	// Attempt to open reader directly.
+	var f *os.File
+	if f, err = os.OpenFile(tx.db.path, os.O_RDONLY|odirect, 0); err != nil {
+		// Fallback to a regular open if that doesn't work.
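+		// (Not every filesystem accepts the odirect flag, hence the plain
+		// retry below.)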
+		if f, err = os.OpenFile(tx.db.path, os.O_RDONLY, 0); err != nil {
+			return 0, err
+		}
+	}
+
+	// Copy the meta pages.
+	tx.db.metalock.Lock()
+	n, err = io.CopyN(w, f, int64(tx.db.pageSize*2))
+	tx.db.metalock.Unlock()
+	if err != nil {
+		_ = f.Close()
+		return n, fmt.Errorf("meta copy: %s", err)
+	}
+
+	// Copy data pages.
+	wn, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2))
+	n += wn
+	if err != nil {
+		_ = f.Close()
+		return n, err
+	}
+
+	return n, f.Close()
+}
+
+// CopyFile copies the entire database to a file at the given path.
+// A reader transaction is maintained during the copy so it is safe to continue
+// using the database while a copy is in progress.
+func (tx *Tx) CopyFile(path string, mode os.FileMode) error {
+	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode)
+	if err != nil {
+		return err
+	}
+
+	err = tx.Copy(f)
+	if err != nil {
+		_ = f.Close()
+		return err
+	}
+	return f.Close()
+}
+
+// Check performs several consistency checks on the database for this transaction.
+// An error is returned if any inconsistency is found.
+//
+// It can be safely run concurrently on a writable transaction. However, this
+// incurs a high cost for large databases and databases with a lot of subbuckets
+// because of caching. This overhead can be removed if running on a read-only
+// transaction; however, it is not safe to execute other writer transactions at
+// the same time.
+func (tx *Tx) Check() <-chan error {
+	ch := make(chan error)
+	go tx.check(ch)
+	return ch
+}
+
+func (tx *Tx) check(ch chan error) {
+	// Check if any pages are double freed.
+	freed := make(map[pgid]bool)
+	for _, id := range tx.db.freelist.all() {
+		if freed[id] {
+			ch <- fmt.Errorf("page %d: already freed", id)
+		}
+		freed[id] = true
+	}
+
+	// Track every reachable page.
+	reachable := make(map[pgid]*page)
+	reachable[0] = tx.page(0) // meta0
+	reachable[1] = tx.page(1) // meta1
+	for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ {
+		reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist)
+	}
+
+	// Recursively check buckets.
+	tx.checkBucket(&tx.root, reachable, freed, ch)
+
+	// Ensure all pages below high water mark are either reachable or freed.
+	for i := pgid(0); i < tx.meta.pgid; i++ {
+		_, isReachable := reachable[i]
+		if !isReachable && !freed[i] {
+			ch <- fmt.Errorf("page %d: unreachable unfreed", int(i))
+		}
+	}
+
+	// Close the channel to signal completion.
+	close(ch)
+}
+
+func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bool, ch chan error) {
+	// Ignore inline buckets.
+	if b.root == 0 {
+		return
+	}
+
+	// Check every page used by this bucket.
+	b.tx.forEachPage(b.root, 0, func(p *page, _ int) {
+		if p.id > tx.meta.pgid {
+			ch <- fmt.Errorf("page %d: out of bounds: %d", int(p.id), int(b.tx.meta.pgid))
+		}
+
+		// Ensure each page is only referenced once.
+		for i := pgid(0); i <= pgid(p.overflow); i++ {
+			var id = p.id + i
+			if _, ok := reachable[id]; ok {
+				ch <- fmt.Errorf("page %d: multiple references", int(id))
+			}
+			reachable[id] = p
+		}
+
+		// We should only encounter un-freed leaf and branch pages.
+		if freed[p.id] {
+			ch <- fmt.Errorf("page %d: reachable freed", int(p.id))
+		} else if (p.flags&branchPageFlag) == 0 && (p.flags&leafPageFlag) == 0 {
+			ch <- fmt.Errorf("page %d: invalid type: %s", int(p.id), p.typ())
+		}
+	})
+
+	// Check each bucket within this bucket.
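+	// (The error returned by ForEach is deliberately discarded below; any
+	// inconsistencies are reported on ch instead.)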
+	_ = b.ForEach(func(k, v []byte) error {
+		if child := b.Bucket(k); child != nil {
+			tx.checkBucket(child, reachable, freed, ch)
+		}
+		return nil
+	})
+}
+
+// allocate returns a contiguous block of memory starting at a given page.
+func (tx *Tx) allocate(count int) (*page, error) {
+	p, err := tx.db.allocate(count)
+	if err != nil {
+		return nil, err
+	}
+
+	// Save to our page cache.
+	tx.pages[p.id] = p
+
+	// Update statistics.
+	tx.stats.PageCount++
+	tx.stats.PageAlloc += count * tx.db.pageSize
+
+	return p, nil
+}
+
+// write writes any dirty pages to disk.
+func (tx *Tx) write() error {
+	// Sort pages by id.
+	pages := make(pages, 0, len(tx.pages))
+	for _, p := range tx.pages {
+		pages = append(pages, p)
+	}
+	sort.Sort(pages)
+
+	// Write pages to disk in order.
+	for _, p := range pages {
+		size := (int(p.overflow) + 1) * tx.db.pageSize
+		buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:size]
+		offset := int64(p.id) * int64(tx.db.pageSize)
+		if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
+			return err
+		}
+
+		// Update statistics.
+		tx.stats.Write++
+	}
+	if !tx.db.NoSync || IgnoreNoSync {
+		if err := fdatasync(tx.db); err != nil {
+			return err
+		}
+	}
+
+	// Clear out page cache.
+	tx.pages = make(map[pgid]*page)
+
+	return nil
+}
+
+// writeMeta writes the meta to the disk.
+func (tx *Tx) writeMeta() error {
+	// Create a temporary buffer for the meta page.
+	buf := make([]byte, tx.db.pageSize)
+	p := tx.db.pageInBuffer(buf, 0)
+	tx.meta.write(p)
+
+	// Write the meta page to file.
+	if _, err := tx.db.ops.writeAt(buf, int64(p.id)*int64(tx.db.pageSize)); err != nil {
+		return err
+	}
+	if !tx.db.NoSync || IgnoreNoSync {
+		if err := fdatasync(tx.db); err != nil {
+			return err
+		}
+	}
+
+	// Update statistics.
+	tx.stats.Write++
+
+	return nil
+}
+
+// page returns a reference to the page with a given id.
+// If page has been written to then a temporary buffered page is returned.
+func (tx *Tx) page(id pgid) *page {
+	// Check the dirty pages first.
+	if tx.pages != nil {
+		if p, ok := tx.pages[id]; ok {
+			return p
+		}
+	}
+
+	// Otherwise return directly from the mmap.
+	return tx.db.page(id)
+}
+
+// forEachPage iterates over every page within a given page and executes a function.
+func (tx *Tx) forEachPage(pgid pgid, depth int, fn func(*page, int)) {
+	p := tx.page(pgid)
+
+	// Execute function.
+	fn(p, depth)
+
+	// Recursively loop over children.
+	if (p.flags & branchPageFlag) != 0 {
+		for i := 0; i < int(p.count); i++ {
+			elem := p.branchPageElement(uint16(i))
+			tx.forEachPage(elem.pgid, depth+1, fn)
+		}
+	}
+}
+
+// Page returns page information for a given page number.
+// This is only safe for concurrent use when used by a writable transaction.
+func (tx *Tx) Page(id int) (*PageInfo, error) {
+	if tx.db == nil {
+		return nil, ErrTxClosed
+	} else if pgid(id) >= tx.meta.pgid {
+		return nil, nil
+	}
+
+	// Build the page info.
+	p := tx.db.page(pgid(id))
+	info := &PageInfo{
+		ID:            id,
+		Count:         int(p.count),
+		OverflowCount: int(p.overflow),
+	}
+
+	// Determine the type (or if it's free).
+	if tx.db.freelist.freed(pgid(id)) {
+		info.Type = "free"
+	} else {
+		info.Type = p.typ()
+	}
+
+	return info, nil
+}
+
+// TxStats represents statistics about the actions performed by the transaction.
+type TxStats struct {
+	// Page statistics.
+	PageCount int // number of page allocations
+	PageAlloc int // total bytes allocated
+
+	// Cursor statistics.
+	CursorCount int // number of cursors created
+
+	// Node statistics.
+	NodeCount int // number of node allocations
+	NodeDeref int // number of node dereferences
+
+	// Rebalance statistics.
+	Rebalance     int           // number of node rebalances
+	RebalanceTime time.Duration // total time spent rebalancing
+
+	// Split/Spill statistics.
+	Split     int           // number of nodes split
+	Spill     int           // number of nodes spilled
+	SpillTime time.Duration // total time spent spilling
+
+	// Write statistics.
+	Write     int           // number of writes performed
+	WriteTime time.Duration // total time spent writing to disk
+}
+
+func (s *TxStats) add(other *TxStats) {
+	s.PageCount += other.PageCount
+	s.PageAlloc += other.PageAlloc
+	s.CursorCount += other.CursorCount
+	s.NodeCount += other.NodeCount
+	s.NodeDeref += other.NodeDeref
+	s.Rebalance += other.Rebalance
+	s.RebalanceTime += other.RebalanceTime
+	s.Split += other.Split
+	s.Spill += other.Spill
+	s.SpillTime += other.SpillTime
+	s.Write += other.Write
+	s.WriteTime += other.WriteTime
+}
+
+// Sub calculates and returns the difference between two sets of transaction stats.
+// This is useful when obtaining stats at two different points in time and
+// you need the performance counters that occurred within that time span.
+func (s *TxStats) Sub(other *TxStats) TxStats {
+	var diff TxStats
+	diff.PageCount = s.PageCount - other.PageCount
+	diff.PageAlloc = s.PageAlloc - other.PageAlloc
+	diff.CursorCount = s.CursorCount - other.CursorCount
+	diff.NodeCount = s.NodeCount - other.NodeCount
+	diff.NodeDeref = s.NodeDeref - other.NodeDeref
+	diff.Rebalance = s.Rebalance - other.Rebalance
+	diff.RebalanceTime = s.RebalanceTime - other.RebalanceTime
+	diff.Split = s.Split - other.Split
+	diff.Spill = s.Spill - other.Spill
+	diff.SpillTime = s.SpillTime - other.SpillTime
+	diff.Write = s.Write - other.Write
+	diff.WriteTime = s.WriteTime - other.WriteTime
+	return diff
+}
diff --git a/Godeps/_workspace/src/github.com/boltdb/bolt/tx_test.go b/Godeps/_workspace/src/github.com/boltdb/bolt/tx_test.go
new file mode 100644
index 000000000..9612f336a
--- /dev/null
+++ b/Godeps/_workspace/src/github.com/boltdb/bolt/tx_test.go
@@ -0,0 +1,424 @@
+package bolt_test
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"testing"
+
+	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt"
+)
+
+// Ensure that committing a closed transaction returns an error.
+func TestTx_Commit_Closed(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(true)
+	tx.CreateBucket([]byte("foo"))
+	ok(t, tx.Commit())
+	equals(t, tx.Commit(), bolt.ErrTxClosed)
+}
+
+// Ensure that rolling back a closed transaction returns an error.
+func TestTx_Rollback_Closed(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(true)
+	ok(t, tx.Rollback())
+	equals(t, tx.Rollback(), bolt.ErrTxClosed)
+}
+
+// Ensure that committing a read-only transaction returns an error.
+func TestTx_Commit_ReadOnly(t *testing.T) {
+	db := NewTestDB()
+	defer db.Close()
+	tx, _ := db.Begin(false)
+	equals(t, tx.Commit(), bolt.ErrTxNotWritable)
+}
+
+// Ensure that a transaction can retrieve a cursor on the root bucket.
+func TestTx_Cursor(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.CreateBucket([]byte("woojits")) + c := tx.Cursor() + + k, v := c.First() + equals(t, "widgets", string(k)) + assert(t, v == nil, "") + + k, v = c.Next() + equals(t, "woojits", string(k)) + assert(t, v == nil, "") + + k, v = c.Next() + assert(t, k == nil, "") + assert(t, v == nil, "") + + return nil + }) +} + +// Ensure that creating a bucket with a read-only transaction returns an error. +func TestTx_CreateBucket_ReadOnly(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.View(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("foo")) + assert(t, b == nil, "") + equals(t, bolt.ErrTxNotWritable, err) + return nil + }) +} + +// Ensure that creating a bucket on a closed transaction returns an error. +func TestTx_CreateBucket_Closed(t *testing.T) { + db := NewTestDB() + defer db.Close() + tx, _ := db.Begin(true) + tx.Commit() + b, err := tx.CreateBucket([]byte("foo")) + assert(t, b == nil, "") + equals(t, bolt.ErrTxClosed, err) +} + +// Ensure that a Tx can retrieve a bucket. +func TestTx_Bucket(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + b := tx.Bucket([]byte("widgets")) + assert(t, b != nil, "") + return nil + }) +} + +// Ensure that a Tx retrieving a non-existent key returns nil. +func TestTx_Get_Missing(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + value := tx.Bucket([]byte("widgets")).Get([]byte("no_such_key")) + assert(t, value == nil, "") + return nil + }) +} + +// Ensure that a bucket can be created and retrieved. +func TestTx_CreateBucket(t *testing.T) { + db := NewTestDB() + defer db.Close() + + // Create a bucket. + db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + assert(t, b != nil, "") + ok(t, err) + return nil + }) + + // Read the bucket through a separate transaction. + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + assert(t, b != nil, "") + return nil + }) +} + +// Ensure that a bucket can be created if it doesn't already exist. +func TestTx_CreateBucketIfNotExists(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucketIfNotExists([]byte("widgets")) + assert(t, b != nil, "") + ok(t, err) + + b, err = tx.CreateBucketIfNotExists([]byte("widgets")) + assert(t, b != nil, "") + ok(t, err) + + b, err = tx.CreateBucketIfNotExists([]byte{}) + assert(t, b == nil, "") + equals(t, bolt.ErrBucketNameRequired, err) + + b, err = tx.CreateBucketIfNotExists(nil) + assert(t, b == nil, "") + equals(t, bolt.ErrBucketNameRequired, err) + return nil + }) + + // Read the bucket through a separate transaction. + db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte("widgets")) + assert(t, b != nil, "") + return nil + }) +} + +// Ensure that a bucket cannot be created twice. +func TestTx_CreateBucket_Exists(t *testing.T) { + db := NewTestDB() + defer db.Close() + // Create a bucket. + db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + assert(t, b != nil, "") + ok(t, err) + return nil + }) + + // Create the same bucket again. 
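+	// Expect ErrBucketExists and a nil bucket this time.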
+ db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket([]byte("widgets")) + assert(t, b == nil, "") + equals(t, bolt.ErrBucketExists, err) + return nil + }) +} + +// Ensure that a bucket is created with a non-blank name. +func TestTx_CreateBucket_NameRequired(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + b, err := tx.CreateBucket(nil) + assert(t, b == nil, "") + equals(t, bolt.ErrBucketNameRequired, err) + return nil + }) +} + +// Ensure that a bucket can be deleted. +func TestTx_DeleteBucket(t *testing.T) { + db := NewTestDB() + defer db.Close() + + // Create a bucket and add a value. + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + return nil + }) + + // Delete the bucket and make sure we can't get the value. + db.Update(func(tx *bolt.Tx) error { + ok(t, tx.DeleteBucket([]byte("widgets"))) + assert(t, tx.Bucket([]byte("widgets")) == nil, "") + return nil + }) + + db.Update(func(tx *bolt.Tx) error { + // Create the bucket again and make sure there's not a phantom value. + b, err := tx.CreateBucket([]byte("widgets")) + assert(t, b != nil, "") + ok(t, err) + assert(t, tx.Bucket([]byte("widgets")).Get([]byte("foo")) == nil, "") + return nil + }) +} + +// Ensure that deleting a bucket on a closed transaction returns an error. +func TestTx_DeleteBucket_Closed(t *testing.T) { + db := NewTestDB() + defer db.Close() + tx, _ := db.Begin(true) + tx.Commit() + equals(t, tx.DeleteBucket([]byte("foo")), bolt.ErrTxClosed) +} + +// Ensure that deleting a bucket with a read-only transaction returns an error. +func TestTx_DeleteBucket_ReadOnly(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.View(func(tx *bolt.Tx) error { + equals(t, tx.DeleteBucket([]byte("foo")), bolt.ErrTxNotWritable) + return nil + }) +} + +// Ensure that nothing happens when deleting a bucket that doesn't exist. +func TestTx_DeleteBucket_NotFound(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + equals(t, bolt.ErrBucketNotFound, tx.DeleteBucket([]byte("widgets"))) + return nil + }) +} + +// Ensure that Tx commit handlers are called after a transaction successfully commits. +func TestTx_OnCommit(t *testing.T) { + var x int + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.OnCommit(func() { x += 1 }) + tx.OnCommit(func() { x += 2 }) + _, err := tx.CreateBucket([]byte("widgets")) + return err + }) + equals(t, 3, x) +} + +// Ensure that Tx commit handlers are NOT called after a transaction rolls back. +func TestTx_OnCommit_Rollback(t *testing.T) { + var x int + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.OnCommit(func() { x += 1 }) + tx.OnCommit(func() { x += 2 }) + tx.CreateBucket([]byte("widgets")) + return errors.New("rollback this commit") + }) + equals(t, 0, x) +} + +// Ensure that the database can be copied to a file path. 
+func TestTx_CopyFile(t *testing.T) { + db := NewTestDB() + defer db.Close() + var dest = tempfile() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte("bat")) + return nil + }) + + ok(t, db.View(func(tx *bolt.Tx) error { return tx.CopyFile(dest, 0600) })) + + db2, err := bolt.Open(dest, 0600, nil) + ok(t, err) + defer db2.Close() + + db2.View(func(tx *bolt.Tx) error { + equals(t, []byte("bar"), tx.Bucket([]byte("widgets")).Get([]byte("foo"))) + equals(t, []byte("bat"), tx.Bucket([]byte("widgets")).Get([]byte("baz"))) + return nil + }) +} + +type failWriterError struct{} + +func (failWriterError) Error() string { + return "error injected for tests" +} + +type failWriter struct { + // fail after this many bytes + After int +} + +func (f *failWriter) Write(p []byte) (n int, err error) { + n = len(p) + if n > f.After { + n = f.After + err = failWriterError{} + } + f.After -= n + return n, err +} + +// Ensure that Copy handles write errors right. +func TestTx_CopyFile_Error_Meta(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte("bat")) + return nil + }) + + err := db.View(func(tx *bolt.Tx) error { return tx.Copy(&failWriter{}) }) + equals(t, err.Error(), "meta copy: error injected for tests") +} + +// Ensure that Copy handles write errors right. +func TestTx_CopyFile_Error_Normal(t *testing.T) { + db := NewTestDB() + defer db.Close() + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + tx.Bucket([]byte("widgets")).Put([]byte("baz"), []byte("bat")) + return nil + }) + + err := db.View(func(tx *bolt.Tx) error { return tx.Copy(&failWriter{3 * db.Info().PageSize}) }) + equals(t, err.Error(), "error injected for tests") +} + +func ExampleTx_Rollback() { + // Open the database. + db, _ := bolt.Open(tempfile(), 0666, nil) + defer os.Remove(db.Path()) + defer db.Close() + + // Create a bucket. + db.Update(func(tx *bolt.Tx) error { + _, err := tx.CreateBucket([]byte("widgets")) + return err + }) + + // Set a value for a key. + db.Update(func(tx *bolt.Tx) error { + return tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + }) + + // Update the key but rollback the transaction so it never saves. + tx, _ := db.Begin(true) + b := tx.Bucket([]byte("widgets")) + b.Put([]byte("foo"), []byte("baz")) + tx.Rollback() + + // Ensure that our original value is still set. + db.View(func(tx *bolt.Tx) error { + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + fmt.Printf("The value for 'foo' is still: %s\n", value) + return nil + }) + + // Output: + // The value for 'foo' is still: bar +} + +func ExampleTx_CopyFile() { + // Open the database. + db, _ := bolt.Open(tempfile(), 0666, nil) + defer os.Remove(db.Path()) + defer db.Close() + + // Create a bucket and a key. + db.Update(func(tx *bolt.Tx) error { + tx.CreateBucket([]byte("widgets")) + tx.Bucket([]byte("widgets")).Put([]byte("foo"), []byte("bar")) + return nil + }) + + // Copy the database to another file. + toFile := tempfile() + db.View(func(tx *bolt.Tx) error { return tx.CopyFile(toFile, 0666) }) + defer os.Remove(toFile) + + // Open the cloned database. 
+ db2, _ := bolt.Open(toFile, 0666, nil) + defer db2.Close() + + // Ensure that the key exists in the copy. + db2.View(func(tx *bolt.Tx) error { + value := tx.Bucket([]byte("widgets")).Get([]byte("foo")) + fmt.Printf("The value for 'foo' in the clone is: %s\n", value) + return nil + }) + + // Output: + // The value for 'foo' in the clone is: bar +} diff --git a/storage/backend/backend.go b/storage/backend/backend.go index c91492ed9..b2b85b63d 100644 --- a/storage/backend/backend.go +++ b/storage/backend/backend.go @@ -4,7 +4,7 @@ import ( "log" "time" - "github.com/boltdb/bolt" + "github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt" ) type Backend interface { diff --git a/storage/backend/batch_tx.go b/storage/backend/batch_tx.go index 45bbf3a24..5df76f22e 100644 --- a/storage/backend/batch_tx.go +++ b/storage/backend/batch_tx.go @@ -5,7 +5,7 @@ import ( "log" "sync" - "github.com/boltdb/bolt" + "github.com/coreos/etcd/Godeps/_workspace/src/github.com/boltdb/bolt" ) type BatchTx interface {