From 78a5de2ea4c6dd1999fb06a727db69e6eb044c71 Mon Sep 17 00:00:00 2001 From: Konstantin Isakov Date: Thu, 18 Jul 2013 21:33:25 +0400 Subject: [PATCH] Initial import of all source code. --- CMakeLists.txt | 36 ++ README.md | 172 +++++++- adler32.hh | 35 ++ appendallocator.cc | 94 ++++ appendallocator.hh | 67 +++ backup_creator.cc | 280 ++++++++++++ backup_creator.hh | 91 ++++ backup_file.cc | 48 ++ backup_file.hh | 28 ++ backup_restorer.cc | 54 +++ backup_restorer.hh | 33 ++ bundle.cc | 208 +++++++++ bundle.hh | 106 +++++ check.hh | 38 ++ chunk_id.cc | 46 ++ chunk_id.hh | 38 ++ chunk_index.cc | 153 +++++++ chunk_index.hh | 109 +++++ chunk_storage.cc | 223 ++++++++++ chunk_storage.hh | 137 ++++++ debug.cc | 4 + debug.hh | 26 ++ dir.cc | 119 +++++ dir.hh | 85 ++++ encrypted_file.cc | 391 +++++++++++++++++ encrypted_file.hh | 137 ++++++ encryption.cc | 120 +++++ encryption.hh | 56 +++ encryption_key.cc | 107 +++++ encryption_key.hh | 50 +++ endian.hh | 24 + ex.hh | 54 +++ file.cc | 361 +++++++++++++++ file.hh | 160 +++++++ hex.cc | 29 ++ hex.hh | 12 + index_file.cc | 78 ++++ index_file.hh | 60 +++ message.cc | 44 ++ message.hh | 40 ++ mt.cc | 83 ++++ mt.hh | 87 ++++ nocopy.hh | 19 + objectcache.cc | 47 ++ objectcache.hh | 127 ++++++ page_size.cc | 16 + page_size.hh | 10 + random.cc | 19 + random.hh | 21 + rolling_hash.cc | 29 ++ rolling_hash.hh | 81 ++++ sha256.cc | 27 ++ sha256.hh | 37 ++ sptr.hh | 156 +++++++ static_assert.hh | 28 ++ storage_info_file.cc | 42 ++ storage_info_file.hh | 28 ++ tartool/CMakeLists.txt | 11 + tartool/tartool.cc | 192 ++++++++ tests/TODO.txt | 1 + tests/encrypted_file/encrypted_file.pro | 38 ++ tests/encrypted_file/test_encrypted_file.cc | 174 ++++++++ tests/rolling_hash/rolling_hash.pro | 16 + tests/rolling_hash/test_rolling_hash.cc | 120 +++++ tmp_mgr.cc | 63 +++ tmp_mgr.hh | 62 +++ unbuffered_file.cc | 97 ++++ unbuffered_file.hh | 62 +++ zbackup.cc | 461 ++++++++++++++++++++ zbackup.hh | 97 ++++ zbackup.proto | 109 +++++ 71 files 
changed, 6310 insertions(+), 3 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 adler32.hh create mode 100644 appendallocator.cc create mode 100644 appendallocator.hh create mode 100644 backup_creator.cc create mode 100644 backup_creator.hh create mode 100644 backup_file.cc create mode 100644 backup_file.hh create mode 100644 backup_restorer.cc create mode 100644 backup_restorer.hh create mode 100644 bundle.cc create mode 100644 bundle.hh create mode 100644 check.hh create mode 100644 chunk_id.cc create mode 100644 chunk_id.hh create mode 100644 chunk_index.cc create mode 100644 chunk_index.hh create mode 100644 chunk_storage.cc create mode 100644 chunk_storage.hh create mode 100644 debug.cc create mode 100644 debug.hh create mode 100644 dir.cc create mode 100644 dir.hh create mode 100644 encrypted_file.cc create mode 100644 encrypted_file.hh create mode 100644 encryption.cc create mode 100644 encryption.hh create mode 100644 encryption_key.cc create mode 100644 encryption_key.hh create mode 100644 endian.hh create mode 100644 ex.hh create mode 100644 file.cc create mode 100644 file.hh create mode 100644 hex.cc create mode 100644 hex.hh create mode 100644 index_file.cc create mode 100644 index_file.hh create mode 100644 message.cc create mode 100644 message.hh create mode 100644 mt.cc create mode 100644 mt.hh create mode 100644 nocopy.hh create mode 100644 objectcache.cc create mode 100644 objectcache.hh create mode 100644 page_size.cc create mode 100644 page_size.hh create mode 100644 random.cc create mode 100644 random.hh create mode 100644 rolling_hash.cc create mode 100644 rolling_hash.hh create mode 100644 sha256.cc create mode 100644 sha256.hh create mode 100644 sptr.hh create mode 100644 static_assert.hh create mode 100644 storage_info_file.cc create mode 100644 storage_info_file.hh create mode 100644 tartool/CMakeLists.txt create mode 100644 tartool/tartool.cc create mode 100644 tests/TODO.txt create mode 100644 
tests/encrypted_file/encrypted_file.pro create mode 100644 tests/encrypted_file/test_encrypted_file.cc create mode 100644 tests/rolling_hash/rolling_hash.pro create mode 100644 tests/rolling_hash/test_rolling_hash.cc create mode 100644 tmp_mgr.cc create mode 100644 tmp_mgr.hh create mode 100644 unbuffered_file.cc create mode 100644 unbuffered_file.hh create mode 100644 zbackup.cc create mode 100644 zbackup.hh create mode 100644 zbackup.proto diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..07ae713 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,36 @@ +# Copyright (c) 2012-2013 Konstantin Isakov +# Part of ZBackup. Licensed under GNU GPLv2 or later + +cmake_minimum_required( VERSION 2.8.9 ) +project( zbackup ) + +set( CMAKE_BUILD_TYPE Release ) + +find_package( ZLIB REQUIRED ) +include_directories( ${ZLIB_INCLUDE_DIRS} ) + +find_package( OpenSSL REQUIRED ) +include_directories( ${OPENSSL_INCLUDE_DIR} ) + +find_package( Protobuf REQUIRED ) +include_directories( ${PROTOBUF_INCLUDE_DIRS} ) +include_directories( ${CMAKE_CURRENT_BINARY_DIR} ) +PROTOBUF_GENERATE_CPP( protoSrcs protoHdrs zbackup.proto ) + +find_package( Threads REQUIRED ) + +find_package( LibLZMA REQUIRED ) +include_directories( ${LIBLZMA_INCLUDE_DIRS} ) + +file( GLOB sourceFiles "*.cc" ) +add_executable( zbackup ${sourceFiles} ${protoSrcs} ${protoHdrs} ) + +target_link_libraries( zbackup + ${PROTOBUF_LIBRARIES} + ${OPENSSL_LIBRARIES} + ${CMAKE_THREAD_LIBS_INIT} + ${ZLIB_LIBRARIES} + ${LIBLZMA_LIBRARIES} +) + +install( TARGETS zbackup DESTINATION bin ) diff --git a/README.md b/README.md index e057563..7a5fb55 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,170 @@ -zbackup -======= +# Introduction -ZBackup, a versatile deduplicating backup tool +**zbackup** is globally-deduplicating backup tool, based on the ideas found in [rsync](http://rsync.samba.org/). 
Feed a large `.tar` into it, and it will eliminate any duplicate blocks in it, then compress and optionally encrypt the result. Feed another `.tar` file, and it will not only eliminate any duplicate data in it, but also re-use any blocks found in any previous backups. This way only new changes are stored, and as long as the files are not very different, the amount of storage required is very low. The program is format-agnostic, so you can feed virtually any files to it (any types of archives, proprietary formats, even raw disk images -- but see [Caveats](#caveats)). + +This is achieved by sliding a window with a rolling hash over the input at a byte granularity and checking whether the block in focus was ever met already. If a rolling hash matches, an additional full cryptographic hash is calculated to ensure the block is indeed the same. The deduplication happens then. + +# Features + +The program has the following features: + + * Parallel LZMA compression of the stored data + * Built-in AES encryption of the stored data + * Possibility to delete old backup data in the future + * Use of a 64-bit rolling hash, keeping the amount of soft collisions to zero + * Repository consists of immutable files. No existing files are ever modified + * Written in C++ only with only modest library dependencies + * Safe to use in production (see [below](#safety)) + +# Build dependencies + + * `cmake` >= 6.8.9 (though it should not be too hard to compile the sources by hand if needed) + * `libssl-dev` for all encryption, hashing and random numbers + * `libprotobuf-dev` for data serialization + * `liblzma-dev` for compression + * `zlib1g-dev` for adler32 calcuation + +# Quickstart + +To build: + +```bash +cd zbackup +cmake . 
+make +sudo make install +# or just run as ./zbackup +``` + +To use: + +```bash +zbackup init --non-enrypted /my/backup/repo +tar c /my/precious/data | zbackup backup /my/backup/repo/backups/backup-`date '+%Y-%m-%d'` +zbackup restore /my/backup/repo/backups/backup-`date '+%Y-%m-%d'` > /my/precious/backup-restored.tar +``` + +If you have a lot of RAM to spare, you can use it to speed-up the restore process -- to use 512 MB more, pass `--cache-size 512mb` when restoring. + +If encryption is wanted, create a file with your password: + +``` bash +# more secure to to use an editor +echo mypassword > ~/.my_backup_password +chmod 600 ~/.my_backup_password +``` + +Then init the repo the following way: + +```bash +zbackup init --password-file ~/.my_backup_password /my/backup/repo +``` + +And always pass the same argument afterwards: +```bash +tar c /my/precious/data | zbackup --password-file ~/.my_backup_password backup /my/backup/repo/backups/backup-`date '+%Y-%m-%d'` +zbackup --password-file ~/.my_backup_password restore /my/backup/repo/backups/backup-`date '+%Y-%m-%d'` > /my/precious/backup-restored.tar +``` + +If you have a 32-bit system and a lot of cores, consider lowering the number of compression threads by passing `--threads 4` or `--threads 2` if the program runs out of address space when backing up (see why [below](#caveats)). There should be no problem on a 64-bit system. + +# Caveats + + * While you can pipe any data into the program, the data should be uncompressed and unencrypted -- otherwise no deduplication could be performed on it. `zbackup` would compress and encrypt the data itself, so there's no need to do that yourself. So just run `tar c` and pipe it into `zbackup` directly. If backing up disk images employing encryption, pipe the unencrypted version (the one you normally mount). If you create `.zip` or `.rar` files, use no compression (`-0` or `-m0`) and no encryption. 
+ * Parallel LZMA compression uses a lot of RAM (several hundreds of megabytes, depending on the number of threads used), and ten times more virtual address space. The latter is only relevant on 32-bit architectures where it's limited to 2 or 3 GB. If you hit the ceiling, lower the number of threads with `--threads`. + * Since the data is deduplicated, there's naturally no redundancy in it. A loss of a single file can lead to a loss of virtually all data. Make sure you store it on a redundant storage (RAID1, a cloud provider etc). + * The encryption key, if used, is stored in the `info` file in the root of the repo. It is encrypted with your password. Technically thus you can change your password without re-encrypting any data, and as long as no one possesses the old `info` file and knows your old password, you would be safe (even though the actual option to change password is not implemented yet -- someone who needs this is welcome to create a pull request -- the possibility is all there). Also note that it is crucial you don't lose your `info` file, as otherwise the whole backup would be lost. + +# Limitations + + * Right now the only modes supported are reading from standard input and writing to standard output. FUSE mounts and NBD servers may be added later if someone contributes the code. + * The program keeps all known blocks in an in-RAM hash table, which may create scalability problems for very large repos (see [below](#scalability)). + * The only encryption mode currently implemented is `AES-128` in `CBC` mode with `PKCS#7` padding. If you believe that this is not secure enough, patches are welcome. Before you jump to conclusions however, read [this article](http://www.schneier.com/blog/archives/2009/07/another_new_aes.html). + * The only compression mode supported is LZMA, which suits backups very nicely. + * It's only possible to fully restore the backup in order to get to a required file, without any option to quickly pick it out. 
`tar` would not allow to do it anyway, but e.g. for `zip` files it could have been possible. This is possible to implement though, e.g. by exposing the data over a FUSE filesystem. + * There's no option to delete old backup data yet. The possibility is all there, though. Someone needs to implement it (see [below](#improvements)). + * There's no option to specify block and bundle sizes other than the default ones (currently `64k` and `2MB` respectively), though it's trivial to add command-line switches for those. + +Most of those limitations can be lifted by implementing the respective features. + +# Safety + +Is it safe to use `zbackup` for production data? Being free software, the program comes with no warranty of any kind. That said, it's perfectly safe for production, and here's why. When performing a backup, the program never modifies or deletes any existing files -- only new ones are created. It specifically checks for that, and the code paths involved are short and easy to inspect. Furthermore, each backup is protected by its `SHA256` sum, which is calculated before piping the data into the deduplication logic. The code path doing that is also short and easy to inspect. When a backup is being restored, its `SHA256` is calculated again and compared against the stored one. The program would fail on a mismatch. Therefore, to ensure safety it is enough to restore each backup to `/dev/null` immediately after creating it. If it restores fine, it will restore fine ever after. +To add some statistics, the author of the program has been using an older version of `zbackup` internally for over a year. The `SHA256` check never ever failed. Again, even if it does, you would know immediately, so no work would be lost. Therefore you are welcome to try the program in production, and if you like it, stick with it. + +# Usage notes + +The repository has the following directory structure: + +``` +/repo + backups/ + bundles/ + 00/ + 01/ + 02/ + ... 
+ index/ + info +``` + + * The `backups` directory contain your backups. Those are very small files which are needed for restoration. They are encrypted if encryption is enabled. The names can be arbitrary. It is possible to arrange files in subdirectories, too. Free renaming is also allowed. + * The `bundles` directory contains the bulk of data. Each bundle internally contains multiple small chunks, compressed together and encrypted. Together all those chunks account for all deduplicated data stored. + * The `index` directory contains the full index of all chunks in the repository, together with their bundle names. A separate index file is created for each backup session. Technically those files are redundant, all information is contained in the bundles themselves. However, having a separate `index` is nice for two reasons: 1) it's faster to read as it incurs less seeks, and 2) it allows making backups while storing bundles elsewhere. Bundles are only needed when restoring -- otherwise it's sufficient to only have `index`. One could then move all newly created bundles into another machine after each backup. + * `info` is a very important file, which contains all global repository metadata, such as chunk and bundle sizes, and an encryption key encrypted with the user password. It is paramount not to lose it, so backing it up separately somewhere might be a good idea. On the other hand, if you absolutely don't trust your remote storage provider, you might consider not storing it with the rest of the data. It would then be impossible to decrypt it at all, even if your password gets known later. + +The program does not have any facilities for sending your backup over the network. You can `rsync` the repo to another computer or use any kind of cloud storage capable of storing files. Since `zbackup` never modifies any existing files, the latter is especially easy -- just tell the upload tool you use not to upload any files which already exist on the remote side (e.g. 
with `gsutil` it's `gsutil cp -R -n /my/backup gs:/mybackup/`). + +To aid with creating backups, there's an utility called `tartool` included with `zbackup`. The idea is the following: one sprinkles empty files called `.backup` and `.no-backup` across the entire filesystem. Directories where `.backup` files are placed are marked for backing up. Similarly, directories with `.no-backup` files are marked not to be backed up. Additionally, it is possible to place `.backup-XYZ` in the same directory where `XYZ` is to mark `XYZ` for backing up, or place `.no-backup-XYZ` to mark it not to be backed up. Then `tartool` can be run with three arguments -- the root directory to start from (can be `/`), the output `includes` file, and the output `excludes` file. The tool traverses over the given directory noting the `.backup*` and `.no-backup*` files and creating include and exclude lists for the `tar` utility. The `tar` utility could then be run as `tar c --files-from includes --exclude-from excludes` to store all chosen data. + +# Scalability + +This section tries do address the question on the maximum amount of data which can be held in a backup repository. What is meant here is the deduplicated data. The number of bytes in all source files ever fed into the repository doesn't matter, but the total size of the resulting repository does. +Internally all input data is split into small blocks called chunks (up to `64k` each by default). Blocks are collected into bundles (up to `2MB` each by default), and those bundles are then compressed and encrypted. + +There are then two problems with the total number of chunks in the repository: + + * Hashes of all existing chunks are needed to be kept in RAM while the backup is ongoing. Since the sliding window performs checking with a single-byte granularity, lookups would otherwise be too slow. The amount of data needed to be stored is technically only 24 bytes for each chunk, where the size of the chunk is up to `64k`. 
In an example real-life `18GB` repo, only `18MB` are taken by in its hash index. Multiply this roughly by two to have an estimate of RAM needed to store this index as an in-RAM hash table. However, as this size is proportional to the total size of the repo, for `2TB` repo you could already require `2GB` of RAM. Most repos are much smaller though, and as long as the deduplication works properly, in many cases you can store terabytes of highly-redundant backup files in a `20GB` repo easily. + * We use a 64-bit rolling hash, which allows to have an `O(1)` lookup cost at each byte we process. Due to [birthday paradox](https://en.wikipedia.org/wiki/Birthday_paradox), we would start having collisions when we approach `2^32` hashes. If each chunk we have is `32k` on average, we would get there when our repo grows to `128TB`. We would still be able to continue, but as the number of collisions would grow, we would have to resort to calculating the full hash of a block at each byte more and more often, which would result in a considerable slowdown. + +All in all, as long as the amount of RAM permits, one can go up to several terabytes in deduplicated data, and start having some slowdown after having hundreds of terabytes, RAM-permitting. + +# Design choices + + * We use a 64-bit modified Rabin-Karp rolling hash (see `rolling_hash.hh` for details), while most other programs use a 32-bit one. As noted previously, one problem with the hash size is its birthday bound, which with the 32-bit hash is met after having only `2^16` hashes. The choice of a 64-bit hash allows us to scale much better while having virtually the same calculation cost on a typical 64-bit machine. + * `rsync` uses `MD5` as its strong hash. While `MD5` is known to be fast, it is also known to be broken, allowing a malicious user to craft colliding inputs. `zbackup` uses `SHA1` instead. 
The cost of `SHA1` calculations on modern machines is actually less than that of `MD5` (run `openssl speed md5 sha1` on yours), so it's a win-win situation. We only keep the first 128 bits of the `SHA1` output, and therefore together with the rolling hash we have a 192-bit hash for each chunk. It's a multiple of 8 bytes which is a nice properly on 64-bit machines, and it is long enough not to worry about possible collisions. + * `AES-128` in `CBC` mode with `PKCS#7` padding is used for encryption. This seems to be a reasonbly safe classic solution. Each encrypted file has a random IV as its first 16 bytes. + * We use Google's [protocol buffers](https://developers.google.com/protocol-buffers/) to represent data structures in binary form. They are very efficient and relatively simple to use. + +# Improvements + +There's a lot to be improved in the program. It was released with the minimum amount of functionality to be useful. It is also stable. This should hopefully stimulate people to join the development and add all those other fancy features. Here's a list of ideas: + + * Additional options, such as configurable chunk and bundle sizes etc. + * A command to change password. + * A command to perform garbage collection. The program should skim through all backups and note which chunks are used by all of them. Then it should skim through all bundles and see which chunks among the ones stored were never used by the backups. If a bundle has more than *X%* of unused chunks, the remaining chunks should be transferred into brand new bundles. The old bundles should be deleted then. Once the process finishes, a new single index file with all existing chunk ids should be written, replacing all previous index files. With this command, it would become possible to remove old backups. + * A command to fsck the repo by doing something close to what garbage collection does, but also checking all hashes and so on. + * Parallel decompression. 
Right now decompression is single-threaded, but it is possible to look ahead in the stream and perform prefetching. + * Support for mounting the repo over FUSE. Random access to data would then be possible. + * Support for exposing a backed up file over a userspace NBD server. It would then be possible to mount raw disk images without extracting them. + * Support for other encryption types (preferably for everything `openssl` supports with its `evp`). + * Support for other compression methods. + * You name it! + +# Communication + +The program's website is at +Development happens at +Discussion forum is at . Please ask for help there! + +The author is reachable over email at . Please be constructive and don't ask for help using the program, though. In most cases it's best to stick to the forum, unless you have something to discuss with the author in private. + +# Credits + +Copyright (c) 2013-2013 Konstantin Isakov (). Licensed under GNU GPLv2 or later. + +This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \ No newline at end of file diff --git a/adler32.hh b/adler32.hh new file mode 100644 index 0000000..0fcd533 --- /dev/null +++ b/adler32.hh @@ -0,0 +1,35 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#ifndef ADLER32_HH_INCLUDED__ +#define ADLER32_HH_INCLUDED__ + +#include +#include +#include + +/// A simple wrapper to calculate adler32 +class Adler32 +{ +public: + typedef uint32_t Value; + + Adler32(): value( ( Value ) adler32( 0, 0, 0 ) ) {} + + void add( void const * data, size_t size ) + { + // When size is 0, we assume a no-op was requested and 'data' should be + // ignored. However, adler32() has a special semantic for NULL 'data'. + // Therefore we check the size before calling it + if ( size ) + value = ( Value ) adler32( value, ( Bytef const * ) data, size ); + } + + Value result() const + { return value; } + +private: + Value value; +}; + +#endif diff --git a/appendallocator.cc b/appendallocator.cc new file mode 100644 index 0000000..944d868 --- /dev/null +++ b/appendallocator.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include +#include + +#include "appendallocator.hh" + +AppendAllocator::AppendAllocator( unsigned blockSize_, unsigned granularity ): + alignMask( granularity - 1 ), + // We may decide to enlarge the block to make sure it is a multiple of + // granularity. An improperly sized block would just waste the leftover + // bytes + blockSize( ( blockSize_ + alignMask ) & ~alignMask ), leftInBlock( -1 ) +{ +} + +char * AppendAllocator::allocateBytes( unsigned size ) +{ + // For zero-sized allocations, we always return a non-zero pointer. To do + // that, we need to make sure we have it + if ( !size && !blocks.empty() ) + return nextAvailable; + + if ( leftInBlock < (int) size ) + { + unsigned toAllocate = ( size <= blockSize ? 
blockSize : size ); + + // Need a new block + char * p = (char *) malloc( toAllocate ); + + if ( !p ) + throw std::bad_alloc(); + + blocks.push_back( Record( p, nextAvailable, leftInBlock ) ); + + leftInBlock = (int) toAllocate; + nextAvailable = p; + } + + // We may need to allocate more than was asked to preserve granularity + int toTake = (int) ( ( size + alignMask ) & ~alignMask ); + + char * result = nextAvailable; + + nextAvailable += toTake; + + leftInBlock -= toTake; // leftInBlock can become negative here, as toTake can + // actually be larger than the space left due to an added alignment + + return result; +} + +void AppendAllocator::returnBytes( unsigned size ) +{ + if ( !size ) + return; + + // If we are pointing to the start of the block, we need to free it and go + // back to the previous one + if ( nextAvailable == blocks.back().data ) + { + if ( blocks.size() == 1 ) + throw std::bad_alloc(); + + free( blocks.back().data ); + leftInBlock = blocks.back().prevLeftInBlock; + nextAvailable = blocks.back().prevNextAvailable; + blocks.pop_back(); + } + + unsigned toTake = ( size + alignMask ) & ~alignMask; + + // There must be enough used bytes in the block + if ( nextAvailable - blocks.back().data < (int) toTake ) + throw std::bad_alloc(); + + nextAvailable -= toTake; + leftInBlock += toTake; +} + +void AppendAllocator::clear() +{ + for ( unsigned x = blocks.size(); x--; ) + free( blocks[ x ].data ); + blocks.clear(); + + leftInBlock = -1; +} + +AppendAllocator::~AppendAllocator() +{ + clear(); +} diff --git a/appendallocator.hh b/appendallocator.hh new file mode 100644 index 0000000..55a1a23 --- /dev/null +++ b/appendallocator.hh @@ -0,0 +1,67 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef APPENDALLOCATOR_HH_INCLUDED__ +#define APPENDALLOCATOR_HH_INCLUDED__ + +#include +#include +#include +#include + +/// A simple "add-only" memory allocation mechanism. 
+class AppendAllocator +{ + unsigned alignMask; + unsigned blockSize; + + struct Record + { + char * data; + char * prevNextAvailable; + int prevLeftInBlock; + + Record( char * data_, char * prevNextAvailable_, int prevLeftInBlock_ ): + data( data_ ), prevNextAvailable( prevNextAvailable_ ), + prevLeftInBlock( prevLeftInBlock_ ) {} + }; + + std::vector< Record > blocks; + char * nextAvailable; + int leftInBlock; // Can become < 0 due to added alignment + +public: + + /// blockSize is the amount of bytes allocated for each of the underlying + /// storage blocks. granularity makes sure you allocate objects with + /// the proper alignment. It must be a power of 2 + AppendAllocator( unsigned blockSize, unsigned granularity ); + ~AppendAllocator(); + + /// Removes all data from the append allocator. + void clear(); + + /// Allocates a size-sized memory block. The only way to free it is to + /// destroy the whole AppendAllocator. Can throw bad_alloc in an out-of- + /// memory situation + char * allocateBytes( unsigned size ); + + /// Returns the allocated bytes back. The size must match the size passed + /// to allocateBytes() on the last invocation. Calls to allocateBytes()/ + /// returnBytes() must follow the stack order - returnBytes() should undo + /// the previous allocateBytes() + void returnBytes( unsigned size ); + + /// Allocates memory to hold 'count' objects of T. In essense, it just does + /// multiplication and type casting + template< typename T > + T * allocateObjects( unsigned count ) + { return (T *) allocateBytes( count * sizeof( T ) ); } + + /// Returns the allocated objects back + template< typename T > + void returnObjects( unsigned count ) + { returnBytes( count * sizeof( T ) ); } +}; + +#endif diff --git a/backup_creator.cc b/backup_creator.cc new file mode 100644 index 0000000..4151b7f --- /dev/null +++ b/backup_creator.cc @@ -0,0 +1,280 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include +#include + +#include "backup_creator.hh" +#include "check.hh" +#include "debug.hh" +#include "message.hh" +#include "page_size.hh" +#include "static_assert.hh" + +namespace { + unsigned const MinChunkSize = 256; +} + +BackupCreator::BackupCreator( StorageInfo const & info, + ChunkIndex & chunkIndex, + ChunkStorage::Writer & chunkStorageWriter ): + chunkMaxSize( info.chunk_max_size() ), + chunkIndex( chunkIndex ), chunkStorageWriter( chunkStorageWriter ), + ringBufferFill( 0 ), + chunkToSaveFill( 0 ), + backupDataStream( new google::protobuf::io::StringOutputStream( &backupData ) ), + chunkIdGenerated( false ) +{ + // In our ring buffer we have enough space to store one chunk plus an extra + // page for buffering the input + ringBuffer.resize( chunkMaxSize + getPageSize() ); + + begin = ringBuffer.data(); + end = &ringBuffer.back() + 1; + head = begin; + tail = head; + + chunkToSave.resize( chunkMaxSize ); +} + +void * BackupCreator::getInputBuffer() +{ + return head; +} + +size_t BackupCreator::getInputBufferSize() +{ + if ( tail > head ) + return tail - head; + else + if ( tail == head && ringBufferFill ) + return 0; + else + return end - head; +} + +void BackupCreator::handleMoreData( unsigned added ) +{ + // Note: head is never supposed to wrap around in the middle of the operation, + // as getInputBufferSize() never returns a value which could result in a + // wrap-around + while( added ) + { + // If we don't have a full chunk, we need to consume data until we have + // one + if ( ringBufferFill < chunkMaxSize ) + { + unsigned left = chunkMaxSize - ringBufferFill; + bool canFullyFill = added >= left; + + unsigned toFill = canFullyFill ? 
left : added; + + added -= toFill; + ringBufferFill += toFill; + + while ( toFill-- ) + rollingHash.rollIn( *head++ ); + + if ( head == end ) + head = begin; + + // If we've managed to fill in the complete chunk, attempt matching it + if ( canFullyFill ) + addChunkIfMatched(); + } + else + { + // At this point we have a full chunk in the ring buffer, so we can rotate + // over a byte + chunkToSave[ chunkToSaveFill++ ] = *tail; + + if ( chunkToSaveFill == chunkMaxSize ) + // Got the full chunk - save it + saveChunkToSave(); + + rollingHash.rotate( *head++, *tail++ ); + + if ( head == end ) + head = begin; + + if ( tail == end ) + tail = begin; + + addChunkIfMatched(); + + --added; // A byte was consumed + } + } +} + +void BackupCreator::saveChunkToSave() +{ + CHECK( chunkToSaveFill > 0, "chunk to save is empty" ); + + if ( chunkToSaveFill < 128 ) // TODO: make this value configurable + { + // The amount of data is too small - emit without creating a new chunk + BackupInstruction instr; + instr.set_bytes_to_emit( chunkToSave.data(), chunkToSaveFill ); + outputInstruction( instr ); + } + else + { + // Output as a chunk + + ChunkId id; + + id.rollingHash = RollingHash::digest( chunkToSave.data(), + chunkToSaveFill ); + unsigned char sha1Value[ SHA_DIGEST_LENGTH ]; + SHA1( (unsigned char const *) chunkToSave.data(), chunkToSaveFill, + sha1Value ); + + STATIC_ASSERT( sizeof( id.cryptoHash ) <= sizeof( sha1Value ) ); + memcpy( id.cryptoHash, sha1Value, sizeof( id.cryptoHash ) ); + + // Save it to the store if it's not there already + chunkStorageWriter.add( id, chunkToSave.data(), chunkToSaveFill ); + + BackupInstruction instr; + instr.set_chunk_to_emit( id.toBlob() ); + outputInstruction( instr ); + } + + chunkToSaveFill = 0; +} + +void BackupCreator::finish() +{ + dPrintf( "At finish: %u, %u\n", chunkToSaveFill, ringBufferFill ); + + // At this point we may have some bytes in chunkToSave, and some in the ring + // buffer. 
We need to save both + if ( chunkToSaveFill + ringBufferFill > chunkMaxSize ) + { + // We have more than a full chunk in chunkToSave and ringBuffer together, so + // save the first part as a full chunk first + + // Move data from ring buffer to have full chunk in chunkToSave. + moveFromRingBufferToChunkToSave( chunkMaxSize - chunkToSaveFill ); + saveChunkToSave(); + } + + // Concatenate the rest of data and save it too + + CHECK( chunkToSaveFill + ringBufferFill <= chunkMaxSize, "had more than two " + "full chunks at backup finish" ); + + moveFromRingBufferToChunkToSave( ringBufferFill ); + + if ( chunkToSaveFill ) + saveChunkToSave(); +} + +void BackupCreator::moveFromRingBufferToChunkToSave( unsigned toMove ) +{ + // If tail is before head, all data in the ring buffer is in one contiguous + // piece. If not, it's in two pieces + if ( tail < head ) + { + memcpy( chunkToSave.data() + chunkToSaveFill, tail, toMove ); + tail += toMove; + } + else + { + unsigned toEnd = end - tail; + + unsigned firstPart = toEnd < toMove ? 
toEnd : toMove; + memcpy( chunkToSave.data() + chunkToSaveFill, tail, firstPart ); + + tail += firstPart; + + if ( toMove > firstPart ) + { + unsigned secondPart = toMove - firstPart; + memcpy( chunkToSave.data() + chunkToSaveFill + firstPart, begin, + secondPart ); + tail = begin + secondPart; + } + } + + if ( tail == end ) + tail = begin; + + chunkToSaveFill += toMove; + ringBufferFill -= toMove; +} + +ChunkId const & BackupCreator::getChunkId() +{ + if ( !chunkIdGenerated ) + { + // Calculate SHA1 + SHA_CTX ctx; + SHA1_Init( &ctx ); + + if ( tail < head ) + { + // Tail is before head - all the block is in one contiguous piece + SHA1_Update( &ctx, tail, head - tail ); + } + else + { + // Tail is after head - the block consists of two pieces + SHA1_Update( &ctx, tail, end - tail ); + SHA1_Update( &ctx, begin, head - begin ); + } + + unsigned char sha1Value[ SHA_DIGEST_LENGTH ]; + SHA1_Final( sha1Value, &ctx ); + + generatedChunkId.rollingHash = rollingHash.digest(); + + memcpy( generatedChunkId.cryptoHash, sha1Value, + sizeof( generatedChunkId.cryptoHash ) ); + + chunkIdGenerated = true; + } + + return generatedChunkId; +} + +void BackupCreator::addChunkIfMatched() +{ + chunkIdGenerated = false; + + if ( chunkIndex.findChunk( rollingHash.digest(), *this ) ) + { +// verbosePrintf( "Reuse of chunk %lu\n", rollingHash.digest() ); + + // Before emitting the matched chunk, we need to make sure any bytes + // which came before it are saved first + if ( chunkToSaveFill ) + saveChunkToSave(); + + // Add the record + BackupInstruction instr; + instr.set_chunk_to_emit( getChunkId().toBlob() ); + outputInstruction( instr ); + + // The block was consumed from the ring buffer - remove the block from it + tail = head; + ringBufferFill = 0; + rollingHash.reset(); + } +} + +void BackupCreator::outputInstruction( BackupInstruction const & instr ) +{ + // TODO: once backupData becomes large enough, spawn another BackupCreator and + // feed data to it. 
This way we wouldn't have to store the entire backupData + // in RAM + Message::serialize( instr, *backupDataStream ); +} + +void BackupCreator::getBackupData( string & str ) +{ + CHECK( backupDataStream.get(), "getBackupData() called twice" ); + backupDataStream.reset(); + str.swap( backupData ); +} diff --git a/backup_creator.hh b/backup_creator.hh new file mode 100644 index 0000000..42d590d --- /dev/null +++ b/backup_creator.hh @@ -0,0 +1,91 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef BACKUP_CREATOR_HH_INCLUDED__ +#define BACKUP_CREATOR_HH_INCLUDED__ + +#include +#include +#include +#include + +#include "chunk_id.hh" +#include "chunk_index.hh" +#include "chunk_storage.hh" +#include "file.hh" +#include "nocopy.hh" +#include "rolling_hash.hh" +#include "sptr.hh" +#include "zbackup.pb.h" + +using std::vector; +using std::string; + +/// Creates a backup by processing input data and matching/writing chunks +class BackupCreator: ChunkIndex::ChunkInfoInterface, NoCopy +{ + unsigned chunkMaxSize; + ChunkIndex & chunkIndex; + ChunkStorage::Writer & chunkStorageWriter; + vector< char > ringBuffer; + // Ring buffer vars + char * begin; + char * end; + char * head; + char * tail; + unsigned ringBufferFill; + + /// In this buffer we assemble the next chunk to be eventually stored. We + /// copy the bytes from the ring buffer. 
While the copying may be avoided in + /// some cases, the plan is to move to multi-threaded chunk storage in the + /// future, where it would be necessary in any case + vector< char > chunkToSave; + unsigned chunkToSaveFill; /// Number of bytes accumulated in chunkToSave + /// When we have data in chunkToSave, this points to the record in backupData + /// which should store it + unsigned recordIndexToSaveDataInto; + + RollingHash rollingHash; + + string backupData; + sptr< google::protobuf::io::StringOutputStream > backupDataStream; + + /// Sees if the current block in the ring buffer exists in the chunk store. + /// If it does, the reference is emitted and the ring buffer is cleared + void addChunkIfMatched(); + + /// Outputs data contained in chunkToSave as a new chunk + void saveChunkToSave(); + + /// Move the given amount of bytes from the ring buffer to the chunk to save. + /// Ring buffer must have at least that many bytes + void moveFromRingBufferToChunkToSave( unsigned bytes ); + + /// Outputs the given instruction to the backup stream + void outputInstruction( BackupInstruction const & ); + + bool chunkIdGenerated; + ChunkId generatedChunkId; + virtual ChunkId const & getChunkId(); + +public: + BackupCreator( StorageInfo const &, ChunkIndex &, ChunkStorage::Writer & ); + + /// The data is fed the following way: the user fills getInputBuffer() with + /// up to getInputBufferSize() bytes, then calls handleMoreData() with the + /// number of bytes written + void * getInputBuffer(); + size_t getInputBufferSize(); + + void handleMoreData( unsigned ); + + /// Flushes any remaining data and finishes the process. No additional data + /// may be added after this call is made + void finish(); + + /// Returns the result of the backup creation. 
Can only be called once the
+  /// finish() was called and the backup is complete
+  void getBackupData( string & );
+};
+
+#endif
diff --git a/backup_file.cc b/backup_file.cc
new file mode 100644
index 0000000..fe7375f
--- /dev/null
+++ b/backup_file.cc
@@ -0,0 +1,48 @@
+// Copyright (c) 2012-2013 Konstantin Isakov
+// Part of ZBackup. Licensed under GNU GPLv2 or later
+
+#include "backup_file.hh"
+
+#include "encrypted_file.hh"
+#include "encryption.hh"
+#include "message.hh"
+
+namespace BackupFile {
+
+enum
+{
+  FileFormatVersion = 1 // load() rejects any file whose header carries a different version
+};
+
+void save( string const & fileName, EncryptionKey const & encryptionKey, // serializes backupInfo into an encrypted file
+           BackupInfo const & backupInfo )
+{
+  EncryptedFile::OutputStream os( fileName.c_str(), encryptionKey,
+                                  Encryption::ZeroIv );
+  os.writeRandomIv(); // the file starts with a random IV
+
+  FileHeader header;
+  header.set_version( FileFormatVersion ); // header records the format version checked by load()
+  Message::serialize( header, os );
+
+  Message::serialize( backupInfo, os );
+  os.writeAdler32(); // trailing Adler32 checksum
+}
+
+void load( string const & fileName, EncryptionKey const & encryptionKey, // parses a file produced by save() above
+           BackupInfo & backupInfo )
+{
+  EncryptedFile::InputStream is( fileName.c_str(), encryptionKey,
+                                 Encryption::ZeroIv );
+  is.consumeRandomIv(); // skip the random IV written by save()
+
+  FileHeader header;
+  Message::parse( header, is );
+  if ( header.version() != FileFormatVersion )
+    throw exUnsupportedVersion();
+
+  Message::parse( backupInfo, is );
+  is.checkAdler32(); // verify the trailing Adler32 checksum
+}
+
+}
diff --git a/backup_file.hh b/backup_file.hh
new file mode 100644
index 0000000..c74242b
--- /dev/null
+++ b/backup_file.hh
@@ -0,0 +1,28 @@
+// Copyright (c) 2012-2013 Konstantin Isakov
+// Part of ZBackup.
Licensed under GNU GPLv2 or later + +#ifndef BACKUP_FILE_HH_INCLUDED__ +#define BACKUP_FILE_HH_INCLUDED__ + +#include +#include + +#include "encryption_key.hh" +#include "ex.hh" +#include "zbackup.pb.h" + +namespace BackupFile { + +using std::string; + +DEF_EX( Ex, "Backup file exception", std::exception ) +DEF_EX( exUnsupportedVersion, "Unsupported version of the backup file format", Ex ) + +/// Saves the given BackupInfo data into the given file +void save( string const & fileName, EncryptionKey const &, BackupInfo const & ); + +/// Loads the given BackupInfo data from the given file +void load( string const & fileName, EncryptionKey const &, BackupInfo & ); +} + +#endif diff --git a/backup_restorer.cc b/backup_restorer.cc new file mode 100644 index 0000000..445e0fc --- /dev/null +++ b/backup_restorer.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include +#include +#include + +#include "backup_restorer.hh" +#include "chunk_id.hh" +#include "message.hh" +#include "zbackup.pb.h" + +namespace BackupRestorer { + +using std::vector; +using google::protobuf::io::CodedInputStream; + +void restore( ChunkStorage::Reader & chunkStorageReader, + std::string const & backupData, + DataSink & output ) +{ + google::protobuf::io::ArrayInputStream is( backupData.data(), + backupData.size() ); + CodedInputStream cis( &is ); + CodedInputStream::Limit limit = cis.PushLimit( backupData.size() ); + + // Used when emitting chunks + string chunk; + + BackupInstruction instr; + while ( cis.BytesUntilLimit() > 0 ) + { + Message::parse( instr, cis ); + + if ( instr.has_chunk_to_emit() ) + { + // Need to emit a chunk, reading it from the store + size_t chunkSize; + chunkStorageReader.get( ChunkId( instr.chunk_to_emit() ), chunk, + chunkSize ); + output.saveData( chunk.data(), chunkSize ); + } + + if ( instr.has_bytes_to_emit() ) + { + // Need to emit the bytes directly + string const & bytes = 
instr.bytes_to_emit(); + output.saveData( bytes.data(), bytes.size() ); + } + } + + cis.PopLimit( limit ); +} +} diff --git a/backup_restorer.hh b/backup_restorer.hh new file mode 100644 index 0000000..6ea6712 --- /dev/null +++ b/backup_restorer.hh @@ -0,0 +1,33 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef BACKUP_RESTORER_HH_INCLUDED__ +#define BACKUP_RESTORER_HH_INCLUDED__ + +#include +#include +#include + +#include "chunk_storage.hh" +#include "ex.hh" + +/// Generic interface to stream data out +class DataSink +{ +public: + virtual void saveData( void const * data, size_t size )=0; + virtual ~DataSink() {} +}; + +/// Restores the backup +namespace BackupRestorer { + +DEF_EX( Ex, "Backup restorer exception", std::exception ) +DEF_EX( exTooManyBytesToEmit, "A backup record asks to emit too many bytes", Ex ) + +/// Restores the given backup +void restore( ChunkStorage::Reader &, std::string const & backupData, + DataSink & ); +} + +#endif diff --git a/bundle.cc b/bundle.cc new file mode 100644 index 0000000..14ccee3 --- /dev/null +++ b/bundle.cc @@ -0,0 +1,208 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include +#include + +#include "bundle.hh" +#include "check.hh" +#include "dir.hh" +#include "encrypted_file.hh" +#include "encryption.hh" +#include "hex.hh" +#include "message.hh" + +namespace Bundle { + +enum +{ + FileFormatVersion = 1 +}; + +void Creator::addChunk( string const & id, void const * data, size_t size ) +{ + BundleInfo_ChunkRecord * record = info.add_chunk_record(); + record->set_id( id ); + record->set_size( size ); + payload.append( ( char const * ) data, size ); +} + +void Creator::write( std::string const & fileName, EncryptionKey const & key ) +{ + EncryptedFile::OutputStream os( fileName.c_str(), key, Encryption::ZeroIv ); + + os.writeRandomIv(); + + FileHeader header; + header.set_version( FileFormatVersion ); + Message::serialize( header, os ); + + Message::serialize( info, os ); + os.writeAdler32(); + + // Compress + + uint32_t preset = 6; // TODO: make this customizable, although 6 seems to be + // the best option + lzma_stream strm = LZMA_STREAM_INIT; + lzma_ret ret; + + ret = lzma_easy_encoder( &strm, preset, LZMA_CHECK_CRC64 ); + CHECK( ret == LZMA_OK, "lzma_easy_encoder error: %d", (int) ret ); + + strm.next_in = ( uint8_t const * ) payload.data(); + strm.avail_in = payload.size(); + + for ( ; ; ) + { + { + void * data; + int size; + if ( !os.Next( &data, &size ) ) + { + lzma_end( &strm ); + throw exBundleWriteFailed(); + } + if ( !size ) + continue; + strm.next_out = ( uint8_t * ) data; + strm.avail_out = size; + } + + // Perform the compression + ret = lzma_code( &strm, LZMA_FINISH ); + + if ( ret == LZMA_STREAM_END ) + { + if ( strm.avail_out ) + os.BackUp( strm.avail_out ); + break; + } + + CHECK( ret == LZMA_OK, "lzma_code error: %d", (int) ret ); + } + + lzma_end( &strm ); + + os.writeAdler32(); +} + +Reader::Reader( string const & fileName, EncryptionKey const & key ) +{ + EncryptedFile::InputStream is( fileName.c_str(), key, Encryption::ZeroIv ); + + is.consumeRandomIv(); + + FileHeader 
header; + Message::parse( header, is ); + + if ( header.version() != FileFormatVersion ) + throw exUnsupportedVersion(); + + BundleInfo info; + Message::parse( info, is ); + is.checkAdler32(); + + size_t payloadSize = 0; + for ( int x = info.chunk_record_size(); x--; ) + payloadSize += info.chunk_record( x ).size(); + + payload.resize( payloadSize ); + + lzma_stream strm = LZMA_STREAM_INIT; + + lzma_ret ret; + + ret = lzma_stream_decoder( &strm, UINT64_MAX, 0 ); + CHECK( ret == LZMA_OK,"lzma_stream_decoder error: %d", (int) ret ); + + strm.next_out = ( uint8_t * ) &payload[ 0 ]; + strm.avail_out = payload.size(); + + for ( ; ; ) + { + { + void const * data; + int size; + if ( !is.Next( &data, &size ) ) + { + lzma_end( &strm ); + throw exBundleReadFailed(); + } + if ( !size ) + continue; + strm.next_in = ( uint8_t const * ) data; + strm.avail_in = size; + } + + ret = lzma_code( &strm, LZMA_RUN ); + + if ( ret == LZMA_STREAM_END ) + { + if ( strm.avail_in ) + is.BackUp( strm.avail_in ); + break; + } + + CHECK( ret == LZMA_OK, "lzma_code error: %d", (int) ret ); + + if ( !strm.avail_out && strm.avail_in ) + { + // Apparently we have more data than we were expecting + lzma_end( &strm ); + throw exTooMuchData(); + } + } + + lzma_end( &strm ); + + is.checkAdler32(); + + // Populate the map + char const * next = payload.data(); + for ( int x = 0, count = info.chunk_record_size(); x < count; ++x ) + { + BundleInfo_ChunkRecord const & record = info.chunk_record( x ); + pair< Chunks::iterator, bool > res = + chunks.insert( Chunks::value_type( record.id(), + Chunks::mapped_type( next, + record.size() ) ) ); + if ( !res.second ) + throw exDuplicateChunks(); // Duplicate key encountered + next += record.size(); + } +} + +bool Reader::get( string const & chunkId, string & chunkData, + size_t & chunkDataSize ) +{ + Chunks::iterator i = chunks.find( chunkId ); + if ( i != chunks.end() ) + { + size_t sz = i->second.second; + if ( chunkData.size() < sz ) + chunkData.resize( sz ); + 
memcpy( &chunkData[ 0 ], i->second.first, sz ); + + chunkDataSize = sz; + return true; + } + else + return false; +} + +string generateFileName( Id const & id, string const & bundlesDir, + bool createDirs ) +{ + string hex( toHex( ( unsigned char * ) &id, sizeof( id ) ) ); + + // TODO: make this scheme more flexible and allow it to scale, or at least + // be configurable + string level1( Dir::addPath( bundlesDir, hex.substr( 0, 2 ) ) ); + + if ( createDirs && !Dir::exists( level1 ) ) + Dir::create( level1 ); + + return string( Dir::addPath( level1, hex ) ); +} +} diff --git a/bundle.hh b/bundle.hh new file mode 100644 index 0000000..f6cb631 --- /dev/null +++ b/bundle.hh @@ -0,0 +1,106 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef BUNDLE_HH_INCLUDED__ +#define BUNDLE_HH_INCLUDED__ + +#include +#include +#include +#include +#include +#include + +#include "encryption_key.hh" +#include "ex.hh" +#include "nocopy.hh" +#include "static_assert.hh" +#include "zbackup.pb.h" + +namespace Bundle { + +using std::string; +using std::pair; +using std::map; + +enum +{ + /// The number of bytes the bundle id has. We chose 192-bit just to be on + /// the safer side. It is also a multiple of 8 bytes, which is good for + /// alignment + IdSize = 24 +}; + +/// Id of the bundle is IdSize bytes. Can and should be used as a POD type +struct Id +{ + char blob[ IdSize ]; + + bool operator == ( Id const & other ) const + { return memcmp( blob, other.blob, sizeof( blob ) ) == 0; } + bool operator != ( Id const & other ) const + { return ! 
operator == ( other ); }
+};
+
+STATIC_ASSERT( sizeof( Id ) == IdSize );
+
+/// Creates a bundle by adding chunks to it until it's full, then compressing
+/// it and writing out to disk
+class Creator
+{
+  BundleInfo info;
+  string payload;
+
+public:
+  DEF_EX( Ex, "Bundle creator exception", std::exception )
+  DEF_EX( exBundleWriteFailed, "Bundle write failed", Ex )
+
+  /// Adds a chunk with the given id
+  void addChunk( string const & chunkId, void const * data, size_t size );
+
+  /// Returns the number of bytes comprising all chunk bodies so far
+  size_t getPayloadSize() const
+  { return payload.size(); }
+
+  /// Compresses and writes the bundle to the given file. The operation is
+  /// time-consuming - calling this function from a worker thread could be
+  /// warranted
+  void write( string const & fileName, EncryptionKey const & );
+
+  /// Returns the current BundleInfo record - this is used for index files
+  BundleInfo const & getCurrentBundleInfo() const
+  { return info; }
+};
+
+/// Reads the bundle and allows accessing chunks
+class Reader: NoCopy
+{
+  /// Unpacked payload
+  string payload;
+  /// Maps chunk id blob to its contents and size
+  typedef map< string, pair< char const *, size_t > > Chunks;
+  Chunks chunks;
+
+public:
+  DEF_EX( Ex, "Bundle reader exception", std::exception )
+  DEF_EX( exBundleReadFailed, "Bundle read failed", Ex )
+  DEF_EX( exUnsupportedVersion, "Unsupported version of the bundle file format", Ex ) // message said "index file" - copy-paste slip, this is the bundle reader
+  DEF_EX( exTooMuchData, "More data than expected in a bundle", Ex )
+  DEF_EX( exDuplicateChunks, "Chunks with the same id found in a bundle", Ex )
+
+  Reader( string const & fileName, EncryptionKey const & );
+
+  /// Reads the chunk into chunkData and returns true, or returns false if there
+  /// was no such chunk in the bundle. chunkData may be enlarged but won't
+  /// be shrunk.
The size of the actual chunk would be stored in chunkDataSize + bool get( string const & chunkId, string & chunkData, size_t & chunkDataSize ); +}; + +/// Generates a full file name for a bundle with the given id. If createDirs +/// is true, any intermediate directories will be created if they don't exist +/// already +string generateFileName( Id const &, string const & bundlesDir, + bool createDirs ); +} + +#endif diff --git a/check.hh b/check.hh new file mode 100644 index 0000000..10e7ce6 --- /dev/null +++ b/check.hh @@ -0,0 +1,38 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef CHECK_HH_INCLUDED__ +#define CHECK_HH_INCLUDED__ + +#include +#include +#include + +// Run-time assertion macro + +// Usage: CHECK( value == 16, "Value is not 16: %d", value ); +// This will abort() if the value is not 16 with the message stating so. + +// TODO: show the backtrace here, without using __FILE__ __LINE__ + +#define CHECK( condition, message, ... ) ({if (!(condition)) \ +{ \ + fprintf( stderr, "Check failed: " ); \ + fprintf( stderr, message, ##__VA_ARGS__ ); \ + fprintf( stderr, "\nAt %s:%d\n", __FILE__, __LINE__ ); \ + abort(); \ +}}) + +#define FAIL( ... ) CHECK( false, __VA_ARGS__ ) + + +// Debug-only versions. Only instantiated in debug builds +#ifndef NDEBUG +#define DCHECK CHECK +#define DFAIL FAIL +#else +#define DCHECK( ... ) +#define DFAIL( ... ) +#endif + +#endif diff --git a/chunk_id.cc b/chunk_id.cc new file mode 100644 index 0000000..7322424 --- /dev/null +++ b/chunk_id.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include "chunk_id.hh" + +#include +#include "endian.hh" +#include "check.hh" + +string ChunkId::toBlob() const +{ + string out( BlobSize, 0 ); + + toBlob( &out[ 0 ] ); + + return out; +} + +void ChunkId::toBlob( void * outPtr ) const +{ + char * out = ( char * ) outPtr; + + RollingHash::Digest v = toLittleEndian( rollingHash ); + + memcpy( out, cryptoHash, sizeof( cryptoHash ) ); + memcpy( out + sizeof( cryptoHash ), &v, sizeof( v ) ); +} + +void ChunkId::setFromBlob( void const * data ) +{ + char const * blob = ( char const * ) data; + + RollingHash::Digest v; + + memcpy( cryptoHash, blob, sizeof( cryptoHash ) ); + memcpy( &v, blob + sizeof( cryptoHash ), sizeof( v ) ); + + rollingHash = fromLittleEndian( v ); +} + +ChunkId::ChunkId( string const & blob ) +{ + CHECK( blob.size() == BlobSize, "incorrect blob sise: %zu", blob.size() ); + + setFromBlob( blob.data() ); +} diff --git a/chunk_id.hh b/chunk_id.hh new file mode 100644 index 0000000..f8ce8ce --- /dev/null +++ b/chunk_id.hh @@ -0,0 +1,38 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#ifndef CHUNK_ID_HH_INCLUDED__ +#define CHUNK_ID_HH_INCLUDED__ + +#include +#include "rolling_hash.hh" + +using std::string; + +/// Chunk is identified by its crypto hash concatenated with its rolling hash +struct ChunkId +{ + typedef char CryptoHashPart[ 16 ]; + CryptoHashPart cryptoHash; + + typedef RollingHash::Digest RollingHashPart; + RollingHashPart rollingHash; + + enum + { + BlobSize = sizeof( CryptoHashPart ) + sizeof( RollingHashPart ) + }; + + string toBlob() const; + + /// Faster version - should point to a buffer with at least BlobSize bytes + void toBlob( void * ) const; + + /// Set the chunk id data reading from the given blob + void setFromBlob( void const * ); + + ChunkId() {} + ChunkId( string const & blob ); +}; + +#endif diff --git a/chunk_index.cc b/chunk_index.cc new file mode 100644 index 0000000..9ce753c --- /dev/null +++ b/chunk_index.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include +#include +#include +#include + +#include "chunk_index.hh" +#include "debug.hh" +#include "dir.hh" +#include "index_file.hh" +#include "zbackup.pb.h" + +ChunkIndex::Chain::Chain( ChunkId const & id, Bundle::Id const * bundleId ): + next( 0 ), bundleId( bundleId ) +{ + memcpy( cryptoHash, id.cryptoHash, sizeof( cryptoHash ) ); +} + +bool ChunkIndex::Chain::equalsTo( ChunkId const & id ) +{ + return memcmp( cryptoHash, id.cryptoHash, sizeof ( cryptoHash ) ) == 0; +} + +void ChunkIndex::loadIndex() +{ + Dir::Listing lst( indexPath ); + + Dir::Entry entry; + + verbosePrintf( "Loading index...\n" ); + + while( lst.getNext( entry ) ) + { + verbosePrintf( "Loading index file %s...\n", entry.getFileName().c_str() ); + + IndexFile::Reader reader( key, + Dir::addPath( indexPath, entry.getFileName() ) ); + + BundleInfo info; + Bundle::Id bundleId; + while( reader.readNextRecord( info, bundleId ) ) + { + Bundle::Id * savedId = storage.allocateObjects< 
Bundle::Id >( 1 ); + memcpy( savedId, &bundleId, sizeof( bundleId ) ); + + lastBundleId = savedId; + + ChunkId id; + + for ( int x = info.chunk_record_size(); x--; ) + { + BundleInfo_ChunkRecord const & record = info.chunk_record( x ); + + if ( record.id().size() != ChunkId::BlobSize ) + throw exIncorrectChunkIdSize(); + + id.setFromBlob( record.id().data() ); + registerNewChunkId( id, savedId ); + } + } + } + + verbosePrintf( "Index loaded.\n" ); +} + +ChunkIndex::ChunkIndex( EncryptionKey const & key, TmpMgr & tmpMgr, + string const & indexPath ): + key( key ), tmpMgr( tmpMgr ), indexPath( indexPath ), storage( 65536, 1 ), + lastBundleId( NULL ) +{ + loadIndex(); +} + +Bundle::Id const * ChunkIndex::findChunk( ChunkId::RollingHashPart rollingHash, + ChunkInfoInterface & chunkInfo ) +{ + HashTable::iterator i = hashTable.find( rollingHash ); + + ChunkId const * id = 0; + + if ( i != hashTable.end() ) + { + if ( !id ) + id = &chunkInfo.getChunkId(); + // Check the chains + for ( Chain * chain = i->second; chain; chain = chain->next ) + if ( chain->equalsTo( *id ) ) + return chain->bundleId; + } + + return NULL; +} + +namespace { +struct ChunkInfoImmediate: public ChunkIndex::ChunkInfoInterface +{ + ChunkId const & id; + + ChunkInfoImmediate( ChunkId const & id ): id( id ) {} + + virtual ChunkId const & getChunkId() + { return id; } +}; +} + +Bundle::Id const * ChunkIndex::findChunk( ChunkId const & chunkId ) +{ + ChunkInfoImmediate chunkInfo( chunkId ); + return findChunk( chunkId.rollingHash, chunkInfo ); +} + +ChunkIndex::Chain * ChunkIndex::registerNewChunkId( ChunkId const & id, + Bundle::Id const * bundleId ) +{ + HashTable::iterator i = + hashTable.insert( std::make_pair( id.rollingHash, ( Chain *) 0 ) ).first; + + Chain * & chain = i->second; + + // Check the chains + for ( ; chain; chain = chain->next ) + if ( chain->equalsTo( id ) ) + { + return NULL; // The entry existed already + } + + // Create a new chain + chain = new ( storage.allocateObjects< Chain 
>( 1 ) ) Chain( id, bundleId ); + + return chain; +} + + +bool ChunkIndex::addChunk( ChunkId const & id, Bundle::Id const & bundleId ) +{ + if ( Chain * chain = registerNewChunkId( id, NULL ) ) + { + // Allocate or re-use bundle id + if ( !lastBundleId || *lastBundleId != bundleId ) + { + Bundle::Id * allocatedId = storage.allocateObjects< Bundle::Id >( 1 ); + memcpy( allocatedId, &bundleId, Bundle::IdSize ); + lastBundleId = allocatedId; + } + chain->bundleId = lastBundleId; + + return true; + } + else + return false; +} diff --git a/chunk_index.hh b/chunk_index.hh new file mode 100644 index 0000000..10f3540 --- /dev/null +++ b/chunk_index.hh @@ -0,0 +1,109 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef CHUNK_INDEX_HH_INCLUDED__ +#define CHUNK_INDEX_HH_INCLUDED__ + +// is obsolete, but requires C++11. Make up your +// mind, GNU people! +#undef __DEPRECATED + +#include +#include +#include +#include +#include +#include + +#include "appendallocator.hh" +#include "bundle.hh" +#include "chunk_id.hh" +#include "dir.hh" +#include "encryption_key.hh" +#include "endian.hh" +#include "ex.hh" +#include "index_file.hh" +#include "nocopy.hh" +#include "rolling_hash.hh" +#include "tmp_mgr.hh" + +using std::vector; + +// 32-bit specific hash function for unsigned long long which is what uint64_t +// is on 32-bit platforms +#if SIZE_MAX == UINT32_MAX +namespace __gnu_cxx +{ + template<> + struct hash< unsigned long long > + { + size_t operator()( unsigned long long v ) const + { return v ^ ( v >> 32 ); } + }; +} +#endif + +/// Maintains an in-memory hash table allowing to check whether we have a +/// specific chunk or not, and if we do, get the bundle id it's in +class ChunkIndex: NoCopy +{ + struct Chain + { + ChunkId::CryptoHashPart cryptoHash; + Chain * next; + Bundle::Id const * bundleId; + + Chain( ChunkId const &, Bundle::Id const * bundleId ); + + bool equalsTo( ChunkId const & id ); + }; + + /// This hash 
map stores all known chunk ids + /// TODO: implement a custom hash table for better performance + typedef __gnu_cxx::hash_map< RollingHash::Digest, Chain * > HashTable; + + EncryptionKey const & key; + TmpMgr & tmpMgr; + string indexPath; + AppendAllocator storage; + + HashTable hashTable; + + /// Stores the last used bundle id, which can be re-used + Bundle::Id const * lastBundleId; + +public: + DEF_EX( Ex, "Chunk index exception", std::exception ) + DEF_EX( exIncorrectChunkIdSize, "Incorrect chunk id size encountered", Ex ) + + ChunkIndex( EncryptionKey const &, TmpMgr &, string const & indexPath ); + + struct ChunkInfoInterface + { + /// Returns the full id of the chunk. This function is only called if that + /// full id is actually needed, as its generation requires the expensive + /// calculation of the full hash + virtual ChunkId const & getChunkId()=0; + virtual ~ChunkInfoInterface() {} + }; + + /// If the given chunk exists, its bundle id is returned, otherwise NULL + Bundle::Id const * findChunk( ChunkId::RollingHashPart, + ChunkInfoInterface & ); + + /// If the given chunk exists, its bundle id is returned, otherwise NULL + Bundle::Id const * findChunk( ChunkId const & ); + + /// Adds a new chunk to the index if it did not exist already. Returns true + /// if added, false if existed already + bool addChunk( ChunkId const &, Bundle::Id const & ); + +private: + void loadIndex(); + + /// Inserts new chunk id into the in-memory hash table. Returns the created + /// Chain if it was inserted, NULL if it existed before + Chain * registerNewChunkId( ChunkId const & id, Bundle::Id const * ); +}; + +#endif diff --git a/chunk_storage.cc b/chunk_storage.cc new file mode 100644 index 0000000..56c4add --- /dev/null +++ b/chunk_storage.cc @@ -0,0 +1,223 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include "check.hh" +#include "chunk_storage.hh" +#include "debug.hh" +#include "dir.hh" +#include "hex.hh" +#include "random.hh" + +namespace ChunkStorage { + +Writer::Writer( StorageInfo const & storageInfo, + EncryptionKey const & encryptionKey, + TmpMgr & tmpMgr, ChunkIndex & index, string const & bundlesDir, + string const & indexDir, size_t maxCompressorsToRun ): + storageInfo( storageInfo ), encryptionKey( encryptionKey ), + tmpMgr( tmpMgr ), index( index ), bundlesDir( bundlesDir ), + indexDir( indexDir ), hasCurrentBundleId( false ), + maxCompressorsToRun( maxCompressorsToRun ), runningCompressors( 0 ) +{ + verbosePrintf( "Using up to %zu thread(s) for compression\n", + maxCompressorsToRun ); +} + +Writer::~Writer() +{ + waitForAllCompressorsToFinish(); +} + +bool Writer::add( ChunkId const & id, void const * data, size_t size ) +{ + if ( index.addChunk( id, getCurrentBundleId() ) ) + { + // Added to the index? Emit to the bundle then + if ( getCurrentBundle().getPayloadSize() + size > + storageInfo.bundle_max_payload_size() ) + finishCurrentBundle(); + + getCurrentBundle().addChunk( id.toBlob(), data, size ); + + return true; + } + else + return false; +} + +void Writer::commit() +{ + finishCurrentBundle(); + + waitForAllCompressorsToFinish(); + + // Move all bundles + for ( size_t x = pendingBundleRenames.size(); x--; ) + { + PendingBundleRename & r = pendingBundleRenames[ x ]; + r.first->moveOverTo( Bundle::generateFileName( r.second, bundlesDir, + true ) ); + } + + pendingBundleRenames.clear(); + + // Move the index file + if ( indexFile.get() ) + { + indexFile.reset(); + // Generate a random filename + unsigned char buf[ 24 ]; // Same comments as for Bundle::IdSize + + Random::genaratePseudo( buf, sizeof( buf ) ); + + indexTempFile->moveOverTo( Dir::addPath( indexDir, + toHex( buf, sizeof( buf ) ) ) ); + indexTempFile.reset(); + } +} + +Bundle::Creator & Writer::getCurrentBundle() +{ + if ( !currentBundle.get() ) + 
currentBundle = new Bundle::Creator;
+  return *currentBundle;
+}
+
+void Writer::finishCurrentBundle()
+{
+  if ( !currentBundle.get() )
+    return;
+
+  Bundle::Id const & bundleId = getCurrentBundleId();
+
+  if ( !indexFile.get() )
+  {
+    // Create a new index file
+    indexTempFile = tmpMgr.makeTemporaryFile();
+    indexFile = new IndexFile::Writer( encryptionKey,
+                                       indexTempFile->getFileName() );
+  }
+
+  indexFile->add( currentBundle->getCurrentBundleInfo(), bundleId );
+
+  sptr< TemporaryFile > file = tmpMgr.makeTemporaryFile();
+
+  pendingBundleRenames.push_back( PendingBundleRename( file, bundleId ) ); // commit() later moves the temp file into place
+
+  // Create a new compressor
+
+  // Wait for some compressors to finish if there are too many of them
+  Lock _( runningCompressorsMutex );
+  while ( runningCompressors >= maxCompressorsToRun )
+    runningCompressorsCondition.wait( runningCompressorsMutex );
+
+  Compressor * compressor = new Compressor( *this, currentBundle,
+                                            file->getFileName() ); // compressor deletes itself when done
+
+  currentBundle.reset();
+  hasCurrentBundleId = false;
+
+  compressor->start();
+  ++runningCompressors; // protected by the lock taken above
+}
+
+void Writer::waitForAllCompressorsToFinish()
+{
+  Lock _( runningCompressorsMutex );
+  while ( runningCompressors )
+    runningCompressorsCondition.wait( runningCompressorsMutex );
+}
+
+Bundle::Id const & Writer::getCurrentBundleId()
+{
+  if ( !hasCurrentBundleId )
+  {
+    // Generate a new one
+    Random::genaratePseudo( &currentBundleId, sizeof( currentBundleId ) ); // restored "&" eaten by an HTML entity in transcription
+    hasCurrentBundleId = true;
+  }
+
+  return currentBundleId;
+}
+
+Writer::Compressor::Compressor( Writer & writer,
+                                sptr< Bundle::Creator > const & bundleCreator,
+                                string const & fileName ):
+  writer( writer ), bundleCreator( bundleCreator ), fileName( fileName )
+{
+}
+
+void * Writer::Compressor::threadFunction() throw() // dropped redundant injected-class-name qualifier
+{
+  try
+  {
+    bundleCreator->write( fileName, writer.encryptionKey );
+  }
+  catch( std::exception & e )
+  {
+    FAIL( "Bundle writing failed: %s", e.what() ); // fixed message typo "Bunding"
+  }
+
+  {
+    Lock _( writer.runningCompressorsMutex );
+    CHECK(
writer.runningCompressors, "no running compressors" ); + --writer.runningCompressors; + writer.runningCompressorsCondition.signal(); + } + + detach(); + + // We're in detached thread, so no further cleanup is necessary + delete this; + + return NULL; +} + +Reader::Reader( StorageInfo const & storageInfo, + EncryptionKey const & encryptionKey, + ChunkIndex & index, string const & bundlesDir, + size_t maxCacheSizeBytes ): + storageInfo( storageInfo ), encryptionKey( encryptionKey ), + index( index ), bundlesDir( bundlesDir ), + // We need to have at least one cached reader, otherwise we would have to + // unpack a bundle each time a chunk is read, even for consecutive chunks + // in the same bundle + cachedReaders( maxCacheSizeBytes < storageInfo.bundle_max_payload_size() ? + 1 : maxCacheSizeBytes / storageInfo.bundle_max_payload_size() ) +{ + verbosePrintf( "Using up to %zu MB of RAM as cache\n", + maxCacheSizeBytes / 1048576 ); +} + +void Reader::get( ChunkId const & chunkId, string & data, size_t & size ) +{ + if ( Bundle::Id const * bundleId = index.findChunk( chunkId ) ) + { + Bundle::Reader & reader = getReaderFor( *bundleId ); + reader.get( chunkId.toBlob(), data, size ); + } + else + { + string blob = chunkId.toBlob(); + throw exNoSuchChunk( toHex( ( unsigned char const * ) blob.data(), + blob.size() ) ); + } +} + +Bundle::Reader & Reader::getReaderFor( Bundle::Id const & id ) +{ + sptr< Bundle::Reader > & reader = cachedReaders.entry< Bundle::Reader >( + string( ( char const * ) &id, sizeof( id ) ) ); + + if ( !reader.get() ) + { + // Load the bundle + reader = + new Bundle::Reader( Bundle::generateFileName( id, bundlesDir, false ), + encryptionKey ); + } + + return *reader; +} + +} diff --git a/chunk_storage.hh b/chunk_storage.hh new file mode 100644 index 0000000..d94400e --- /dev/null +++ b/chunk_storage.hh @@ -0,0 +1,137 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#ifndef CHUNK_STORAGE_HH_INCLUDED__ +#define CHUNK_STORAGE_HH_INCLUDED__ + +#include +#include +#include +#include +#include + +#include "bundle.hh" +#include "chunk_id.hh" +#include "chunk_index.hh" +#include "encryption_key.hh" +#include "ex.hh" +#include "file.hh" +#include "index_file.hh" +#include "mt.hh" +#include "nocopy.hh" +#include "objectcache.hh" +#include "sptr.hh" +#include "tmp_mgr.hh" +#include "zbackup.pb.h" + +namespace ChunkStorage { + +using std::string; +using std::vector; +using std::pair; + +DEF_EX( Ex, "Chunk storage exception", std::exception ) + +/// Allows adding new chunks to the storage by filling up new bundles with them +/// and writing new index files +class Writer: NoCopy +{ +public: + /// All new bundles and index files are created as temp files. Call commit() + /// to move them to their permanent locations. commit() is never called + /// automatically! + Writer( StorageInfo const &, EncryptionKey const &, + TmpMgr &, ChunkIndex & index, string const & bundlesDir, + string const & indexDir, size_t maxCompressorsToRun ); + + /// Adds the given chunk to the store. If such a chunk has already existed + /// in the index, does nothing and returns false + bool add( ChunkId const &, void const * data, size_t size ); + + /// Commits all newly created bundles. Must be called before destroying the + /// object -- otherwise all work will be removed from the temp dir and lost + void commit(); + + ~Writer(); + +private: + /// Performs the compression in a separate thread. Destroys itself once done + class Compressor: public Thread + { + Writer & writer; + sptr< Bundle::Creator > bundleCreator; + string fileName; + public: + Compressor( Writer &, sptr< Bundle::Creator > const &, + string const & fileName ); + protected: + virtual void * threadFunction() throw(); + }; + + friend class Compressor; + + /// Returns the id of the currently written bundle. If there's none, generates + /// one. 
If a bundle hasn't yet started, still generates it - once the bundle + /// is started, it will be used then + Bundle::Id const & getCurrentBundleId(); + + /// Returns *currentBundle or creates a new one + Bundle::Creator & getCurrentBundle(); + + /// Writes the current bundle and deallocates it + void finishCurrentBundle(); + + /// Wait for all compressors to finish + void waitForAllCompressorsToFinish(); + + StorageInfo const & storageInfo; + EncryptionKey const & encryptionKey; + TmpMgr & tmpMgr; + ChunkIndex & index; + string bundlesDir, indexDir; + sptr< TemporaryFile > indexTempFile; + sptr< IndexFile::Writer > indexFile; + + sptr< Bundle::Creator > currentBundle; + Bundle::Id currentBundleId; + bool hasCurrentBundleId; + + size_t maxCompressorsToRun; + Mutex runningCompressorsMutex; + Condition runningCompressorsCondition; + size_t runningCompressors; + + /// Maps temp file of the bundle to its id blob + typedef pair< sptr< TemporaryFile >, Bundle::Id > PendingBundleRename; + vector< PendingBundleRename > pendingBundleRenames; +}; + +/// Allows retrieving existing chunks by extracting them from the bundles with +/// the help of an Index object +class Reader: NoCopy +{ +public: + DEF_EX_STR( exNoSuchChunk, "no such chunk found:", Ex ) + + Reader( StorageInfo const &, EncryptionKey const &, ChunkIndex & index, + string const & bundlesDir, size_t maxCacheSizeBytes ); + + /// Loads the given chunk from the store into the given buffer. May throw file + /// and decompression exceptions. 'data' may be enlarged but won't be shrunk. + /// The size of the actual chunk would be stored in 'size' + void get( ChunkId const &, string & data, size_t & size ); + + /// Retrieves the reader for the given bundle id. 
May employ caching + Bundle::Reader & getReaderFor( Bundle::Id const & ); + +private: + StorageInfo const & storageInfo; + EncryptionKey const & encryptionKey; + ChunkIndex & index; + string bundlesDir; + ObjectCache cachedReaders; +}; + +} + +#endif diff --git a/debug.cc b/debug.cc new file mode 100644 index 0000000..b1e6563 --- /dev/null +++ b/debug.cc @@ -0,0 +1,4 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +bool verboseMode = true; diff --git a/debug.hh b/debug.hh new file mode 100644 index 0000000..a48981c --- /dev/null +++ b/debug.hh @@ -0,0 +1,26 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef DEBUG_HH_INCLUDED__ +#define DEBUG_HH_INCLUDED__ + +#include + +// Macros we use to output debugging information + +#ifndef NDEBUG + +#define dPrintf( ... ) (fprintf( stderr, __VA_ARGS__ )) + +#else + +#define dPrintf( ... ) + +#endif + +extern bool verboseMode; + +#define verbosePrintf( ... ) ({ if ( verboseMode ) \ + fprintf( stderr, __VA_ARGS__ ); }) + +#endif diff --git a/dir.cc b/dir.cc new file mode 100644 index 0000000..ba7c527 --- /dev/null +++ b/dir.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include +#include +#include +#include +#include +#include + +#include "dir.hh" + +DIR * dir; + +namespace Dir { +bool exists( string const & name ) +{ + struct stat buf; + + return stat( name.c_str(), &buf ) == 0 && S_ISDIR( buf.st_mode ); +} + +void create( string const & name ) +{ + if ( mkdir( name.c_str(), 0777 ) != 0 ) + throw exCantCreate( name ); +} + +void remove( string const & name ) +{ + if ( rmdir( name.c_str() ) != 0 ) + throw exCantRemove( name ); +} + +string addPath( string const & first, string const & second ) +{ + if ( first.empty() ) + return second; + + if ( second.empty() ) + return first; + + if ( first[ first.size() - 1 ] == separator() ) + return first + second; + else + return first + separator() + second; +} + +string getRealPath( string const & path ) +{ + if ( char * r = realpath( path.c_str(), NULL ) ) + { + string result( r ); + free( r ); + return result; + } + else + throw exCantGetRealPath( path ); +} + +string getDirName( string const & path ) +{ + char const * c = path.c_str(); + std::vector< char > copy( c, c + path.size() + 1 ); + + return dirname( copy.data() ); +} + +Listing::Listing( string const & dirName ): dirName( dirName ) +{ + dir = opendir( dirName.c_str() ); + + if ( !dir ) + throw exCantList( dirName ); + +} + +Listing::~Listing() +{ + closedir( dir ); +} + +bool Listing::getNext( Entry & result ) +{ + dirent entry; + + dirent * entryPtr; + + struct stat entryStats; + + for ( ; ; ) + { + if ( readdir_r( dir, &entry, &entryPtr ) != 0 ) + throw exCantList( dirName ); + + if ( !entryPtr ) + return false; + + if ( fstatat( dirfd( dir ), entry.d_name, &entryStats, + AT_SYMLINK_NOFOLLOW ) != 0 ) + throw exCantList( dirName ); + + bool isDir = S_ISDIR( entryStats.st_mode ); + bool isSymLink = S_ISLNK( entryStats.st_mode ); + + if ( isDir && + ( entry.d_name[ 0 ] == '.' && + ( !entry.d_name[ 1 ] || entry.d_name[ 1 ] == '.' ) ) ) + { + // Skip the . or .. 
entries + continue; + } + + result = Entry( entry.d_name, isDir, isSymLink ); + return true; + } +} + +} diff --git a/dir.hh b/dir.hh new file mode 100644 index 0000000..ec2bda3 --- /dev/null +++ b/dir.hh @@ -0,0 +1,85 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef DIR_HH_INCLUDED__ +#define DIR_HH_INCLUDED__ + +#include +#include +#include +#include + +#include "ex.hh" +#include "nocopy.hh" + +using std::string; + +/// Directory-related operations +namespace Dir { + +DEF_EX( Ex, "Directory exception", std::exception ) +DEF_EX_STR( exCantCreate, "Can't create directory", Ex ) +DEF_EX_STR( exCantRemove, "Can't remove directory", Ex ) +DEF_EX_STR( exCantList, "Can't list directory", Ex ) +DEF_EX_STR( exCantGetRealPath, "Can't real path of", Ex ) + +/// Checks whether the given dir exists or not +bool exists( string const & ); + +/// Creates the given directory +void create( string const & ); + +/// Removes the given directory. It must be empty to be removed +void remove( string const & ); + +/// Adds one path to another, e.g. for /hello/world and baz/bar, returns +/// /hello/world/baz/bar +string addPath( string const & first, string const & second ); + +/// Returns the canonicalized absolute pathname with symlinks resolved +string getRealPath( string const & ); + +/// Returns the directory part of the given path +string getDirName( string const & ); + +/// A separator used to separate names in the path. 
+inline char separator() +{ return '/'; } + +class Entry +{ + string fileName; + bool dir; + bool symlink; + +public: + Entry() {} + Entry( string const & fileName, bool dir, bool symlink ): + fileName( fileName ), dir( dir ), symlink( symlink ) {} + + string const & getFileName() const + { return fileName; } + + bool isDir() const + { return dir; } + + bool isSymLink() const + { return symlink; } +}; + +/// Allows listing the directory +class Listing: NoCopy +{ + string dirName; + DIR * dir; +public: + Listing( string const & dirName ); + ~Listing(); + + /// Return true if entry was filled, false if end of dir was encountered + bool getNext( Entry & ); +}; + +} + +#endif diff --git a/encrypted_file.cc b/encrypted_file.cc new file mode 100644 index 0000000..e3b759f --- /dev/null +++ b/encrypted_file.cc @@ -0,0 +1,391 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include +#include + +#include "check.hh" +#include "encrypted_file.hh" +#include "endian.hh" +#include "page_size.hh" +#include "random.hh" + +namespace EncryptedFile { + +using Encryption::BlockSize; + +InputStream::InputStream( char const * fileName, EncryptionKey const & key, + void const * iv_ ): + file( fileName, UnbufferedFile::ReadOnly ), filePos( 0 ), key( key ), + // Our buffer must be larger than BlockSize, as otherwise we won't be able + // to handle PKCS#7 padding properly + buffer( std::max( getPageSize(), ( unsigned ) BlockSize * 2 ) ), + fill( 0 ), remainder( 0 ), backedUp( false ) +{ + if ( key.hasKey() ) + { + memcpy( iv, iv_, sizeof( iv ) ); + // Since we use padding, file size should be evenly dividable by the cipher + // block size, and we should have at least one block + UnbufferedFile::Offset size = file.size(); + if ( !size || size % BlockSize ) + throw exIncorrectFileSize(); + } +} + +bool InputStream::Next( void const ** data, int * size ) +{ + // If we backed up, return the unconsumed data + if ( backedUp ) + backedUp = 
false; + else + { + try + { + // Update adler32 for the previous block + adler32.add( start, fill ); + + // Read more data + if ( filePos && !remainder ) + { + // Once we're read a full block, we always have a remainder. If not, + // this means we've hit the end of file already + fill = 0; + return false; + } + + // If we have a remainder, move it to the beginning of buffer and make + // it start the next block + memmove( buffer.data(), start + fill, remainder ); + start = buffer.data(); + fill = file.read( start + remainder, buffer.size() - remainder ) + + remainder; + // remainder should techically be 0 now, but decrypt() will update it + // anyway + // remainder = 0; + decrypt(); + } + catch( UnbufferedFile::exReadError & ) + { + fill = 0; // To make sure state is remaining consistent + return false; + } + } + *data = start; + *size = fill; + filePos += fill; + return *size; +} + +void InputStream::BackUp( int count ) +{ + CHECK( count >= 0, "count is negative" ); + if ( !backedUp ) + { + CHECK( (size_t) count <= fill, "Backing up too much" ); + size_t consumed = fill - count; + adler32.add( start, consumed ); + start += consumed; + fill = count; + filePos -= count; + backedUp = fill; // Don't make the next Next() return 0 bytes + } + else + { + CHECK( count == 0, "backing up after being backed up already" ); + } +} + +bool InputStream::Skip( int count ) +{ + CHECK( count >= 0, "count is negative" ); + + // We always need to read and decrypt data, as otherwise both the state of + // CBC and adler32 would be incorrect + void const * data; + int size; + while( count ) + { + if ( !Next( &data, &size ) ) + return false; + else + if ( size > count ) + { + BackUp( size - count ); + break; + } + else + count -= size; + } + return true; +} + +int64_t InputStream::ByteCount() const +{ + return filePos; +} + +Adler32::Value InputStream::getAdler32() +{ + // This makes all data consumed, if not already + BackUp( 0 ); + return adler32.result(); +} + +void InputStream::read( 
void * buf, size_t size ) +{ + void const * data; + int avail; + char * n = ( char * ) buf; + while( size ) + { + if ( !Next( &data, &avail ) ) + throw exReadFailed(); + else + if ( avail > ( ssize_t ) size ) + { + memcpy( n, data, size ); + BackUp( avail - size ); + break; + } + else + { + memcpy( n, data, avail ); + n += avail; + size -= avail; + } + } +} + +void InputStream::checkAdler32() +{ + Adler32::Value ours = getAdler32(); + Adler32::Value r; + read( &r, sizeof( r ) ); + if ( ours != fromLittleEndian( r ) ) + throw exAdlerMismatch(); +} + +void InputStream::consumeRandomIv() +{ + if ( key.hasKey() ) + { + char iv[ Encryption::IvSize ]; + read( iv, sizeof( iv ) ); // read() can throw exceptions, Skip() can't + } +} + +void InputStream::decrypt() +{ + if ( fill == buffer.size() ) + { + // When we have the full buffer, we set the last block of it aside and + // treat the rest as the normal CBC sequence. The last block in the buffer + // may be the last block of file, in which case we would need to handle + // padding. That may happen the next time the function is called + remainder = BlockSize; + fill -= BlockSize; + doDecrypt(); + } + else + { + // This is an end of file. Decrypt data treating the last block being + // padded + + // Since we always have padding in the file and the last block is always + // set apart when reading full buffers, we must have at least one block + // to decrypt here + doDecrypt(); + + // Unpad the last block + if ( key.hasKey() ) + fill -= BlockSize - Encryption::unpad( start + fill - BlockSize ); + + // We have not left any remainder this time + remainder = 0; + } +} + +void InputStream::doDecrypt() +{ + if ( !key.hasKey() ) + return; + + // Since we use padding, file size should be evenly dividable by the cipher's + // block size, and we should always have at least one block. When we get here, + // we would always get the proper fill value unless those characteristics are + // not met. 
We check for the same condition on construction, but the file + // size can change while we are reading it + + // We don't throw an exception here as the interface we implement doesn't + // support them + CHECK( fill > 0 && !( fill % BlockSize ), "incorrect size of the encrypted " + "file - must be non-zero and in multiples of %u", + ( unsigned ) BlockSize ); + + // Copy the next iv prior to decrypting the data in place, as it will + // not be available afterwards + char newIv[ Encryption::IvSize ]; + memcpy( newIv, Encryption::getNextDecryptionIv( start, fill ), + sizeof( newIv ) ); + // Decrypt the data + Encryption::decrypt( iv, key.getKey(), start, start, fill ); + // Copy the new iv + memcpy( iv, newIv, sizeof( iv ) ); +} + +OutputStream::OutputStream( char const * fileName, EncryptionKey const & key, + void const * iv_ ): + file( fileName, UnbufferedFile::WriteOnly ), filePos( 0 ), key( key ), + buffer( getPageSize() ), start( buffer.data() ), avail( 0 ), backedUp( false ) +{ + if ( key.hasKey() ) + memcpy( iv, iv_, sizeof( iv ) ); +} + +bool OutputStream::Next( void ** data, int * size ) +{ + // If we backed up, return the unconsumed data + if ( backedUp ) + backedUp = false; + else + { + try + { + // Update adler32 for the previous block + adler32.add( start, avail ); + + // Encrypt and write the buffer if it had data + if ( filePos ) + encryptAndWrite( buffer.size() ); + + start = buffer.data(); + avail = buffer.size(); + } + catch( UnbufferedFile::exWriteError & ) + { + avail = 0; // To make sure state is remaining consistent + return false; + } + } + *data = start; + *size = avail; + filePos += avail; + return *size; +} + +void OutputStream::BackUp( int count ) +{ + CHECK( count >= 0, "count is negative" ); + if ( !backedUp ) + { + CHECK( (size_t) count <= avail, "Backing up too much" ); + size_t consumed = avail - count; + adler32.add( start, consumed ); + start += consumed; + avail = count; + filePos -= count; + backedUp = avail; // Don't make the next 
Next() return 0 bytes + } + else + { + CHECK( count == 0, "backing up after being backed up already" ); + } +} + +int64_t OutputStream::ByteCount() const +{ + return filePos; +} + +Adler32::Value OutputStream::getAdler32() +{ + // This makes all data consumed, if not already + BackUp( 0 ); + return adler32.result(); +} + +void OutputStream::write( void const * buf, size_t size ) +{ + void * data; + int avail; + char const * n = ( char const * ) buf; + while( size ) + { + if ( !Next( &data, &avail ) ) + throw exReadFailed(); + else + if ( avail > ( ssize_t ) size ) + { + memcpy( data, n, size ); + BackUp( avail - size ); + break; + } + else + { + memcpy( data, n, avail ); + n += avail; + size -= avail; + } + } +} + +void OutputStream::writeAdler32() +{ + Adler32::Value v = toLittleEndian( getAdler32() ); + write( &v, sizeof( v ) ); +} + +void OutputStream::writeRandomIv() +{ + if ( key.hasKey() ) + { + char iv[ Encryption::IvSize ]; + Random::genaratePseudo( iv, sizeof( iv ) ); + write( iv, sizeof( iv ) ); + } +} + +void OutputStream::encryptAndWrite( size_t bytes ) +{ + if ( key.hasKey() ) + { + CHECK( bytes > 0 && !( bytes % BlockSize ), "incorrect number of bytes to " + "encrypt and write - must be non-zero and in multiples of %u", + ( unsigned ) BlockSize ); + + void const * nextIv = Encryption::encrypt( iv, key.getKey(), buffer.data(), + buffer.data(), bytes ); + memcpy( iv, nextIv, sizeof( iv ) ); + } + + file.write( buffer.data(), bytes ); +} + +OutputStream::~OutputStream() +{ + // This makes all data consumed, if not already + BackUp( 0 ); + + // If we have the full buffer, write it first + if ( start == buffer.data() + buffer.size() ) + { + encryptAndWrite( buffer.size() ); + start = buffer.data(); + } + + size_t bytesToWrite = start - buffer.data(); + + if ( key.hasKey() ) + { + // Perform padding + size_t remainderSize = bytesToWrite % BlockSize; + + Encryption::pad( start - remainderSize, remainderSize ); + bytesToWrite += BlockSize - remainderSize; + } 
+ + encryptAndWrite( bytesToWrite ); +} + +} diff --git a/encrypted_file.hh b/encrypted_file.hh new file mode 100644 index 0000000..8a63d11 --- /dev/null +++ b/encrypted_file.hh @@ -0,0 +1,137 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef ENCRYPTED_FILE_HH_INCLUDED__ +#define ENCRYPTED_FILE_HH_INCLUDED__ + +#include +#include +#include +#include +#include +#include + +#include "adler32.hh" +#include "encryption.hh" +#include "encryption_key.hh" +#include "ex.hh" +#include "unbuffered_file.hh" + +/// Google's ZeroCopyStream implementations which read and write files encrypted +/// with our encryption mechanism. They also calculate adler32 of all file +/// content and write/check it at the end. +/// Encryption-wise we implement AES-128 in CBC mode with PKCS#7 padding. We +/// don't use EVP for this currently - everyone is welcome to change this, and +/// to add support for arbitrary ciphers, key lengths and modes of operations as +/// well. When no encryption key is set, no encryption or padding is done, but +/// everything else works the same way otherwise +namespace EncryptedFile { + +DEF_EX( Ex, "Encrypted file exception", std::exception ) +DEF_EX( exFileCorrupted, "encrypted file data is currupted", Ex ) +DEF_EX( exIncorrectFileSize, "size of the encrypted file is incorrect", exFileCorrupted ) +DEF_EX( exReadFailed, "read failed", Ex ) // Only thrown by InputStream::read() +DEF_EX( exAdlerMismatch, "adler32 mismatch", Ex ) + +class InputStream: public google::protobuf::io::ZeroCopyInputStream +{ +public: + /// Opens the input file. 
If EncryptionKey contains no key, the input won't be + /// decrypted and iv would be ignored + InputStream( char const * fileName, EncryptionKey const &, void const * iv ); + virtual bool Next( void const ** data, int * size ); + virtual void BackUp( int count ); + virtual bool Skip( int count ); + virtual int64_t ByteCount() const; + + + /// Returns adler32 of all data read so far. Calling this makes backing up + /// for the previous Next() call impossible - the data has to be consumed + Adler32::Value getAdler32(); + + /// Performs a traditional read, for convenience purposes + void read( void * buf, size_t size ); + + /// Reads an adler32 value from the stream and compares with checkAdler32(). + /// Throws an exception on mismatch + void checkAdler32(); + + /// Reads and discards the number of bytes equivalent to an IV size. This is + /// used when no IV is initially provided. + /// If there's no encryption key set, does nothing + void consumeRandomIv(); + + /// Closes the file + ~InputStream() {} + +private: + UnbufferedFile file; + UnbufferedFile::Offset filePos; + EncryptionKey const & key; + char iv[ Encryption::IvSize ]; + std::vector< char > buffer; + char * start; /// Points to the start of the data currently held in buffer + size_t fill; /// Number of bytes held in buffer + size_t remainder; /// Number of bytes held in buffer just after the main + /// 'fill'-bytes portion. We have to keep those to implement + /// PKCS#7 padding + bool backedUp; /// True if the BackUp operation was performed, and the buffer + /// contents are therefore unconsumed + Adler32 adler32; + + /// Decrypts 'fill' bytes at 'start', adjusting 'fill' and setting 'remainder' + void decrypt(); + /// Only used by decrypt() + void doDecrypt(); +}; + +class OutputStream: public google::protobuf::io::ZeroCopyOutputStream +{ +public: + /// Creates the output file. 
If EncryptionKey contains no key, the output + /// won't be encrypted and iv would be ignored + OutputStream( char const * fileName, EncryptionKey const &, void const * iv ); + virtual bool Next( void ** data, int * size ); + virtual void BackUp( int count ); + virtual int64_t ByteCount() const; + + /// Returns adler32 of all data written so far. Calling this makes backing up + /// for the previous Next() call impossible - the data has to be consumed + Adler32::Value getAdler32(); + + /// Performs a traditional write, for convenience purposes + void write( void const * buf, size_t size ); + + /// Writes the current adler32 value returned by getAdler32() to the stream + void writeAdler32(); + + /// Writes the number of random bytes equivalent to an IV size. This is used + /// when no IV is initially provided, and provides an equivalent of having + /// a random IV when used just after the stream has been opened. + /// If there's no encryption key set, does nothing + void writeRandomIv(); + + /// Finishes writing and closes the file + ~OutputStream(); + +private: + UnbufferedFile file; + UnbufferedFile::Offset filePos; + EncryptionKey const & key; + char iv[ Encryption::IvSize ]; + std::vector< char > buffer; + char * start; /// Points to the start of the area currently available for + /// writing to in buffer + size_t avail; /// Number of bytes available for writing to in buffer + bool backedUp; /// True if the BackUp operation was performed, and the buffer + /// contents are therefore unconsumed + Adler32 adler32; + + /// Encrypts and writes 'bytes' bytes from the beginning of the buffer. + /// 'bytes' must be non-zero and in multiples of BlockSize + void encryptAndWrite( size_t bytes ); +}; + +} + +#endif diff --git a/encryption.cc b/encryption.cc new file mode 100644 index 0000000..7ced622 --- /dev/null +++ b/encryption.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include + +#include "check.hh" +#include "encryption.hh" +#include "static_assert.hh" + +namespace Encryption { + +char const ZeroIv[ IvSize ] = { 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; + +void const * encrypt( void const * iv, void const * keyData, + void const * inData, void * outData, size_t size ) +{ + unsigned char block[ BlockSize ]; + + CHECK( !( size % BlockSize ), "size of data to encrypt is not a multiple of " + "block size" ); + + AES_KEY key; + AES_set_encrypt_key( ( unsigned char const * ) keyData, KeySize * 8, &key ); + + void const * prev = iv; + + // We do the operation in block size multiples. We do XOR in size_t + // multiples. The operation is endian-neutral + + // Make sure that BlockSize is a multiple of the size of size_t + STATIC_ASSERT( !( BlockSize % sizeof( size_t ) ) ); + + size_t const * inS = ( size_t const * ) inData; + unsigned char * out = ( unsigned char * ) outData; + + for ( size_t count = size / BlockSize; count--; ) + { + size_t const * prevS = ( size_t const * ) prev; + size_t * blockS = ( size_t * ) block; + + for ( size_t x = BlockSize / sizeof( size_t ); x--; ) + *blockS++ = *inS++ ^ *prevS++; + + AES_encrypt( block, out, &key ); + + prev = out; + out += BlockSize; + } + + return prev; +} + +void const * getNextDecryptionIv( void const * in, size_t size ) +{ + CHECK( !( size % BlockSize ), "size of data to decrypt is not a multiple of " + "block size" ); + return ( char const * ) in + size - BlockSize; +} + +void decrypt( void const * iv, void const * keyData, void const * inData, + void * outData, size_t size ) +{ + CHECK( !( size % BlockSize ), "size of data to decrypt is not a multiple of " + "block size" ); + + AES_KEY key; + AES_set_decrypt_key( ( unsigned char const * ) keyData, KeySize * 8, &key ); + + // We decrypt from the end to the beginning + + unsigned char const * in = ( unsigned char const * ) inData + size; + unsigned char * out = ( unsigned char * ) 
outData + size; + + size_t count = size / BlockSize; + + size_t const * prevS = ( size_t const * )( in - BlockSize ); + + size_t * outS = ( size_t * ) out; + + while( count-- ) + { + if ( prevS == inData ) + prevS = ( size_t const * )( ( unsigned char const * ) iv + BlockSize ); + + in -= BlockSize; + + AES_decrypt( in, ( unsigned char * ) outS - BlockSize, &key ); + + for ( size_t x = BlockSize / sizeof( size_t ); x--; ) + *--outS ^= *--prevS; + } +} + +void pad( void * data, size_t size ) +{ + CHECK( size < BlockSize, "size to pad is too large: %zu bytes", size ); + unsigned char * p = ( unsigned char * ) data + size; + unsigned char v = BlockSize - size; + for ( size_t count = v; count--; ) + *p++ = v; +} + +size_t unpad( void const * data ) +{ + unsigned char const * p = ( unsigned char const * ) data + BlockSize - 1; + unsigned char v = *p; + if ( !v || v > BlockSize ) + throw exBadPadding(); + + // Check the rest of the padding + for ( size_t count = v - 1; count--; ) + if ( *--p != v ) + throw exBadPadding(); + + return BlockSize - v; +} +} diff --git a/encryption.hh b/encryption.hh new file mode 100644 index 0000000..486b57c --- /dev/null +++ b/encryption.hh @@ -0,0 +1,56 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef ENCRYPTION_HH_INCLUDED__ +#define ENCRYPTION_HH_INCLUDED__ + +#include +#include + +#include "ex.hh" + +/// What we implement right now is AES-128 in CBC mode with PKCS#7 padding +namespace Encryption { + +enum +{ + KeySize = 16, /// Size of the key in bytes + IvSize = 16, /// Size of the IV data in bytes + BlockSize = 16 /// Cipher block size in bytes +}; + +DEF_EX( exBadPadding, "Bad padding encountered", std::exception ) + +/// Encrypts 'size' bytes of the data pointed to by 'in', outputting 'size' +/// bytes to 'out'. 'key' points to KeySize bytes of the key data. 'iv' points +/// to IvSize bytes used as an initialization vector. 'in' and 'out' can be the +/// same. 
'size' must be a multiple of BlockSize. Returns a pointer to the +/// IV which should be used to continue encrypting, which in CBC is the last +/// encrypted block +void const * encrypt( void const * iv, void const * key, void const * in, + void * out, size_t size ); + +/// Returns a pointer to the IV which should be used to decrypt the block next +/// to the given one, which in CBC is the last encrypted block. Note that if an +/// in-place decryption is performed, this IV should be saved first, as it will +/// be overwritten with the decrypted data. For size == 0, the returned pointer +/// is invalid +void const * getNextDecryptionIv( void const * in, size_t size ); + +/// The reverse of encrypt() +void decrypt( void const * iv, void const * key, void const * in, void * out, + size_t size ); + +/// Pads the last block to be encrypted, pointed to by 'data', 'size' bytes, +/// which should be less than BlockSize, to occupy BlockSize bytes +void pad( void * data, size_t size ); + +/// Returns the size of the padded data. The data itself is unchanged - use the +/// first bytes of 'data'. Can throw exBadPadding +size_t unpad( void const * data ); + +/// The IV consisting of zero bytes. Use it when there is no IV +extern char const ZeroIv[ IvSize ]; +} + +#endif diff --git a/encryption_key.cc b/encryption_key.cc new file mode 100644 index 0000000..f95c3ff --- /dev/null +++ b/encryption_key.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include +#include +#include +#include + +#include "check.hh" +#include "encryption_key.hh" +#include "random.hh" + +namespace { +/// Derives an encryption key from a password and key info +void deriveKey( string const & password, EncryptionKeyInfo const & info, + void * key, unsigned keySize ) +{ + CHECK( PKCS5_PBKDF2_HMAC_SHA1( password.data(), password.size(), + (unsigned char const *) info.salt().data(), + info.salt().size(), info.rounds(), keySize, + (unsigned char *) key ) == 1, + "encryption key derivation failed" ); +} + +string calculateKeyHmac( void const * key, unsigned keySize, + string const & input ) +{ + char result[ EVP_MAX_MD_SIZE ]; + unsigned resultSize; + CHECK( HMAC( EVP_sha1(), (unsigned char const *) key, keySize, + (unsigned char const *) input.data(), input.size(), + (unsigned char *) result, &resultSize ), + "encryption key HMAC calcuation failed" ); + + return string( result, result + resultSize ); +} +} + +EncryptionKey::EncryptionKey( string const & password, + EncryptionKeyInfo const * info ) +{ + if ( !info ) + isSet = false; + else + { + isSet = true; + + char derivedKey[ KeySize ]; + deriveKey( password, *info, derivedKey, sizeof( derivedKey ) ); + + AES_KEY aesKey; + AES_set_decrypt_key( ( unsigned char const * ) derivedKey, 128, &aesKey ); + AES_decrypt( ( unsigned char const * ) info->encrypted_key().data(), + ( unsigned char * ) key, &aesKey ); + + if ( calculateKeyHmac( key, sizeof( key ), info->key_check_input() ) != + info->key_check_hmac() ) + throw exInvalidPassword(); + } +} + +EncryptionKey::~EncryptionKey() +{ + // Clear the key from memory + memset( key, 0, sizeof( key ) ); +} + +void EncryptionKey::generate( string const & password, + EncryptionKeyInfo & info ) +{ + // Use this buf for salts + char buf[ 16 ]; + + Random::genaratePseudo( buf, sizeof( buf ) ); + info.set_salt( buf, sizeof( buf ) ); + info.set_rounds( 10000 ); // TODO: make this configurable + + char derivedKey[ 
KeySize ]; + deriveKey( password, info, derivedKey, sizeof( derivedKey ) ); + + char key[ KeySize ]; + + Random::genarateTrue( key, sizeof( key ) ); + + // Fill in the HMAC verification part + Random::genaratePseudo( buf, sizeof( buf ) ); + info.set_key_check_input( buf, sizeof( buf ) ); + info.set_key_check_hmac( calculateKeyHmac( key, sizeof( key ), + info.key_check_input() ) ); + + // Encrypt the key + AES_KEY aesKey; + AES_set_encrypt_key( ( unsigned char const * ) derivedKey, 128, &aesKey ); + char encryptedKey[ sizeof( key ) ]; + AES_encrypt( ( unsigned char const * ) key, + ( unsigned char * ) encryptedKey, &aesKey ); + info.set_encrypted_key( encryptedKey, sizeof( encryptedKey ) ); + + // Clear the key from memory + memset( key, 0, sizeof( key ) ); +} + +EncryptionKey const & EncryptionKey::noKey() +{ + static EncryptionKey key( string(), NULL ); + return key; +} diff --git a/encryption_key.hh b/encryption_key.hh new file mode 100644 index 0000000..48db8df --- /dev/null +++ b/encryption_key.hh @@ -0,0 +1,50 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef ENCRYPTION_KEY_HH_INCLUDED__ +#define ENCRYPTION_KEY_HH_INCLUDED__ + +#include +#include + +#include "ex.hh" +#include "zbackup.pb.h" + +using std::string; + +class EncryptionKey +{ + bool isSet; + unsigned const static KeySize = 16; // TODO: make this configurable + char key[ KeySize ]; + +public: + DEF_EX( exInvalidPassword, "Invalid password specified", std::exception ) + + /// Decodes the encryption key from the given info and password. If info is + /// passed as NULL, the password is ignored and no key is set + EncryptionKey( string const & password, EncryptionKeyInfo const * ); + ~EncryptionKey(); + + /// Returns true if key was set, false otherwise. + bool hasKey() const + { return isSet; } + + /// Returns the key. Check if there is one with hasKey() first. 
Note: the key + /// should not be copied, as it may be allocated in a locked page in the + /// future + void const * getKey() const + { return key; } + + /// Returns key size, in bytes + unsigned getKeySize() const + { return sizeof( key ); } + + /// Generates new key info using the given password + static void generate( string const & password, EncryptionKeyInfo & ); + + /// Returns a static instance without any key set + static EncryptionKey const & noKey(); +}; + +#endif diff --git a/endian.hh b/endian.hh new file mode 100644 index 0000000..7150c6f --- /dev/null +++ b/endian.hh @@ -0,0 +1,24 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef ENDIAN_HH_INCLUDED__ +#define ENDIAN_HH_INCLUDED__ + +#include +#include +#include + +#if __BYTE_ORDER != __LITTLE_ENDIAN +#error Please add support for architectures different from little-endian. +#endif + +/// Converts the given host-order value to big-endian value +inline uint32_t toBigEndian( uint32_t v ) { return htonl( v ); } +/// Converts the given host-order value to little-endian value +inline uint32_t toLittleEndian( uint32_t v ) { return v; } +inline uint64_t toLittleEndian( uint64_t v ) { return v; } +/// Converts the given little-endian value to host-order value +inline uint32_t fromLittleEndian( uint32_t v ) { return v; } +inline uint64_t fromLittleEndian( uint64_t v ) { return v; } + +#endif diff --git a/ex.hh b/ex.hh new file mode 100644 index 0000000..466acf6 --- /dev/null +++ b/ex.hh @@ -0,0 +1,54 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#ifndef EX_HH_INCLUDED__ +#define EX_HH_INCLUDED__ + +#include +#include +#include + +/// A way to declare an exception class fast +/// Do like this: +/// DEF_EX( exErrorInFoo, "An error in foo encountered", std::exception ) +/// DEF_EX( exFooNotFound, "Foo was not found", exErrorInFoo ) + +#define DEF_EX( exName, exDescription, exParent ) \ +class exName: public exParent { \ +public: \ +virtual const char * what() const throw() { return (exDescription); } \ +virtual ~exName() throw() {} }; + +/// Same as DEF_EX, but takes a runtime string argument, which gets concatenated +/// with the description. +/// +/// DEF_EX_STR( exCantOpen, "can't open file", std::exception ) +/// ... +/// throw exCantOpen( "example.txt" ); +/// +/// what() would return "can't open file example.txt" + +#define DEF_EX_STR( exName, exDescription, exParent ) \ +class exName: public exParent { \ + std::string value; \ +public: \ + exName( std::string const & value_ ): value( std::string( exDescription ) + " " + value_ ) {} \ + exName( char const * value_, unsigned size ): value( std::string( exDescription ) + " " + std::string( value_, size ) ) {} \ +virtual const char * what() const throw() { return value.c_str(); } \ +virtual ~exName() throw() {} }; + +/// An exception class to wrap leave code into an std::exception +class exLeaveWrapped: public std::exception +{ + char buf[ 32 ]; + +public: + + exLeaveWrapped( int error ) + { sprintf( buf, "%d", error ); } + + char const * what() const throw() + { return buf; } +}; + +#endif diff --git a/file.cc b/file.cc new file mode 100644 index 0000000..b91b05e --- /dev/null +++ b/file.cc @@ -0,0 +1,361 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include +#include +#include +#include +#include + +#include "file.hh" + +enum +{ + // We employ a writing buffer to considerably speed up file operations when + // they consists of many small writes. 
The default size for the buffer is 64k + WriteBufferSize = 65536 +}; + +bool File::exists( char const * filename ) throw() +{ +#ifdef __WIN32 + struct _stat buf; + return _stat( filename, &buf ) == 0; +#else + struct stat buf; + + // EOVERFLOW rationale: if the file is too large, it still does exist + return stat( filename, &buf ) == 0 || errno == EOVERFLOW; +#endif +} + +void File::erase( std::string const & filename ) throw( exCantErase ) +{ + if ( remove( filename.c_str() ) != 0 ) + throw exCantErase( filename ); +} + +void File::rename( std::string const & from, + std::string const & to ) throw( exCantRename ) +{ + if ( ::rename( from.c_str(), to.c_str() ) != 0 ) + throw exCantRename( from + " to " + to ); +} + +void File::open( char const * filename, OpenMode mode ) throw( exCantOpen ) +{ + char const * m; + + switch( mode ) + { + case Update: + m = "r+b"; + break; + case WriteOnly: + m = "wb"; + break; + default: + m = "rb"; + } + + f = fopen( filename, m ); + + if ( !f ) + throw exCantOpen( std::string( filename ) + ": " + strerror( errno ) ); +} + +File::File( char const * filename, OpenMode mode ) throw( exCantOpen ): + writeBuffer( 0 ) +{ + open( filename, mode ); +} + +File::File( std::string const & filename, OpenMode mode ) + throw( exCantOpen ): writeBuffer( 0 ) +{ + open( filename.c_str(), mode ); +} + +void File::read( void * buf, size_t size ) throw( exReadError, exWriteError ) +{ + if ( !size ) + return; + + if ( writeBuffer ) + flushWriteBuffer(); + + size_t result = fread( buf, size, 1, f ); + + if ( result != 1 ) + { + if ( !ferror( f ) ) + throw exShortRead(); + else + throw exReadErrorDetailed( f ); + } +} + +size_t File::readRecords( void * buf, size_t size, size_t count ) throw( exWriteError ) +{ + if ( writeBuffer ) + flushWriteBuffer(); + + return fread( buf, size, count, f ); +} + +void File::write( void const * buf, size_t size ) throw( exWriteError ) +{ + if ( !size ) + return; + + if ( size >= WriteBufferSize ) + { + // If the write 
is large, there's not much point in buffering + flushWriteBuffer(); + + size_t result = fwrite( buf, size, 1, f ); + + if ( result != 1 ) + throw exWriteError(); + + return; + } + + if ( !writeBuffer ) + { + // Allocate the writing buffer since we don't have any yet + writeBuffer = new char[ WriteBufferSize ]; + writeBufferLeft = WriteBufferSize; + } + + size_t toAdd = size < writeBufferLeft ? size : writeBufferLeft; + + memcpy( writeBuffer + ( WriteBufferSize - writeBufferLeft ), + buf, toAdd ); + + size -= toAdd; + writeBufferLeft -= toAdd; + + if ( !writeBufferLeft ) // Out of buffer? Flush it + { + flushWriteBuffer(); + + if ( size ) // Something's still left? Add to buffer + { + memcpy( writeBuffer, (char const *)buf + toAdd, size ); + writeBufferLeft -= size; + } + } +} + +size_t File::writeRecords( void const * buf, size_t size, size_t count ) + throw( exWriteError ) +{ + flushWriteBuffer(); + + return fwrite( buf, size, count, f ); +} + +char * File::gets( char * s, int size, bool stripNl ) + throw( exWriteError ) +{ + if ( writeBuffer ) + flushWriteBuffer(); + + char * result = fgets( s, size, f ); + + if ( result && stripNl ) + { + size_t len = strlen( result ); + + char * last = result + len; + + while( len-- ) + { + --last; + + if ( *last == '\n' || *last == '\r' ) + *last = 0; + else + break; + } + } + + return result; +} + +std::string File::gets( bool stripNl ) throw( exReadError, exWriteError ) +{ + char buf[ 1024 ]; + + if ( !gets( buf, sizeof( buf ), stripNl ) ) + { + if ( !ferror( f ) ) + throw exShortRead(); + else + throw exReadErrorDetailed( f ); + } + + return std::string( buf ); +} + +void File::seek( long offset ) throw( exSeekError, exWriteError ) +{ + if ( writeBuffer ) + flushWriteBuffer(); + + if ( fseek( f, offset, SEEK_SET ) != 0 ) + throw exSeekError(); +} + +void File::seekCur( long offset ) throw( exSeekError, exWriteError ) +{ + if ( writeBuffer ) + flushWriteBuffer(); + + if ( fseek( f, offset, SEEK_CUR ) != 0 ) + throw 
exSeekError(); +} + +void File::seekEnd( long offset ) throw( exSeekError, exWriteError ) +{ + if ( writeBuffer ) + flushWriteBuffer(); + + if ( fseek( f, offset, SEEK_END ) != 0 ) + throw exSeekError(); +} + +void File::rewind() throw( exSeekError, exWriteError ) +{ + seek( 0 ); +} + +size_t File::tell() throw( exSeekError ) +{ + long result = ftell( f ); + + if ( result == -1 ) + throw exSeekError(); + + if ( writeBuffer ) + result += ( WriteBufferSize - writeBufferLeft ); + + return ( size_t ) result; +} + +size_t File::size() throw( exSeekError, exWriteError ) +{ + size_t cur = tell(); + seekEnd( 0 ); + size_t result = tell(); + seek( cur ); + + return result; +} + +bool File::eof() throw( exWriteError ) +{ + if ( writeBuffer ) + flushWriteBuffer(); + + return feof( f ); +} + +FILE * File::file() throw( exWriteError ) +{ + flushWriteBuffer(); + + return f; +} + +FILE * File::release() throw( exWriteError ) +{ + releaseWriteBuffer(); + + FILE * c = f; + + f = 0; + + return c; +} + +void File::close() throw( exWriteError ) +{ + fclose( release() ); +} + +File::~File() throw() +{ + if ( f ) + { + try + { + releaseWriteBuffer(); + } + catch( exWriteError & ) + { + } + fclose( f ); + } +} + +void File::flushWriteBuffer() throw( exWriteError ) +{ + if ( writeBuffer && writeBufferLeft != WriteBufferSize ) + { + size_t result = fwrite( writeBuffer, WriteBufferSize - writeBufferLeft, 1, f ); + + if ( result != 1 ) + throw exWriteError(); + + writeBufferLeft = WriteBufferSize; + } +} + +void File::releaseWriteBuffer() throw( exWriteError ) +{ + flushWriteBuffer(); + + if ( writeBuffer ) + { + delete [] writeBuffer; + + writeBuffer = 0; + } +} + +File::exReadErrorDetailed::exReadErrorDetailed( int fd ) +{ + buildDescription( fd ); +} + +File::exReadErrorDetailed::exReadErrorDetailed( FILE * f ) +{ + buildDescription( fileno( f ) ); +} + +void File::exReadErrorDetailed::buildDescription( int fd ) +{ + description = "Error reading from file "; + + char path[ PATH_MAX ]; + 
char procFdLink[ 48 ]; + sprintf( procFdLink, "/proc/self/fd/%d", fd ); + + int pathChars = readlink( procFdLink, path, sizeof( path ) ); + + if ( pathChars < 0 ) + description += "(unknown)"; + else + description.append( path, pathChars ); +} + +const char * File::exReadErrorDetailed::what() const throw() +{ + return description.c_str(); +} + +File::exReadErrorDetailed::~exReadErrorDetailed() throw () +{ +} diff --git a/file.hh b/file.hh new file mode 100644 index 0000000..df494aa --- /dev/null +++ b/file.hh @@ -0,0 +1,160 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef FILE_HH_INCLUDED__ +#define FILE_HH_INCLUDED__ + +#include +#include +#include +#include + +#include "ex.hh" + +using std::string; + +/// A simple wrapper over FILE * operations with added write-buffering +class File +{ + FILE * f; + char * writeBuffer; + size_t writeBufferLeft; + +public: + DEF_EX( Ex, "File exception", std::exception ) + DEF_EX_STR( exCantOpen, "Can't open", Ex ) + DEF_EX( exReadError, "Error reading from file", Ex ) + DEF_EX( exShortRead, "Short read from the file", exReadError ) + DEF_EX( exWriteError, "Error writing to the file", Ex ) + DEF_EX( exSeekError, "File seek error", Ex ) + DEF_EX_STR( exCantErase, "Can't erase file", Ex ) + DEF_EX_STR( exCantRename, "Can't rename file", Ex ) + + enum OpenMode + { + ReadOnly, + WriteOnly, + Update + }; + + typedef long Offset; + + File( char const * filename, OpenMode ) + throw( exCantOpen ); + + File( std::string const & filename, OpenMode ) + throw( exCantOpen ); + + /// Reads the number of bytes to the buffer, throws an error if it + /// failed to fill the whole buffer (short read, i/o error etc) + void read( void * buf, size_t size ) throw( exReadError, exWriteError ); + + template< typename T > + void read( T & value ) throw( exReadError, exWriteError ) + { read( &value, sizeof( value ) ); } + + template< typename T > + T read() throw( exReadError, exWriteError ) 
+ { T value; read( value ); return value; } + + /// Attempts reading at most 'count' records sized 'size'. Returns + /// the number of records it managed to read, up to 'count' + size_t readRecords( void * buf, size_t size, size_t count ) throw( exWriteError ); + + /// Writes the number of bytes from the buffer, throws an error if it + /// failed to write the whole buffer (short write, i/o error etc). + /// This function employs write buffering, and as such, writes may not + /// end up on disk immediately, or a short write may occur later + /// than it really did. If you don't want write buffering, use + /// writeRecords() function instead + void write( void const * buf, size_t size ) throw( exWriteError ); + + template< typename T > + void write( T const & value ) throw( exWriteError ) + { write( &value, sizeof( value ) ); } + + /// Attempts writing at most 'count' records sized 'size'. Returns + /// the number of records it managed to write, up to 'count'. + /// This function does not employ buffering, but flushes the buffer if it + /// was used before + size_t writeRecords( void const * buf, size_t size, size_t count ) + throw( exWriteError ); + + /// Reads a string from the file. Unlike the normal fgets(), this one + /// can strip the trailing newline character, if this was requested. 
+ /// Returns either s or 0 if no characters were read + char * gets( char * s, int size, bool stripNl = false ) throw( exWriteError ); + + /// Like the above, but uses its own local internal buffer (1024 bytes + /// currently), and strips newlines by default + std::string gets( bool stripNl = true ) throw( exReadError, exWriteError ); + + /// Seeks in the file, relative to its beginning + void seek( long offset ) throw( exSeekError, exWriteError ); + /// Seeks in the file, relative to the current position + void seekCur( long offset ) throw( exSeekError, exWriteError ); + /// Seeks in the file, relative to the end of file + void seekEnd( long offset = 0 ) throw( exSeekError, exWriteError ); + + /// Seeks to the beginning of file + void rewind() throw( exSeekError, exWriteError ); + + /// Tells the current position within the file, relative to its beginning + size_t tell() throw( exSeekError ); + + /// Returns file size + size_t size() throw( exSeekError, exWriteError ); + + /// Returns true if end-of-file condition is set + bool eof() throw( exWriteError ); + + /// Returns the underlying FILE * record, so other operations can be + /// performed on it + FILE * file() throw( exWriteError ); + + /// Releases the file handle out of the control of the class. No further + /// operations are valid. The file will not be closed on destruction + FILE * release() throw( exWriteError ); + + /// Closes the file. 
No further operations are valid + void close() throw( exWriteError ); + + /// Checks if the file exists or not + static bool exists( char const * filename ) throw(); + + static bool exists( std::string const & filename ) throw() + { return exists( filename.c_str() ); } + + ~File() throw(); + + /// Erases the given file + static void erase( std::string const & ) throw( exCantErase ); + + /// Renames the given file + static void rename( std::string const & from, + std::string const & to ) throw( exCantRename ); + + /// Throwing this class instead of exReadError will make the description + /// include the file name + class exReadErrorDetailed: public exReadError + { + string description; + + public: + exReadErrorDetailed( int fd ); + exReadErrorDetailed( FILE * f ); + virtual const char * what() const throw(); + virtual ~exReadErrorDetailed() throw (); + + private: + void buildDescription( int fd ); + }; + +private: + + void open( char const * filename, OpenMode ) throw( exCantOpen ); + void flushWriteBuffer() throw( exWriteError ); + void releaseWriteBuffer() throw( exWriteError ); +}; + +#endif diff --git a/hex.cc b/hex.cc new file mode 100644 index 0000000..ab8de0d --- /dev/null +++ b/hex.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include "hex.hh" + +using std::string; + +namespace { +/// Converts 'size' bytes pointed to by 'in' into a hex string pointed to by +/// 'out'. It should have at least size * 2 bytes. No trailing zero is added +void hexify( unsigned char const * in, unsigned size, char * out ) +{ + while( size-- ) + { + unsigned char v = *in++; + + *out++ = ( v >> 4 < 10 ) ? '0' + ( v >> 4 ) : 'a' + ( v >> 4 ) - 10; + *out++ = ( ( v & 0xF ) < 10 ) ? 
'0' + ( v & 0xF ) : 'a' + ( v & 0xF ) - 10; + } +} +} + +string toHex( unsigned char const * in, unsigned size ) +{ + string result( size * 2, 0 ); + hexify( in, size, &result[ 0 ] ); + + return result; +} diff --git a/hex.hh b/hex.hh new file mode 100644 index 0000000..36d638f --- /dev/null +++ b/hex.hh @@ -0,0 +1,12 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef HEX_HH_INCLUDED__ +#define HEX_HH_INCLUDED__ + +#include + +/// Converts 'size' bytes pointed to by 'in' into a hex string +std::string toHex( unsigned char const * in, unsigned size ); + +#endif diff --git a/index_file.cc b/index_file.cc new file mode 100644 index 0000000..b5dedc6 --- /dev/null +++ b/index_file.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include + +#include "bundle.hh" +#include "encryption.hh" +#include "index_file.hh" +#include "message.hh" + +namespace IndexFile { + +enum +{ + FileFormatVersion = 1 +}; + +Writer::Writer( EncryptionKey const & key, string const & fileName ): + stream( fileName.c_str(), key, Encryption::ZeroIv ) +{ + stream.writeRandomIv(); + FileHeader header; + header.set_version( FileFormatVersion ); + Message::serialize( header, stream ); +} + +void Writer::add( BundleInfo const & info, Bundle::Id const & bundleId ) +{ + IndexBundleHeader header; + header.set_id( &bundleId, sizeof( bundleId ) ); + + Message::serialize( header, stream ); + Message::serialize( info, stream ); +} + +Writer::~Writer() +{ + // Final record which does not have a bundle id + IndexBundleHeader header; + Message::serialize( header, stream ); + stream.writeAdler32(); +} + +Reader::Reader( EncryptionKey const & key, string const & fileName ): + stream( fileName.c_str(), key, Encryption::ZeroIv ) +{ + stream.consumeRandomIv(); + + FileHeader header; + Message::parse( header, stream ); + + if ( header.version() != FileFormatVersion ) + throw 
exUnsupportedVersion(); +} + +bool Reader::readNextRecord( BundleInfo & info, Bundle::Id & bundleId ) +{ + IndexBundleHeader header; + Message::parse( header, stream ); + + if ( header.has_id() ) + { + if ( header.id().size() != sizeof( bundleId ) ) + throw exIncorrectBundleIdSize(); + + memcpy( &bundleId, header.id().data(), sizeof( bundleId ) ); + + Message::parse( info, stream ); + return true; + } + else + { + stream.checkAdler32(); + return false; + } +} + +} diff --git a/index_file.hh b/index_file.hh new file mode 100644 index 0000000..faca834 --- /dev/null +++ b/index_file.hh @@ -0,0 +1,60 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef INDEX_FILE_HH_INCLUDED__ +#define INDEX_FILE_HH_INCLUDED__ + +#include +#include + +#include "adler32.hh" +#include "bundle.hh" +#include "encrypted_file.hh" +#include "encryption_key.hh" +#include "ex.hh" +#include "file.hh" +#include "nocopy.hh" +#include "zbackup.pb.h" + +/// Index files store all existing chunk ids and their bundle ids. This +/// information can also be retrieved by scanning all bundle files, but that +/// would incur a lot of disk seeks which we want to minimize here +namespace IndexFile { + +using std::string; + +/// Creates index files +class Writer: NoCopy +{ + EncryptedFile::OutputStream stream; + +public: + /// Creates a new chunk log. 
Initially it is stored in a temporary file + Writer( EncryptionKey const &, string const & fileName ); + + /// Adds a bundle info to the log + void add( BundleInfo const &, Bundle::Id const & bundleId ); + + /// Finalizes the file + ~Writer(); +}; + +/// Reads index files +class Reader: NoCopy +{ + EncryptedFile::InputStream stream; + +public: + DEF_EX( Ex, "Index file reader exception", std::exception ) + DEF_EX( exUnsupportedVersion, "Unsupported version of the index file format", Ex ) + DEF_EX( exIncorrectBundleIdSize, "Incorrect bundle id size encountered", Ex ) + + Reader( EncryptionKey const &, string const & fileName ); + + /// Reads the next record from the file. Returns false if no more records can + /// be found + bool readNextRecord( BundleInfo &, Bundle::Id & bundleId ); +}; +} + +#endif diff --git a/message.cc b/message.cc new file mode 100644 index 0000000..f919181 --- /dev/null +++ b/message.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include "message.hh" + +#include + +namespace Message { + +void serialize( MessageLite const & message, ZeroCopyOutputStream & stream ) +{ + CodedOutputStream cos( &stream ); + serialize( message, cos ); +} + +void serialize( MessageLite const & message, CodedOutputStream & cos ) +{ + cos.WriteVarint32( message.ByteSize() ); + message.SerializeWithCachedSizes( &cos ); + if ( cos.HadError() ) + throw exCantSerialize( message.GetTypeName() ); +} + +void parse( MessageLite & message, ZeroCopyInputStream & stream ) +{ + CodedInputStream cis( &stream ); + parse( message, cis ); +} + +void parse( MessageLite & message, CodedInputStream & cis ) +{ + uint32_t v; + if ( !cis.ReadVarint32( &v ) ) + throw exCantParse( message.GetTypeName() ); + + CodedInputStream::Limit limit = cis.PushLimit( v ); + + if( !message.ParseFromCodedStream( &cis ) ) + throw exCantParse( message.GetTypeName() ); + + cis.PopLimit( limit ); +} + +} diff --git a/message.hh b/message.hh new file mode 100644 index 0000000..152bf2a --- /dev/null +++ b/message.hh @@ -0,0 +1,40 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#ifndef MESSAGE_HH_INCLUDED__ +#define MESSAGE_HH_INCLUDED__ + +#include +#include +#include +#include + +#include "ex.hh" + +/// Some utilities for protobuffer messages +namespace Message { + +DEF_EX( Ex, "Message exception", std::exception ) +DEF_EX_STR( exCantParse, "Can't parse message", Ex ) +DEF_EX_STR( exCantSerialize, "Can't serialize message", Ex ) + +using google::protobuf::io::ZeroCopyOutputStream; +using google::protobuf::io::ZeroCopyInputStream; +using google::protobuf::io::CodedInputStream; +using google::protobuf::io::CodedOutputStream; +using google::protobuf::MessageLite; + +/// Serializes the given message to the given zero-copy stream +void serialize( MessageLite const &, ZeroCopyOutputStream & ); + +/// Serializes the given message to the given coded stream +void serialize( MessageLite const &, CodedOutputStream & ); + +/// Reads and parses the given message from the given zero-copy stream +void parse( MessageLite &, ZeroCopyInputStream & ); + +/// Reads and parses the given message from the given coded stream +void parse( MessageLite &, CodedInputStream & ); +} + +#endif diff --git a/mt.cc b/mt.cc new file mode 100644 index 0000000..9e5cbdb --- /dev/null +++ b/mt.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include "mt.hh" + +#include +#include "check.hh" + +Mutex::Mutex() +{ + pthread_mutex_init( &mutex, 0 ); +} + +void Mutex::lock() +{ + pthread_mutex_lock( &mutex ); +} + +void Mutex::unlock() +{ + pthread_mutex_unlock( &mutex ); +} + +Mutex::~Mutex() +{ + pthread_mutex_destroy( &mutex ); +} + +Condition::Condition() +{ + pthread_cond_init( &cond, 0 ); +} + +void Condition::signal() +{ + pthread_cond_signal( &cond ); +} + +void Condition::broadcast() +{ + pthread_cond_broadcast( &cond ); +} + +void Condition::wait( Mutex & m ) +{ + pthread_cond_wait( &cond, &m.mutex ); +} + +Condition::~Condition() +{ + pthread_cond_destroy( &cond ); +} + +void * Thread::__thread_routine( void * param ) +{ + return ( (Thread *)param ) -> threadFunction(); +} + +void Thread::start() +{ + CHECK( pthread_create( &thread, 0, &__thread_routine, this ) == 0, + "pthread_create() failed" ); +} + +void Thread::detach() +{ + CHECK( pthread_detach( thread ) == 0, "pthread_detach() failed" ); +} + +void * Thread::join() +{ + void * ret; + pthread_join( thread, &ret ); + return ret; +} + +size_t getNumberOfCpus() +{ + long result = sysconf( _SC_NPROCESSORS_ONLN ); + + // Handle -1 and also sanitize the 0 value which wouldn't make sense + return result < 1 ? 1 : result; +} diff --git a/mt.hh b/mt.hh new file mode 100644 index 0000000..9f70c53 --- /dev/null +++ b/mt.hh @@ -0,0 +1,87 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#ifndef MT_HH_INCLUDED__ +#define MT_HH_INCLUDED__ + +#include +#include + +#include "nocopy.hh" + +/// Multithreading + +class Condition; + +class Mutex +{ + friend class Condition; + + pthread_mutex_t mutex; + +public: + + Mutex(); + + /// Please consider using the Lock class instead + void lock(); + + void unlock(); + + ~Mutex(); +}; + +class Lock: NoCopy +{ + Mutex * m; + +public: + + Lock( Mutex & mutex ): m( &mutex ) { m->lock(); } + + ~Lock() + { m->unlock(); } +}; + +/// Condition variable. Atomically unlocks the given mutex before it suspends +/// waiting for event, and upon the awakening reacquires it +class Condition +{ + pthread_cond_t cond; + +public: + + Condition(); + + void signal(); + + void broadcast(); + + /// Mutex must be locked on entrance + void wait( Mutex & m ); + + ~Condition(); +}; + +class Thread +{ +public: + void start(); + void detach(); + void * join(); + + virtual ~Thread() {} + +protected: + /// This is the function that is meant to work in a separate thread + virtual void * threadFunction() throw()=0; + +private: + pthread_t thread; + static void * __thread_routine( void * ); +}; + +/// Returns the number of CPUs this system has +size_t getNumberOfCpus(); + +#endif diff --git a/nocopy.hh b/nocopy.hh new file mode 100644 index 0000000..ba4ba44 --- /dev/null +++ b/nocopy.hh @@ -0,0 +1,19 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef NOCOPY_HH_INCLUDED__ +#define NOCOPY_HH_INCLUDED__ + +/// A simple class to disallow copying of the class objects. Inherit from it to +/// use it +class NoCopy +{ +public: + NoCopy() {} + +private: + NoCopy( NoCopy const & ); + NoCopy & operator = ( NoCopy const & ); +}; + +#endif // NOCOPY_HH diff --git a/objectcache.cc b/objectcache.cc new file mode 100644 index 0000000..e3059f3 --- /dev/null +++ b/objectcache.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include "objectcache.hh" + +ObjectCache::ObjectCache( unsigned maxSize_ ): maxObjects( maxSize_ ), + totalObjects( 0 ) +{ +} + +bool ObjectCache::remove( ObjectId const & id ) +{ + Objects tmp; + tmp.push_back( Object() ); + tmp.back().id = id; + ObjectMap::iterator i = objectMap.find( tmp.begin() ); + + if ( i == objectMap.end() ) + return false; + + // Make sure that in case a destructor raises an exception, the cache + // is left in a consistent state. + Reference * ref = (*i)->reference; + + objects.erase( *i ); + objectMap.erase( i ); + --totalObjects; + + delete ref; + + return true; +} + +void ObjectCache::clear() +{ + for ( Objects::iterator i = objects.begin(); i != objects.end(); ) + { + // Make sure that in case a destructor raises an exception, the cache + // is left in a consistent state. + Reference * ref = i->reference; + objectMap.erase( i ); + objects.erase( i++ ); + --totalObjects; + + delete ref; + } +} diff --git a/objectcache.hh b/objectcache.hh new file mode 100644 index 0000000..aafe40c --- /dev/null +++ b/objectcache.hh @@ -0,0 +1,127 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef OBJECTCACHE_HH_INCLUDED__ +#define OBJECTCACHE_HH_INCLUDED__ + +#include +#include +#include +#include +#include "sptr.hh" +#include "nocopy.hh" + +/// ObjectCache allows caching dynamically-allocated objects of any type. The +/// size of the cache is upper-bound and is specified at construction-time. +/// Newly added or recently found objects are placed to the top of the internal +/// stack. When there's no space in the cache, object become removed from the +/// bottom of it +class ObjectCache: NoCopy +{ +public: + ObjectCache( unsigned maxObjects ); + + /// Id of the object being stored in the cache + typedef std::string ObjectId; + + /// Returns a reference to the stored object with the given id, or creates + /// one if none existed. 
The caller must know the expected type of the object + /// and specify it explicitly + template< class T > + sptr< T > & entry( ObjectId const & ); + + /// Removes a stored object with the given id. Returns true if the object + /// was removed, false if it didn't exist in the cache + bool remove( ObjectId const & ); + + /// Deletes all the objects from cache + void clear(); + + ~ObjectCache() + { clear(); } + +private: + + /// Base class for a reference to an object being stored + struct Reference: NoCopy + { + virtual ~Reference() + {} + }; + + /// Having this class allows to delete T via virtual destructor accessible + /// from the base Reference class + template< class T > + struct ReferenceTo: public Reference + { + sptr< T > ref; + }; + + struct Object + { + ObjectId id; + Reference * reference; + }; + typedef std::list< Object > Objects; + + struct ObjectsIteratorComp + { + bool operator () ( Objects::iterator const & x, Objects::iterator const & y ) + { return x->id < y->id; } + }; + + typedef std::set< Objects::iterator, ObjectsIteratorComp > ObjectMap; + + unsigned maxObjects; + Objects objects; + unsigned totalObjects; + ObjectMap objectMap; + +}; + +template< class T > +sptr< T > & ObjectCache::entry( ObjectId const & id ) +{ + Objects tmp; + tmp.push_back( Object() ); + tmp.back().id = id; + + std::pair< ObjectMap::iterator, bool > r = objectMap.insert( tmp.begin() ); + + if ( r.second ) + { + // The object was created + + // Init the reference + ReferenceTo< T > * refTo = new ReferenceTo< T >(); + tmp.back().reference = refTo; + + // Add the object to top of our objects + objects.splice( objects.begin(), tmp ); + ++totalObjects; + + // evict an entry at the bottom, if needed + if ( totalObjects > maxObjects ) + { + Objects::iterator i = --objects.end(); + objectMap.erase( i ); + Reference * ref = i->reference; + objects.pop_back(); + --totalObjects; + + delete ref; // We expect that it may throw + } + + return refTo->ref; + } + else + { + // The object 
was existent + // Move it to the top + objects.splice( objects.begin(), objects, *r.first ); + + return dynamic_cast< ReferenceTo< T > & >( *objects.front().reference ).ref; + } +} + +#endif // OBJECTCACHE_HH diff --git a/page_size.cc b/page_size.cc new file mode 100644 index 0000000..b11f2a1 --- /dev/null +++ b/page_size.cc @@ -0,0 +1,16 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include "page_size.hh" + +#include + +unsigned getPageSize() +{ + static unsigned value = 0; + + if ( !value ) + value = sysconf( _SC_PAGESIZE ); + + return value; +} diff --git a/page_size.hh b/page_size.hh new file mode 100644 index 0000000..e426084 --- /dev/null +++ b/page_size.hh @@ -0,0 +1,10 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef PAGE_SIZE_HH_INCLUDED__ +#define PAGE_SIZE_HH_INCLUDED__ + +/// Returns the page size used by this system +unsigned getPageSize(); + +#endif diff --git a/random.cc b/random.cc new file mode 100644 index 0000000..ca67dd8 --- /dev/null +++ b/random.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include "random.hh" +#include + +namespace Random { +void genarateTrue( void * buf, unsigned size ) +{ + if ( RAND_bytes( (unsigned char *) buf, size ) != 1 ) + throw exCantGenerate(); +} + +void genaratePseudo( void * buf, unsigned size ) +{ + if ( RAND_pseudo_bytes( (unsigned char *) buf, size ) < 0 ) + throw exCantGenerate(); +} +} diff --git a/random.hh b/random.hh new file mode 100644 index 0000000..954f8bf --- /dev/null +++ b/random.hh @@ -0,0 +1,21 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#ifndef RANDOM_HH_INCLUDED__ +#define RANDOM_HH_INCLUDED__ + +#include + +#include "ex.hh" + +namespace Random { +DEF_EX( exCantGenerate, "Error generating random sequence, try later", std::exception ) + +/// This one fills the buffer with true randomness, suitable for a key +void genarateTrue( void * buf, unsigned size ); +/// This one fills the buffer with pseudo randomness, suitable for salts but not +/// keys +void genaratePseudo( void * buf, unsigned size ); +} + +#endif diff --git a/rolling_hash.cc b/rolling_hash.cc new file mode 100644 index 0000000..3de9231 --- /dev/null +++ b/rolling_hash.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include "rolling_hash.hh" + +RollingHash::RollingHash() +{ + reset(); +} + +void RollingHash::reset() +{ + count = 0; + factor = 0; + nextFactor = 1; + value = 0; +} + +RollingHash::Digest RollingHash::digest( void const * buf, unsigned size ) +{ + // TODO: this can be optimized, as in this case there's no need to calculate + // factor values. + RollingHash hash; + + for ( char const * p = ( char const * )buf; size--; ) + hash.rollIn( *p++ ); + + return hash.digest(); +} diff --git a/rolling_hash.hh b/rolling_hash.hh new file mode 100644 index 0000000..fb9b97a --- /dev/null +++ b/rolling_hash.hh @@ -0,0 +1,81 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef ROLLING_HASH_HH_INCLUDED__ +#define ROLLING_HASH_HH_INCLUDED__ + +#include +#include + +// Modified Rabin-Karp rolling hash with the base of 257 and the modulo of 2^64. + +// The canonical RK hash calculates the following value (e.g. for 4 bytes): + +// hash = ( v1*b^3 + v2*b^2 + v3*b + v4 ) % m +// where v1, v2, v3 and v4 are the sequence of bytes, b is the base and m +// is the modulo. 
+ +// We add b^4 in the mix: + +// hash = ( b^4 + v1*b^3 + v2*b^2 + v3*b + v4 ) % m + +// This fixes collisions where sequences only differ in the amount of zero +// bytes in the beginning (those amount to zero in the canonical RK), since the +// power of b in the first member depends on the total number of bytes in the +// sequence. + +// The choice of base: 257 is easy to multiply by (only two bits are set), and +// is the first prime larger than the value of any byte. It's easy to create +// collisions with the smaller primes: two-byte sequences '1, 0' and '0, base' +// would collide, for example. + +// The choice of modulo: 32-bit is impractical due to birthday paradox -- you +// get a collision with the 50% probability having only 77000 hashes. With +// 64-bit, the number of hashes to have the same probability would be 5.1 +// billion. With the block size of 64k, that would amount to 303 terabytes of +// data stored, which should be enough for our purposes. + +// Note: ( a = ( a << 8 ) + a ) is equivalent to ( a *= 257 ) + +class RollingHash +{ + uint64_t factor; + uint64_t nextFactor; + uint64_t value; + size_t count; + +public: + typedef uint64_t Digest; + + RollingHash(); + + void reset(); + + void rollIn( char c ) + { + factor = nextFactor; + nextFactor = ( nextFactor << 8 ) + nextFactor; // nextFactor *= 257 + value = ( value << 8 ) + value; + value += ( unsigned char ) c; + ++count; + } + + void rotate( char in, char out ) + { + value -= uint64_t( ( unsigned char ) out ) * factor; + value = ( value << 8 ) + value; // value *= 257 + value += ( unsigned char ) in; + } + + Digest digest() const + { + return value + nextFactor; + } + + size_t size() const + { return count; } + + static Digest digest( void const * buf, unsigned size ); +}; + +#endif diff --git a/sha256.cc b/sha256.cc new file mode 100644 index 0000000..0a8454d --- /dev/null +++ b/sha256.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include "sha256.hh" + +Sha256::Sha256() +{ + SHA256_Init( &ctx ); +} + +void Sha256::add( void const * data, size_t size ) +{ + SHA256_Update( &ctx, data, size ); +} + +void Sha256::finish( void * result ) +{ + SHA256_Final( ( unsigned char * ) result, &ctx ); +} + +string Sha256::finish() +{ + char buf[ Size ]; + finish( buf ); + + return string( buf, buf + sizeof( buf ) ); +} diff --git a/sha256.hh b/sha256.hh new file mode 100644 index 0000000..f4b8ef7 --- /dev/null +++ b/sha256.hh @@ -0,0 +1,37 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef SHA256_HH_INCLUDED__ +#define SHA256_HH_INCLUDED__ + +#include +#include +#include + +using std::string; + +/// A simple wrapper over openssl +class Sha256 +{ + SHA256_CTX ctx; +public: + + enum + { + // Number of bytes a digest has + Size = SHA256_DIGEST_LENGTH + }; + + Sha256(); + + /// Adds more data + void add( void const * data, size_t size ); + + /// Result should point at at least Size bytes + void finish( void * result ); + + /// Returns result as a string blob + string finish(); +}; + +#endif diff --git a/sptr.hh b/sptr.hh new file mode 100644 index 0000000..4668811 --- /dev/null +++ b/sptr.hh @@ -0,0 +1,156 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef SPTR_HH_INCLUDED__ +#define SPTR_HH_INCLUDED__ + +/// A generic non-intrusive smart-pointer template. We could use boost::, tr1:: +/// or whatever, but since there's no standard solution yet, it isn't worth +/// the dependency given the simplicity of the template + +template< class T > +class sptr_base +{ + template< class TT > friend class sptr_base; + + T * p; + unsigned * count; + + + void increment() + { + if ( count ) + ++*count; + } + +public: + + sptr_base(): p( 0 ), count( 0 ) {} + + sptr_base( T * p_ ): p( p_ ), count( p ? 
new unsigned( 1 ) : 0 ) + { + } + + sptr_base( sptr_base< T > const & other ): p( other.p ), count( other.count ) + { increment(); } + + // TT is meant to be a derivative of T + template< class TT > + sptr_base( sptr_base< TT > const & other ): p( ( T * ) other.p ), + count( other.count ) + { increment(); } + + void reset() + { + if ( count ) + { + if ( ! -- *count ) + { + delete count; + + count = 0; + + if ( p ) + { + T * p_ = p; + + p = 0; + + delete p_; + } + } + else + { + p = 0; + count = 0; + } + } + } + + unsigned use_count() const + { return count; } + + sptr_base & operator = ( sptr_base const & other ) + { if ( &other != this ) { reset(); p = other.p; count = other.count; increment(); } + return * this; } + + bool operator ! ( void ) const + { return !p; } + + bool operator == ( sptr_base const & other ) const + { return p == other.p; } + + bool operator != ( sptr_base const & other ) const + { return p != other.p; } + + ~sptr_base() + { reset(); } + +protected: + + T * get_base( void ) const + { return p; } +}; + +template< class T > +class sptr: public sptr_base< T > +{ +public: + + sptr() {} + + sptr( T * p ): sptr_base< T >( p ) {} + + // TT is meant to be a derivative of T + template< class TT > + sptr( sptr< TT > const & other ): sptr_base< T >( other ) {} + + // Retrieval + + T * get( void ) const + { return sptr_base< T > :: get_base(); } + + T * operator -> ( void ) const + { return get(); } + + T & operator * ( void ) const + { return * get(); } + + // Check + + operator bool( void ) const + { return get(); } + + bool operator ! 
( void ) const + { return !get(); } +}; + +template< class T > +class const_sptr: public sptr_base< T > +{ +public: + + const_sptr() {} + + const_sptr( T * p_ ): sptr_base< T >( p_ ) {} + + const_sptr( sptr< T > const & other ): sptr_base< T >( other ) {} + + // TT is meant to be a derivative of T + template< class TT > + const_sptr( sptr_base< TT > const & other ): sptr_base< T >( other ) {} + + // Retrieval + + T const * get( void ) const + { return sptr_base< T > :: get_base(); } + + T const * operator -> ( void ) const + { return get(); } + + T const & operator * ( void ) const + { return * get(); } +}; + + +#endif diff --git a/static_assert.hh b/static_assert.hh new file mode 100644 index 0000000..b12ce42 --- /dev/null +++ b/static_assert.hh @@ -0,0 +1,28 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef STATIC_ASSERT_HH_INCLUDED__ +#define STATIC_ASSERT_HH_INCLUDED__ + +// Based on the one from the Boost library. It wouldn't make sense to depend on +// boost just for that + +namespace StaticAssert { + +template < bool > +struct AssertionFailure; + +template <> +struct AssertionFailure< true > +{}; + +template< int > struct Test +{}; +} + +#define STATIC_ASSERT( B ) \ + typedef ::StaticAssert::Test< \ + sizeof( ::StaticAssert::AssertionFailure< bool( B ) > ) >\ + static_assert_typedef_ ## __LINE__ + +#endif diff --git a/storage_info_file.cc b/storage_info_file.cc new file mode 100644 index 0000000..eed8af2 --- /dev/null +++ b/storage_info_file.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include + +#include "encrypted_file.hh" +#include "message.hh" +#include "storage_info_file.hh" + +namespace StorageInfoFile { + +enum +{ + FileFormatVersion = 1 +}; + +void save( string const & fileName, StorageInfo const & storageInfo ) +{ + EncryptedFile::OutputStream os( fileName.c_str(), EncryptionKey::noKey(), + NULL ); + FileHeader header; + header.set_version( FileFormatVersion ); + Message::serialize( header, os ); + + Message::serialize( storageInfo, os ); + os.writeAdler32(); +} + +void load( string const & fileName, StorageInfo & storageInfo ) +{ + EncryptedFile::InputStream is( fileName.c_str(), EncryptionKey::noKey(), + NULL ); + FileHeader header; + Message::parse( header, is ); + if ( header.version() != FileFormatVersion ) + throw exUnsupportedVersion(); + + Message::parse( storageInfo, is ); + is.checkAdler32(); +} + +} diff --git a/storage_info_file.hh b/storage_info_file.hh new file mode 100644 index 0000000..35c6386 --- /dev/null +++ b/storage_info_file.hh @@ -0,0 +1,28 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#ifndef STORAGE_INFO_FILE_HH_INCLUDED__ +#define STORAGE_INFO_FILE_HH_INCLUDED__ + +#include +#include + +#include "encryption_key.hh" +#include "ex.hh" +#include "zbackup.pb.h" + +namespace StorageInfoFile { + +using std::string; + +DEF_EX( Ex, "Storage info file exception", std::exception ) +DEF_EX( exUnsupportedVersion, "Unsupported version of the storage info file format", Ex ) + +/// Saves the given StorageInfo data into the given file +void save( string const & fileName, StorageInfo const & ); + +/// Loads the given StorageInfo data from the given file +void load( string const & fileName, StorageInfo & ); +} + +#endif diff --git a/tartool/CMakeLists.txt b/tartool/CMakeLists.txt new file mode 100644 index 0000000..e2c4b96 --- /dev/null +++ b/tartool/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright (c) 2012-2013 Konstantin Isakov +# Part of ZBackup. Licensed under GNU GPLv2 or later + +cmake_minimum_required( VERSION 2.6.0 ) +project( tartool ) + +set( CMAKE_BUILD_TYPE Release ) + +add_executable( tartool tartool.cc ../file.cc ../dir.cc ) + +install( TARGETS tartool DESTINATION bin ) diff --git a/tartool/tartool.cc b/tartool/tartool.cc new file mode 100644 index 0000000..ad0db91 --- /dev/null +++ b/tartool/tartool.cc @@ -0,0 +1,192 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include +#include +#include +#include +#include + +#include "../dir.hh" +#include "../file.hh" + +using std::string; +using std::vector; +using std::map; + +void mention( File & file, string const & path ) +{ + file.write( path.data(), path.size() ); + file.write( '\n' ); +} + +bool startsWith( string const & s, char const * prefix ) +{ + for ( char const * sPtr = s.c_str(), * pPtr = prefix; *pPtr; ++sPtr, ++pPtr ) + if ( *sPtr != *pPtr ) + return false; + + return true; +} + +void scanDirIgnoringErrors( string const & path, File & includes, File & excludes, + bool currentlyIncluded ); + +void scanDir( string const & path, File & includes, File & excludes, + bool currentlyIncluded ) +{ + Dir::Entry entry; + + vector< string > subdirs; + vector< string > namedIncludes, namedExcludes; + typedef map< string, bool > FileList; + FileList fileList; + bool doBackup = false; + bool dontBackup = false; + + for ( Dir::Listing dir( path ); dir.getNext( entry ); ) + { + string const & fileName = entry.getFileName(); + + if ( entry.isDir() ) + { + if ( !entry.isSymLink() ) + subdirs.push_back( fileName ); + } + else + if ( fileName == ".backup" ) + doBackup = true; + if ( fileName == ".no-backup" ) + dontBackup = true; + else + if ( startsWith( fileName, ".backup-" ) ) + namedIncludes.push_back( fileName.substr( 8 ) ); + else + if ( startsWith( fileName, ".no-backup-" ) ) + namedExcludes.push_back( fileName.substr( 11 ) ); + } + + // If both are mentioned, backup + if ( doBackup ) + dontBackup = false; + + if ( doBackup && !currentlyIncluded ) + { + mention( includes, path ); + currentlyIncluded = true; + } + + if ( dontBackup && currentlyIncluded ) + { + mention( excludes, path ); + currentlyIncluded = false; + } + + // If we have any effective named lists, build the fileList map and process + // them. 
+ if ( ( !currentlyIncluded && !namedIncludes.empty() ) || + ( currentlyIncluded && !namedExcludes.empty() ) ) + { + for ( Dir::Listing dir( path ); dir.getNext( entry ); ) + fileList[ entry.getFileName() ] = entry.isDir() && !entry.isSymLink(); + + if ( !currentlyIncluded ) + { + for ( vector< string > :: const_iterator i = namedIncludes.begin(); + i != namedIncludes.end(); ++i ) + { + FileList::iterator entry = fileList.find( *i ); + + if ( entry != fileList.end() ) + { + mention( includes, Dir::addPath( path, *i ) ); + + if ( entry->second ) // Is it a dir? Scan it then. + scanDir( Dir::addPath( path, entry->first ), includes, excludes, + true ); + + // Make sure we don't process it twice. + fileList.erase( entry ); + } + else + fprintf( stderr, "Warning: named include %s does not exist in %s\n", + i->c_str(), path.c_str() ); + } + } + else + { + for ( vector< string > :: const_iterator i = namedExcludes.begin(); + i != namedExcludes.end(); ++i ) + { + FileList::iterator entry = fileList.find( *i ); + + if ( entry != fileList.end() ) + { + mention( excludes, Dir::addPath( path, *i ) ); + + if ( entry->second ) // Is it a dir? Scan it then. + scanDir( Dir::addPath( path, entry->first ), includes, excludes, + false ); + + // Make sure we don't process it twice. 
+ fileList.erase( entry ); + } + else + fprintf( stderr, "Warning: named exclude %s does not exist in %s\n", + i->c_str(), path.c_str() ); + } + } + + // Scan the rest of dirs + for ( FileList::const_iterator i = fileList.begin(); i != fileList.end(); + ++i ) + if ( i->second ) + scanDirIgnoringErrors( Dir::addPath( path, i->first ), includes, + excludes, currentlyIncluded ); + } + else + { + // No named lists -- just process all the dirs + for ( size_t x = 0; x < subdirs.size(); ++x ) + scanDirIgnoringErrors( Dir::addPath( path, subdirs[ x ] ), includes, + excludes, currentlyIncluded ); + } +} + +void scanDirIgnoringErrors( string const & path, File & includes, File & excludes, + bool currentlyIncluded ) +{ + try + { + scanDir( path, includes, excludes, currentlyIncluded ); + } + catch( Dir::exCantList & e ) + { + fprintf( stderr, "Warning: %s\n", e.what() ); + } +} + +int main( int argc, char *argv[] ) +{ + if ( argc != 4 ) + { + fprintf( stderr, "Usage: %s \n", *argv ); + return EXIT_FAILURE; + } + + try + { + File includes( argv[ 2 ], File::WriteOnly ); + File excludes( argv[ 3 ], File::WriteOnly ); + + scanDir( argv[ 1 ], includes, excludes, false ); + + return EXIT_SUCCESS; + } + catch( std::exception & e ) + { + fprintf( stderr, "Error: %s\n", e.what() ); + + return EXIT_FAILURE; + } +} diff --git a/tests/TODO.txt b/tests/TODO.txt new file mode 100644 index 0000000..e15efb2 --- /dev/null +++ b/tests/TODO.txt @@ -0,0 +1 @@ +Convert those to cmake -- they still use qmake at the moment diff --git a/tests/encrypted_file/encrypted_file.pro b/tests/encrypted_file/encrypted_file.pro new file mode 100644 index 0000000..1db357c --- /dev/null +++ b/tests/encrypted_file/encrypted_file.pro @@ -0,0 +1,38 @@ +###################################################################### +# Automatically generated by qmake (2.01a) Sun Jul 14 20:54:52 2013 +###################################################################### + +TEMPLATE = app +TARGET = +DEPENDPATH += . 
+INCLUDEPATH += . + +LIBS += -lcrypto -lprotobuf -lz +DEFINES += __STDC_FORMAT_MACROS + +# Input +SOURCES += test_encrypted_file.cc \ + ../../unbuffered_file.cc \ + ../../tmp_mgr.cc \ + ../../page_size.cc \ + ../../random.cc \ + ../../encryption_key.cc \ + ../../encryption.cc \ + ../../encrypted_file.cc \ + ../../file.cc \ + ../../dir.cc \ + ../../zbackup.pb.cc + +HEADERS += \ + ../../unbuffered_file.hh \ + ../../tmp_mgr.hh \ + ../../adler32.hh \ + ../../page_size.hh \ + ../../random.hh \ + ../../encryption_key.hh \ + ../../encrypted_file.hh \ + ../../encryption.hh \ + ../../ex.hh \ + ../../file.hh \ + ../../dir.hh \ + ../../zbackup.pb.h diff --git a/tests/encrypted_file/test_encrypted_file.cc b/tests/encrypted_file/test_encrypted_file.cc new file mode 100644 index 0000000..e41bb16 --- /dev/null +++ b/tests/encrypted_file/test_encrypted_file.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include +#include "../../encrypted_file.hh" +#include "../../encryption_key.hh" +#include "../../random.hh" +#include "../../tmp_mgr.hh" +#include "../../check.hh" +#include "../../adler32.hh" + +char rnd[ 16384 ]; + +Adler32::Value adler( int sz ) +{ + Adler32 a; + a.add( rnd, sz ); + return a.result(); +} + +void readAndWrite( EncryptionKey const & key, bool writeBackups, + bool readBackups, bool readSkips ) +{ + TmpMgr tmpMgr( "/dev/shm" ); + + sptr< TemporaryFile > tempFile = tmpMgr.makeTemporaryFile(); + + int fileSize = rand() % ( sizeof( rnd ) + 1 ); + + fprintf( stderr, "Run with %d bytes, %s%s%s%sfile %s...\n", fileSize, + key.hasKey() ? "" : "no encryption, ", + writeBackups ? "write backups, " : "", + readBackups ? "read backups, " : "", + readSkips ? 
"read skips, " : "", + tempFile->getFileName().c_str() ); + + char iv[ Encryption::IvSize ]; + + Random::genaratePseudo( iv, sizeof( iv ) ); + + // Write + { + EncryptedFile::OutputStream out( tempFile->getFileName().c_str(), key, iv ); + + char const * next = rnd; + + int avail = 0; + for ( int left = fileSize; left; ) + { + CHECK( out.ByteCount() == fileSize - left, "Incorrect bytecount in the " + "middle of writing" ); + void * data; + CHECK( out.Next( &data, &avail ), "out.Next() returned false" ); + CHECK( avail > 0, "out.Next() returned zero size" ); + + bool doBackup = writeBackups && ( rand() & 1 ); + int backup; + if ( doBackup ) + { + backup = rand() % ( avail + 1 ); + // Make sure we don't back up and then need to back up again to finish + // the write + if ( avail > left ) + backup = avail - left; + avail -= backup; + } + + int toWrite = avail > left ? left : avail; + memcpy( data, next, toWrite ); + + if ( doBackup ) + out.BackUp( backup ); + + next += toWrite; + left -= toWrite; + avail -= toWrite; + + if ( !avail && ( rand() & 1 ) ) + { + CHECK( adler( next - rnd ) == out.getAdler32(), + "bad adler32 in the middle of writing" ); + } + } + + if ( avail || ( rand() & 1 ) ) + out.BackUp( avail ); + + CHECK( out.ByteCount() == fileSize, "Incorrect bytecount after writing" ); + + if ( rand() & 1 ) + { + CHECK( adler( fileSize ) == out.getAdler32(), + "bad adler32 of the written file" ); + } + } + + // Read back + { + EncryptedFile::InputStream in( tempFile->getFileName().c_str(), key, iv ); + + char const * next = rnd; + + void const * data; + int avail = 0; + for ( int left = fileSize; left; ) + { + if ( readSkips && ( rand() & 1 ) ) + { + int toSkip = rand() % ( left + 1 ); + in.Skip( toSkip ); + next += toSkip; + left -= toSkip; + avail = 0; + continue; + } + + CHECK( in.ByteCount() == fileSize - left, "Incorrect bytecount in the " + "middle of reading" ); + CHECK( in.Next( &data, &avail ), "file ended while %d were still left", + left ); + CHECK( 
avail > 0, "in.Next() returned zero size" ); + + bool doBackup = readBackups && ( rand() & 1 ); + int backup; + if ( doBackup ) + { + backup = rand() % ( avail + 1 ); + avail -= backup; + } + + int toRead = avail > left ? left : avail; + + CHECK( memcmp( next, data, toRead ) == 0, "Different bytes read than " + "expected at offset %d", int( next - rnd ) ); + + if ( doBackup ) + in.BackUp( backup ); + + next += toRead; + left -= toRead; + avail -= toRead; + + if ( !avail && ( rand() & 1 ) ) + { + CHECK( adler( next - rnd ) == in.getAdler32(), + "bad adler32 in the middle of the reading" ); + } + } + + CHECK( in.ByteCount() == fileSize, "Incorrect bytecount after reading" ); + + CHECK( !avail, "at least %d bytes still available", avail ); + CHECK( !in.Next( &data, &avail ), "file should have ended but resulted in " + "%d more bytes", avail ); + if ( rand() & 1 ) + { + CHECK( adler( fileSize ) == in.getAdler32(), + "bad adler32 of the read file" ); + } + } +} + +int main() +{ + Random::genaratePseudo( rnd, sizeof( rnd ) ); + EncryptionKeyInfo keyInfo; + EncryptionKey::generate( "blah", keyInfo ); + EncryptionKey key( "blah", &keyInfo ); + EncryptionKey noKey( std::string(), NULL ); + + for ( size_t iteration = 100000; iteration--; ) + readAndWrite( ( rand() & 1 ) ? key : noKey, rand() & 1, rand() & 1, + rand() & 1 ); +} diff --git a/tests/rolling_hash/rolling_hash.pro b/tests/rolling_hash/rolling_hash.pro new file mode 100644 index 0000000..cdacd64 --- /dev/null +++ b/tests/rolling_hash/rolling_hash.pro @@ -0,0 +1,16 @@ +###################################################################### +# Automatically generated by qmake (2.01a) ?? ???. 8 14:05:16 2012 +###################################################################### + +TEMPLATE = app +TARGET = +DEPENDPATH += . +INCLUDEPATH += . 
+LIBS += -lcrypto + +# Input +SOURCES += test_rolling_hash.cc ../../rolling_hash.cc \ + ../../random.cc + +HEADERS += \ + ../../random.hh diff --git a/tests/rolling_hash/test_rolling_hash.cc b/tests/rolling_hash/test_rolling_hash.cc new file mode 100644 index 0000000..e766ba0 --- /dev/null +++ b/tests/rolling_hash/test_rolling_hash.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#include +#include +#include +#include +#include +#include +#include "../../rolling_hash.hh" +#include "../../random.hh" + +using std::vector; +using std::map; +using std::set; +using std::pair; +using std::make_pair; + +int main() +{ + // Generate a buffer with random data, then pick slices there and try + // different strategies of rolling to them + vector< char > data( 65536 ); + + Random::genaratePseudo( data.data(), data.size() ); + + for ( unsigned iteration = 0; iteration < 5000; ++iteration ) + { + unsigned sliceBegin = rand() % data.size(); + unsigned sliceSize = 1 + ( rand() % ( data.size() - sliceBegin ) ); + + // Calculate the hash by roll-ins only + uint64_t rollIns; + { + RollingHash hash; + + for ( unsigned x = 0; x < sliceSize; ++x ) + hash.rollIn( data[ sliceBegin + x ] ); + + rollIns = hash.digest(); + } + + // Calculate the hash by rolling-in from the beginning of data to sliceSize, + // then rotating to sliceBegin + + uint64_t rotates; + { + RollingHash hash; + + for ( unsigned x = 0; x < sliceSize; ++x ) + hash.rollIn( data[ x ] ); + + for ( unsigned x = 0; x < sliceBegin; ++x ) + hash.rotate( data[ sliceSize + x ], data[ x ] ); + + rotates = hash.digest(); + } + + if ( rollIns != rotates ) + { + fprintf( stderr, "Error in iteration %u: %016lx vs %016lx\n", + iteration, rollIns, rotates ); + + return EXIT_FAILURE; + } + + printf( "Iteration %u: %016lx\n", iteration, rollIns ); + } + fprintf( stderr, "Rolling hash test produced equal results\n" ); + + // Test collisions + + // Maps the hash to 
the ranges. Ideally each hash should be mapped to a + // single range + map< uint64_t, set< pair< unsigned, unsigned > > > collisions; + size_t collisionValuesCount = 0; + + for ( unsigned iteration = 0; iteration < 500000; ++iteration ) + { + unsigned sliceBegin = rand() % ( data.size() - 7 ); + // A minimum of 16 should be enough to ensure every unique slice corresponds + // to a unique random sequence with a very high probability + unsigned sliceSize = 16 + ( rand() % ( data.size() - sliceBegin ) ); + + // Calculate the hash by roll-ins (fastest) + uint64_t rollIns; + { + RollingHash hash; + + for ( unsigned x = 0; x < sliceSize; ++x ) + hash.rollIn( data[ sliceBegin + x ] ); + + rollIns = hash.digest(); + } + + if ( collisions[ rollIns ].insert( make_pair( sliceBegin, sliceSize ) ).second ) + ++collisionValuesCount; + + if ( ! ( ( iteration + 1 ) % 1000 ) ) + printf( "Iteration %u: %016lx\n", iteration, rollIns ); + } + + size_t collisionsFound = collisionValuesCount - collisions.size(); + double collisionsPercentage = double( collisionsFound ) * 100 / + collisionValuesCount; + + fprintf( stderr, "Collisions: %.04f%% (%zu in %zu)\n", collisionsPercentage, + collisionsFound, collisionValuesCount ); + + if ( collisionsFound ) + { + // The probability of a collision in 500000 hashes is one to ~6 billions + fprintf( stderr, "Found a collision, which should be highly unlikely\n" ); + return EXIT_FAILURE; + } + + fprintf( stderr, "Rolling hash test succeeded\n" ); + + return EXIT_SUCCESS; +} diff --git a/tmp_mgr.cc b/tmp_mgr.cc new file mode 100644 index 0000000..6106818 --- /dev/null +++ b/tmp_mgr.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include "tmp_mgr.hh" + +#include +#include +#include "dir.hh" +#include "file.hh" + +TemporaryFile::TemporaryFile( string const & fileName ): fileName( fileName ) +{ +} + +void TemporaryFile::moveOverTo( string const & destinationFileName, + bool mayOverwrite ) +{ + if ( !mayOverwrite && File::exists( destinationFileName ) ) + throw TmpMgr::exWontOverwrite( destinationFileName ); + + File::rename( fileName, destinationFileName ); + fileName.clear(); +} + +TemporaryFile::~TemporaryFile() +{ + if ( !fileName.empty() ) + File::erase( fileName ); +} + +string const & TemporaryFile::getFileName() const +{ + return fileName; +} + +TmpMgr::TmpMgr( string const & path ): path( path ) +{ + if ( !Dir::exists( path ) ) + Dir::create( path ); +} + +sptr< TemporaryFile > TmpMgr::makeTemporaryFile() +{ + string name( Dir::addPath( path, "XXXXXX") ); + + int fd = mkstemp( &name[ 0 ] ); + + if ( fd == -1 || close( fd ) != 0 ) + throw exCantCreate( path ); + + return new TemporaryFile( name ); +} + +TmpMgr::~TmpMgr() +{ + try + { + Dir::remove( path ); + } + catch( Dir::exCantRemove & ) + { + } +} diff --git a/tmp_mgr.hh b/tmp_mgr.hh new file mode 100644 index 0000000..da7db43 --- /dev/null +++ b/tmp_mgr.hh @@ -0,0 +1,62 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#ifndef TMP_MGR_HH_INCLUDED__ +#define TMP_MGR_HH_INCLUDED__ + +#include +#include + +#include "dir.hh" +#include "ex.hh" +#include "file.hh" +#include "nocopy.hh" +#include "sptr.hh" + +/// A temporary file +class TemporaryFile: NoCopy +{ +public: + /// Returns the temporary file's file name. The file may already be existent - + /// it is supposed to be overwritten then + string const & getFileName() const; + /// Renames this temporary file over the given file name. 
If the destination + /// file exists already, it gets replaced if mayOverwrite is true, or throws + /// an exception otherwise + void moveOverTo( string const & destinationFileName, bool mayOverwrite = false ); + /// Removes the file from the disk, unless moveOverTo() was called previously + ~TemporaryFile(); + +private: + /// Use TmpMgr::makeTemporaryFile() instead of this constructor + TemporaryFile( string const & fileName ); + + string fileName; + + friend class TmpMgr; +}; + +/// Allows creating temporary files and later either removing them or moving +/// them over to the target ones +class TmpMgr: NoCopy +{ + string path; +public: + + DEF_EX( Ex, "Temporary file manager exception", std::exception ) + DEF_EX_STR( exCantCreate, "Can't create a temporary file in dir", Ex ) + DEF_EX_STR( exWontOverwrite, "Won't overwrite existing file", Ex ) + + /// Creates the given directory if it doesn't exist already and uses it to + /// store temporary files. + TmpMgr( string const & path ); + + /// Creates an new empty temporary file and returns its full file name, + /// including the path. The file is then supposed to be overwritten + sptr< TemporaryFile > makeTemporaryFile(); + + /// Removes the temporary directory, if possible + ~TmpMgr(); +}; + +#endif diff --git a/unbuffered_file.cc b/unbuffered_file.cc new file mode 100644 index 0000000..c9737b9 --- /dev/null +++ b/unbuffered_file.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +#define _LARGEFILE64_SOURCE + +#include +#include +#include +#include +#include + +#include "check.hh" +#include "unbuffered_file.hh" + +UnbufferedFile::UnbufferedFile( char const * fileName, Mode mode ) + throw( exCantOpen ) +{ + int flags = O_LARGEFILE | + ( mode == WriteOnly ? 
( O_WRONLY | O_CREAT | O_TRUNC ) : + O_RDONLY ); + fd = open( fileName, flags, 0666 ); + if ( fd < 0 ) + throw exCantOpen( fileName ); +} + +size_t UnbufferedFile::read( void * buf, size_t size ) + throw( exReadError ) +{ + char * next = ( char * ) buf; + size_t left = size; + + while( left ) + { + ssize_t rd = ::read( fd, next, left ); + if ( rd < 0 ) + { + if ( errno != EINTR ) + throw exReadError(); + } + else + if ( rd > 0 ) + { + CHECK( ( size_t ) rd <= left, "read too many bytes from a file" ); + next += rd; + left -= rd; + } + else + break; + } + + return size - left; +} + +void UnbufferedFile::write( void const * buf, size_t size ) + throw( exWriteError ) +{ + char const * next = ( char const * ) buf; + size_t left = size; + + while( left ) + { + ssize_t written = ::write( fd, next, left ); + if ( written < 0 ) + { + if ( errno != EINTR ) + throw exWriteError(); + } + else + { + CHECK( ( size_t ) written <= left, "wrote too many bytes to a file" ); + next += written; + left -= written; + } + } +} + +UnbufferedFile::Offset UnbufferedFile::size() throw( exSeekError ) +{ + Offset cur = lseek64( fd, 0, SEEK_CUR ); + if ( cur < 0 ) + throw exSeekError(); + Offset result = lseek64( fd, 0, SEEK_END ); + if ( result < 0 || lseek64( fd, cur, SEEK_SET ) < 0 ) + throw exSeekError(); + return result; +} + +void UnbufferedFile::seekCur( Offset offset ) throw( exSeekError ) +{ + if ( lseek64( fd, offset, SEEK_CUR ) < 0 ) + throw exSeekError(); +} + +UnbufferedFile::~UnbufferedFile() throw() +{ + close( fd ); +} diff --git a/unbuffered_file.hh b/unbuffered_file.hh new file mode 100644 index 0000000..a8b65e2 --- /dev/null +++ b/unbuffered_file.hh @@ -0,0 +1,62 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#ifndef UNBUFFERED_FILE_HH_INCLUDED__ +#define UNBUFFERED_FILE_HH_INCLUDED__ + +#include +#include +#include +#include + +#include "ex.hh" +#include "nocopy.hh" + +/// A file which does not employ its own buffering. +/// TODO: add support for memory-mapped I/O, with the interface which would look +/// like that of a zero-copy stream. However, since we can do encryption in- +/// place, both interfaces should be available - when there's no memory-mapped +/// I/O available, the user should still provide its own buffer (and then do +/// in-place encryption in it). +class UnbufferedFile: NoCopy +{ +public: + + DEF_EX( Ex, "Unbuffered file exception", std::exception ) + DEF_EX_STR( exCantOpen, "Can't open file", Ex ) + DEF_EX( exReadError, "File read error", Ex ) + DEF_EX( exWriteError, "File write error", Ex ) + DEF_EX( exSeekError, "File seek error", Ex ) + + enum Mode + { + ReadOnly, + WriteOnly + }; + + typedef int64_t Offset; + + /// Opens the given file + UnbufferedFile( char const * fileName, Mode ) throw( exCantOpen ); + + /// Reads up to 'size' bytes into the buffer. Returns the number of bytes + /// read. If the value returned is less than the 'size' provided, the end of + /// file was reached + size_t read( void * buf, size_t size ) throw( exReadError ); + + /// Writes 'size' bytes + void write( void const * buf, size_t size ) throw( exWriteError ); + + /// Returns file size + Offset size() throw( exSeekError ); + + /// Seeks to the given offset, relative to the current file offset + void seekCur( Offset ) throw( exSeekError ); + + ~UnbufferedFile() throw(); + +private: + int fd; +}; + +#endif diff --git a/zbackup.cc b/zbackup.cc new file mode 100644 index 0000000..9f3a36d --- /dev/null +++ b/zbackup.cc @@ -0,0 +1,461 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "backup_creator.hh" +#include "backup_file.hh" +#include "backup_restorer.hh" +#include "debug.hh" +#include "dir.hh" +#include "encryption_key.hh" +#include "ex.hh" +#include "file.hh" +#include "mt.hh" +#include "sha256.hh" +#include "sptr.hh" +#include "storage_info_file.hh" +#include "zbackup.hh" + +using std::vector; + +Paths::Paths( string const & storageDir ): storageDir( storageDir ) +{ +} + +string Paths::getTmpPath() +{ + return string( Dir::addPath( storageDir, "tmp" ) ); +} + +string Paths::getBundlesPath() +{ + return string( Dir::addPath( storageDir, "bundles" ) ); +} + +string Paths::getStorageInfoPath() +{ + return string( Dir::addPath( storageDir, "info" ) ); +} + +string Paths::getIndexPath() +{ + return string( Dir::addPath( storageDir, "index" ) ); +} + +string Paths::getBackupsPath() +{ + return string( Dir::addPath( storageDir, "backups" ) ); +} + +ZBackupBase::ZBackupBase( string const & storageDir, string const & password ): + Paths( storageDir ), storageInfo( loadStorageInfo() ), + encryptionkey( password, storageInfo.has_encryption_key() ? 
+ &storageInfo.encryption_key() : 0 ), + tmpMgr( getTmpPath() ), + chunkIndex( encryptionkey, tmpMgr, getIndexPath() ) +{ +} + +StorageInfo ZBackupBase::loadStorageInfo() +{ + StorageInfo storageInfo; + + StorageInfoFile::load( getStorageInfoPath(), storageInfo ); + + return storageInfo; +} + +void ZBackupBase::initStorage( string const & storageDir, + string const & password, + bool isEncrypted ) +{ + StorageInfo storageInfo; + // TODO: make the following configurable + storageInfo.set_chunk_max_size( 65536 ); + storageInfo.set_bundle_max_payload_size( 0x200000 ); + + if ( isEncrypted ) + EncryptionKey::generate( password, + *storageInfo.mutable_encryption_key() ); + + Paths paths( storageDir ); + + if ( !Dir::exists( storageDir ) ) + Dir::create( storageDir ); + + if ( !Dir::exists( paths.getBundlesPath() ) ) + Dir::create( paths.getBundlesPath() ); + + if ( !Dir::exists( paths.getBackupsPath() ) ) + Dir::create( paths.getBackupsPath() ); + + if ( !Dir::exists( paths.getIndexPath() ) ) + Dir::create( paths.getIndexPath() ); + + string storageInfoPath( paths.getStorageInfoPath() ); + + if ( File::exists( storageInfoPath ) ) + throw exWontOverwrite( storageInfoPath ); + + StorageInfoFile::save( storageInfoPath, storageInfo ); +} + +string ZBackupBase::deriveStorageDirFromBackupsFile( string const & + backupsFile ) +{ + // TODO: handle cases when there's a backup/ folder within the backup/ folder + // correctly + string realPath = Dir::getRealPath( Dir::getDirName( backupsFile ) ); + size_t pos; + if ( realPath.size() >= 8 && strcmp( realPath.c_str() + realPath.size() - 8, + "/backups") == 0 ) + pos = realPath.size() - 8; + else + pos = realPath.rfind( "/backups/" ); + if ( pos == string::npos ) + throw exCantDeriveStorageDir( backupsFile ); + else + return realPath.substr( 0, pos ); +} + +ZBackup::ZBackup( string const & storageDir, string const & password, + size_t threads ): + ZBackupBase( storageDir, password ), + chunkStorageWriter( storageInfo, encryptionkey, 
tmpMgr, chunkIndex, + getBundlesPath(), getIndexPath(), threads ) +{ +} + +void ZBackup::backupFromStdin( string const & outputFileName ) +{ + if ( isatty( fileno( stdin ) ) ) + throw exWontReadFromTerminal(); + + if ( File::exists( outputFileName ) ) + throw exWontOverwrite( outputFileName ); + + Sha256 sha256; + BackupCreator backupCreator( storageInfo, chunkIndex, chunkStorageWriter ); + + time_t startTime = time( 0 ); + uint64_t totalDataSize = 0; + + for ( ; ; ) + { + size_t toRead = backupCreator.getInputBufferSize(); +// dPrintf( "Reading up to %u bytes from stdin\n", toRead ); + + void * inputBuffer = backupCreator.getInputBuffer(); + size_t rd = fread( inputBuffer, 1, toRead, stdin ); + + if ( !rd ) + { + if ( feof( stdin ) ) + { + dPrintf( "No more input on stdin\n" ); + break; + } + else + throw exStdinError(); + } + + sha256.add( inputBuffer, rd ); + + backupCreator.handleMoreData( rd ); + + totalDataSize += rd; + } + + // Finish up with the creator + backupCreator.finish(); + + string serialized; + backupCreator.getBackupData( serialized ); + + BackupInfo info; + + info.set_sha256( sha256.finish() ); + info.set_size( totalDataSize ); + + // Shrink the serialized data iteratively until it wouldn't shrink anymore + for ( ; ; ) + { + BackupCreator backupCreator( storageInfo, chunkIndex, chunkStorageWriter ); + char const * ptr = serialized.data(); + size_t left = serialized.size(); + while( left ) + { + size_t bufferSize = backupCreator.getInputBufferSize(); + size_t toCopy = bufferSize > left ? 
left : bufferSize; + + memcpy( backupCreator.getInputBuffer(), ptr, toCopy ); + backupCreator.handleMoreData( toCopy ); + ptr += toCopy; + left -= toCopy; + } + + backupCreator.finish(); + + string newGen; + backupCreator.getBackupData( newGen ); + + if ( newGen.size() < serialized.size() ) + { + serialized.swap( newGen ); + info.set_iterations( info.iterations() + 1 ); + } + else + break; + } + + dPrintf( "Iterations: %u\n", info.iterations() ); + + info.mutable_backup_data()->swap( serialized ); + + info.set_time( time( 0 ) - startTime ); + + // Commit the bundles to the disk before creating the final output file + chunkStorageWriter.commit(); + + // Now save the resulting BackupInfo + + sptr< TemporaryFile > tmpFile = tmpMgr.makeTemporaryFile(); + BackupFile::save( tmpFile->getFileName(), encryptionkey, info ); + tmpFile->moveOverTo( outputFileName ); +} + +ZRestore::ZRestore( string const & storageDir, string const & password, + size_t cacheSize ): + ZBackupBase( storageDir, password ), + chunkStorageReader( storageInfo, encryptionkey, chunkIndex, getBundlesPath(), + cacheSize ) +{ +} + +void ZRestore::restoreToStdin( string const & inputFileName ) +{ + if ( isatty( fileno( stdout ) ) ) + throw exWontWriteToTerminal(); + + BackupInfo backupInfo; + + BackupFile::load( inputFileName, encryptionkey, backupInfo ); + + string backupData; + + // Perform the iterations needed to get to the actual user backup data + for ( ; ; ) + { + backupData.swap( *backupInfo.mutable_backup_data() ); + + if ( backupInfo.iterations() ) + { + struct StringWriter: public DataSink + { + string result; + + virtual void saveData( void const * data, size_t size ) + { + result.append( ( char const * ) data, size ); + } + } stringWriter; + + BackupRestorer::restore( chunkStorageReader, backupData, stringWriter ); + backupInfo.mutable_backup_data()->swap( stringWriter.result ); + backupInfo.set_iterations( backupInfo.iterations() - 1 ); + } + else + break; + } + + struct StdoutWriter: public 
DataSink + { + Sha256 sha256; + + virtual void saveData( void const * data, size_t size ) + { + sha256.add( data, size ); + if ( fwrite( data, size, 1, stdout ) != 1 ) + throw exStdoutError(); + } + } stdoutWriter; + + BackupRestorer::restore( chunkStorageReader, backupData, stdoutWriter ); + + if ( stdoutWriter.sha256.finish() != backupInfo.sha256() ) + throw exChecksumError(); +} + +DEF_EX( exNonEncryptedWithKey, "--non-encrypted and --password-file are incompatible", std::exception ) +DEF_EX( exSpecifyEncryptionOptions, "Specify either --password-file or --non-encrypted", std::exception ) +DEF_EX_STR( exInvalidThreadsValue, "Invalid threads value specified:", std::exception ) + +int main( int argc, char *argv[] ) +{ + try + { + char const * passwordFile = 0; + bool nonEncrypted = false; + size_t const defaultThreads = getNumberOfCpus(); + size_t threads = defaultThreads; + size_t const defaultCacheSizeMb = 40; + size_t cacheSizeMb = defaultCacheSizeMb; + vector< char const * > args; + + for( int x = 1; x < argc; ++x ) + { + if ( strcmp( argv[ x ], "--password-file" ) == 0 && x + 1 < argc ) + { + passwordFile = argv[ x + 1 ]; + ++x; + } + else + if ( strcmp( argv[ x ], "--non-encrypted" ) == 0 ) + nonEncrypted = true; + else + if ( strcmp( argv[ x ], "--silent" ) == 0 ) + verboseMode = false; + else + if ( strcmp( argv[ x ], "--threads" ) == 0 && x + 1 < argc ) + { + int n; + if ( sscanf( argv[ x + 1 ], "%zu %n", &threads, &n ) != 1 || + argv[ x + 1 ][ n ] || threads < 1 ) + throw exInvalidThreadsValue( argv[ x + 1 ] ); + ++x; + } + else + if ( strcmp( argv[ x ], "--cache-size" ) == 0 && x + 1 < argc ) + { + char suffix[ 16 ]; + int n; + if ( sscanf( argv[ x + 1 ], "%zu %15s %n", + &cacheSizeMb, suffix, &n ) == 2 && !argv[ x + 1 ][ n ] ) + { + // Check the suffix + for ( char * c = suffix; *c; ++c ) + *c = tolower( *c ); + + if ( strcmp( suffix, "mb" ) != 0 ) + { + fprintf( stderr, "Invalid suffix specified in cache size: %s. 
" + "The only supported suffix is 'mb' for megabytes\n", + argv[ x + 1 ] ); + return EXIT_FAILURE; + } + + ++x; + } + else + { + fprintf( stderr, "Invalid cache size value specified: %s. " + "Must be a number with the 'mb' suffix, e.g. '100mb'\n", + argv[ x + 1 ] ); + return EXIT_FAILURE; + } + } + else + args.push_back( argv[ x ] ); + } + + if ( nonEncrypted && passwordFile ) + throw exNonEncryptedWithKey(); + + if ( args.size() < 1 ) + { + fprintf( stderr, +"ZBackup, a versatile deduplicating backup tool, version 1.0\n" +"Copyright (c) 2012-2013 Konstantin Isakov \n" +"Comes with no warranty. Licensed under GNU GPLv2 or later.\n" +"Visit the project's home page at http://zbackup.org/\n\n" + +"Usage: %s [flags] [command args]\n" +" Flags: --non-encrypted|--password-file \n" +" --silent (default is verbose)\n" +" --threads (default is %zu on your system)\n" +" --cache-size MB (default is %zu)\n" +" Commands:\n" +" init - initializes new storage;\n" +" backup - performs a backup from stdin;\n" +" restore - restores a backup to stdout.\n", *argv, + defaultThreads, defaultCacheSizeMb ); + return EXIT_FAILURE; + } + + // Read the password + string passwordData; + if ( passwordFile ) + { + File f( passwordFile, File::ReadOnly ); + passwordData.resize( f.size() ); + f.read( &passwordData[ 0 ], passwordData.size() ); + + // If the password ends with \n, remove that last \n. 
Many editors will + // add \n there even if a user doesn't want them to + if ( !passwordData.empty() && + passwordData[ passwordData.size() - 1 ] == '\n' ) + passwordData.resize( passwordData.size() - 1 ); + } + + if ( strcmp( args[ 0 ], "init" ) == 0 ) + { + // Perform the init + if ( args.size() != 2 ) + { + fprintf( stderr, "Usage: %s init \n", *argv ); + return EXIT_FAILURE; + } + if ( !nonEncrypted && !passwordFile ) + throw exSpecifyEncryptionOptions(); + + ZBackup::initStorage( args[ 1 ], passwordData, !nonEncrypted ); + } + else + if ( strcmp( args[ 0 ], "backup" ) == 0 ) + { + // Perform the backup + if ( args.size() != 2 ) + { + fprintf( stderr, "Usage: %s backup \n", + *argv ); + return EXIT_FAILURE; + } + ZBackup zb( ZBackup::deriveStorageDirFromBackupsFile( args[ 1 ] ), + passwordData, threads ); + zb.backupFromStdin( args[ 1 ] ); + } + else + if ( strcmp( args[ 0 ], "restore" ) == 0 ) + { + // Perform the restore + if ( args.size() != 2 ) + { + fprintf( stderr, "Usage: %s restore \n", + *argv ); + return EXIT_FAILURE; + } + ZRestore zr( ZRestore::deriveStorageDirFromBackupsFile( args[ 1 ] ), + passwordData, cacheSizeMb * 1048576 ); + zr.restoreToStdin( args[ 1 ] ); + } + else + { + fprintf( stderr, "Error: unknown command line option: %s\n", args[ 0 ] ); + return EXIT_FAILURE; + } + } + catch( std::exception & e ) + { + fprintf( stderr, "%s\n", e.what() ); + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/zbackup.hh b/zbackup.hh new file mode 100644 index 0000000..1981ddf --- /dev/null +++ b/zbackup.hh @@ -0,0 +1,97 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. 
Licensed under GNU GPLv2 or later + +#ifndef ZBACKUP_HH_INCLUDED__ +#define ZBACKUP_HH_INCLUDED__ + +#include +#include +#include +#include + +#include "chunk_id.hh" +#include "chunk_index.hh" +#include "chunk_storage.hh" +#include "encryption_key.hh" +#include "ex.hh" +#include "tmp_mgr.hh" +#include "zbackup.pb.h" + +using std::string; +using std::vector; + +struct Paths +{ + string storageDir; + + Paths( string const & storageDir ); + + string getTmpPath(); + string getRestorePath(); + string getCreatePath(); + string getBundlesPath(); + string getStorageInfoPath(); + string getIndexPath(); + string getBackupsPath(); +}; + +class ZBackupBase: protected Paths +{ +public: + DEF_EX( Ex, "ZBackup exception", std::exception ) + DEF_EX_STR( exWontOverwrite, "Won't overwrite existing file", Ex ) + DEF_EX( exStdinError, "Error reading from standard input", Ex ) + DEF_EX( exWontReadFromTerminal, "Won't read data from a terminal", exStdinError ) + DEF_EX( exStdoutError, "Error writing to standard output", Ex ) + DEF_EX( exWontWriteToTerminal, "Won't write data to a terminal", exStdoutError ) + DEF_EX( exSerializeError, "Failed to serialize data", Ex ) + DEF_EX( exParseError, "Failed to parse data", Ex ) + DEF_EX( exChecksumError, "Checksum error", Ex ) + DEF_EX_STR( exCantDeriveStorageDir, "The path must be within the backups/ dir:", Ex ) + + /// Opens the storage + ZBackupBase( string const & storageDir, string const & password ); + + /// Creates new storage + static void initStorage( string const & storageDir, string const & password, + bool isEncrypted ); + + /// For a given file within the backups/ dir in the storage, returns its + /// storage dir or throws an exception + static string deriveStorageDirFromBackupsFile( string const & backupsFile ); + +protected: + StorageInfo storageInfo; + EncryptionKey encryptionkey; + TmpMgr tmpMgr; + ChunkIndex chunkIndex; + +private: + StorageInfo loadStorageInfo(); +}; + +class ZBackup: public ZBackupBase +{ + 
ChunkStorage::Writer chunkStorageWriter; + +public: + ZBackup( string const & storageDir, string const & password, + size_t threads ); + + /// Backs up the data from stdin + void backupFromStdin( string const & outputFileName ); +}; + +class ZRestore: public ZBackupBase +{ + ChunkStorage::Reader chunkStorageReader; + +public: + ZRestore( string const & storageDir, string const & password, + size_t cacheSize ); + + /// Restores the data to stdin + void restoreToStdin( string const & inputFileName ); +}; + +#endif diff --git a/zbackup.proto b/zbackup.proto new file mode 100644 index 0000000..b1b138c --- /dev/null +++ b/zbackup.proto @@ -0,0 +1,109 @@ +// Copyright (c) 2012-2013 Konstantin Isakov +// Part of ZBackup. Licensed under GNU GPLv2 or later + +// Protobuffers used in zbackup + +// This stores the key used for the encryption of all the blocks. The key itself +// is stored in the encrypted form. A user supplies a password - it is used +// together with salt and rounds to generate a decryption key for the actual +// key used for block encryption. This way we can change the password without +// re-encrypting all the blocks +message EncryptionKeyInfo +{ + // The decryption key is derived from the password, salt and rounds using + // PKCS5_PBKDF2_HMAC_SHA1 + + // Salt to use together with the user password + required bytes salt = 1; + // Rounds of hashing to apply when generating the key used to decrypt the + // block key + required uint32 rounds = 2; + // Stores the block encryption key, in an encrypted form itself + required bytes encrypted_key = 3; + // Used to check that the key was decrypted correctly - see the next field + required bytes key_check_input = 4; + // HMAC of key_check_input using the decrypted key. 
Used to check that the + // key was indeed decrypted correctly + required bytes key_check_hmac = 5; +} + +message StorageInfo +{ + // Maximum chunk size used when storing chunks + required uint32 chunk_max_size = 1; + // Maximum number of bytes a bundle can hold. Only real chunk bytes are + // counted, not metadata. Any bundle should be able to contain at least + // one arbitrary single chunk, so this should not be smaller than + // chunk_max_size + required uint32 bundle_max_payload_size = 2; + // If present, used for encryption/decryption of all data + optional EncryptionKeyInfo encryption_key = 3; +} + +message BundleInfo +{ + // Info about a single chunk stored + message ChunkRecord + { + // Id of the chunk + required bytes id = 1; + // Size of the chunk + required uint32 size = 2; + } + + // A sequence of chunk records + repeated ChunkRecord chunk_record = 1; +} + +message FileHeader +{ + // File format version + required uint32 version = 1; +} + +message IndexBundleHeader +{ + // Id of the bundle following in the stream. If not present, indicates the + // end of log file + optional bytes id = 1; +} + +// A single instruction. Backups are made of a sequence of those instructions, +// which are executed one after another +message BackupInstruction +{ + // Both fields can present simultaneously. They are evaluated in the same + // order they are listed here + + // If present, the chunk with that id should be emitted to the data flow + optional bytes chunk_to_emit = 1; + // If present, the bytes contained in the field should be emitted to the + // data flow + optional bytes bytes_to_emit = 2; +} + +message BackupInfo +{ + // The backup data. Since usually the field is quite large for real life + /// backups, we process its serialized data with the same backup algorithm + // iteratively until it doesn't shrink. The content of this field represents + // the last iteration of that process. If iterations = 0, it directly + // represents the user's backup data. 
If iterations = 1, it represents the + // backed up BackupData which would represent the user's backed up data once + // it is restored, and so on. + // The type is 'bytes' as the result is serialized + required bytes backup_data = 1; + + // Number of times backup_data should be restored with the 'restore' algorithm + // before we get what we need to restore for the end user + optional uint32 iterations = 2 [default = 0]; + + // Number of bytes in the backup data + required uint64 size = 3; + + // SHA-256 of the original data + required bytes sha256 = 4; + + // Time spent creating the backup, in seconds + optional int64 time = 5; +}