// Copyright (c) 2012-2014 Konstantin Isakov and ZBackup contributors, see CONTRIBUTORS // Part of ZBackup. Licensed under GNU GPLv2 or later + OpenSSL, see LICENSE #include #include "compression.hh" #include "check.hh" #include "endian.hh" namespace Compression { EnDecoder::EnDecoder() { } EnDecoder::~EnDecoder() { } CompressionMethod::~CompressionMethod() { } // LZMA #include class LZMAEnDecoder : public EnDecoder { protected: static lzma_stream initValue; lzma_stream strm; public: LZMAEnDecoder() { strm = initValue; } void setInput( const void* data, size_t size ) { strm.next_in = (const uint8_t *) data; strm.avail_in = size; } void setOutput( void* data, size_t size ) { strm.next_out = (uint8_t *) data; strm.avail_out = size; } size_t getAvailableInput() { return strm.avail_in; } size_t getAvailableOutput() { return strm.avail_out; } bool process( bool finish ) { lzma_ret ret = lzma_code( &strm, ( finish ? LZMA_FINISH : LZMA_RUN ) ); CHECK( ret == LZMA_OK || ret == LZMA_STREAM_END, "lzma_code error: %d", (int) ret ); return ( ret == LZMA_STREAM_END ); } ~LZMAEnDecoder() { lzma_end( &strm ); } }; lzma_stream LZMAEnDecoder::initValue = LZMA_STREAM_INIT; class LZMAEncoder : public LZMAEnDecoder { public: LZMAEncoder() { uint32_t preset = 6; lzma_ret ret = lzma_easy_encoder( &strm, preset, LZMA_CHECK_CRC64 ); CHECK( ret == LZMA_OK, "lzma_easy_encoder error: %d", (int) ret ); } LZMAEncoder( Config const & config ) { uint32_t compressionLevel = config.GET_STORABLE( lzma, compression_level ); uint32_t preset = ( compressionLevel > 9 ) ? ( compressionLevel - 10 ) | LZMA_PRESET_EXTREME : compressionLevel; lzma_ret ret = lzma_easy_encoder( &strm, preset, LZMA_CHECK_CRC64 ); CHECK( ret == LZMA_OK, "lzma_easy_encoder error: %d", (int) ret ); } }; class LZMADecoder : public LZMAEnDecoder { public: LZMADecoder() { lzma_ret ret = lzma_stream_decoder( &strm, UINT64_MAX, 0 ); CHECK( ret == LZMA_OK,"lzma_stream_decoder error: %d", (int) ret ); } }; class LZMACompression : public CompressionMethod { public: sptr createEncoder( Config const & config ) const { return new LZMAEncoder( config ); } sptr createEncoder() const { return new LZMAEncoder(); } sptr createDecoder() const { return new LZMADecoder(); } std::string getName() const { return "lzma"; } }; // LZO // liblzo implements a lot of algorithms "for unlimited backward compatibility" // The web site says: // "My experiments have shown that LZO1B is good with a large blocksize // or with very redundant data, LZO1F is good with a small blocksize or // with binary data and that LZO1X is often the best choice of all. // LZO1Y and LZO1Z are almost identical to LZO1X - they can achieve a // better compression ratio on some files. // Beware, your mileage may vary." // => I'm using LZO1X, as suggested #include // Unfortunately, liblzo always works with the whole data, so it doesn't support // the streaming approach that most other libraries use. This means that we have // to use a big buffer for the data. The class NoStreamEnDecoder implements this // so we can use it, if there is another library like liblzo. // Collect all data and process it in one pass class NoStreamEnDecoder : public EnDecoder { std::string accDataIn, accDataOut; const char* dataIn; char* dataOut; size_t availIn, availOut; bool processed; size_t posInAccDataOut; protected: // you must implement these: // Should we try with the existing output buffer which has availOut // bytes of free space? If you know that this will fail, return false. // You may peek into dataIn which contains the complete compressed data. virtual bool shouldTryWith( const char* dataIn, size_t availIn, size_t availOut ) =0; // We will allocate a buffer for the output data. How big should it be? // You may peek into dataIn which contains the complete compressed data. virtual size_t suggestOutputSize( const char* dataIn, size_t availIn ) =0; // Is this input complete? // An encoder should return false. virtual bool isCompleteInput( const char* dataIn, size_t availIn ) =0; // Process the data in dataIn and put the result into dataOut. You musn't // write more than availOut bytes! If the output buffer is big enough, // process the data and store the output size in outputSize. If the output // buffer is too small, return false and we will give you a bigger one. If // any other error occurrs, abort the program. We don't have any better // error handling. Sorry. Do NOT return false for errors that won't be // remedied by a bigger buffer! virtual bool doProcess( const char* dataIn, size_t availIn, char* dataOut, size_t availOut, size_t& outputSize ) =0; void setUnusedInput( size_t unused ) { this->dataIn += availIn - unused; this->availIn = unused; } public: NoStreamEnDecoder() { dataIn = dataOut = NULL; availIn = availOut = posInAccDataOut = 0; processed = false; } void setInput( const void* data, size_t size ) { dataIn = (const char *) data; availIn = size; } void setOutput( void* data, size_t size ) { dataOut = (char *) data; availOut = size; } size_t getAvailableInput() { return availIn; } size_t getAvailableOutput() { return availOut; } bool process( bool finish ) { // try to process the input, if we haven't done it, yet if ( !processed ) { // data has not been encoded if ( accDataIn.empty() ) { // this is the first piece of data if ( finish || isCompleteInput( dataIn, availIn ) ) { // special case: all the data has been passed at once // -> process it without using accDataIn processFinish( dataIn, availIn ); } } // if we didn't process the data, put it into accumulator if ( !processed ) { // accumulate data in accDataIn accDataIn.append( dataIn, availIn ); // If this was the last bit of data, we process it, now. if ( finish || isCompleteInput( accDataIn.data(), accDataIn.size() ) ) { processFinish( accDataIn.data(), accDataIn.size() ); } } } // If the input has been processed, try to copy some of it to the output buffer. if ( processed ) { // data has been encoded or decoded, remaining output is in accDataOut // -> copy to output if ( availOut > 0 && accDataOut.size() - posInAccDataOut > 0 ) { size_t sz = availOut; if ( sz > accDataOut.size() - posInAccDataOut ) sz = accDataOut.size() - posInAccDataOut; memcpy( dataOut, accDataOut.data() + posInAccDataOut, sz ); dataOut += sz; availOut -= sz; posInAccDataOut += sz; } // no more data left? -> return true return ( accDataOut.size() - posInAccDataOut == 0 ); } else { // not yet processed, so we cannot be done return false; } } private: void processFinish( const char* dataIn, size_t availIn ) { // should we try with the existing output buffer? if ( shouldTryWith( dataIn, availIn, availOut ) ) { size_t outputSize; if ( doProcess( dataIn, availIn, dataOut, availOut, outputSize ) ) { // it worked :-) processed = true; availOut -= outputSize; return ; } } // we use our own buffer size_t bufferSize = suggestOutputSize( dataIn, availIn ); do { accDataOut.resize( bufferSize ); size_t outputSize; //TODO doc says we mustn't modify the pointer returned by data()... if ( doProcess( dataIn, availIn, (char*) accDataOut.data(), bufferSize, outputSize ) ) { // buffer is big enough accDataOut.resize( outputSize ); processed = true; return ; } // try a bigger one bufferSize *= 2; } while (true); } }; #ifdef __APPLE__ #include #elif __FreeBSD__ #include #else #include #endif // like NoStreamEnDecoder, but also adds the uncompressed size before the stream //NOTE You should make sure that the compression function doesn't overwrite any // memory, if this information is corrupted! This could be exploited by a // malicious person and there is nothing I can do about it. I could check for // an overflow, but when control gets back to this class, it is already too // late, as one 'ret' instruction is enough to do harm. class NoStreamAndUnknownSizeDecoder : public NoStreamEnDecoder { protected: // You implement this one: // If you don't know the real decoded size, don't change outputSize. virtual bool doProcessNoSize( const char* dataIn, size_t availIn, char* dataOut, size_t availOut, size_t& outputSize ) =0; bool shouldTryWith( const char* dataIn, size_t availIn, size_t availOut ) { return suggestOutputSize( dataIn, availIn ) <= availOut; } // Is this input complete? bool isCompleteInput( const char* dataIn, size_t availIn ) { if ( availIn < 2*sizeof(uint64_t) ) return false; dataIn += sizeof(uint64_t); size_t inputSize = le32toh( *(uint32_t*) dataIn ); return ( availIn >= inputSize + 2*sizeof(uint64_t) ); } size_t suggestOutputSize( const char* dataIn, size_t availIn ) { CHECK( availIn >= sizeof(uint64_t), "not enough input data" ); // We're not using size_t because we need a type that has the same size on all // architectures. A 32-bit host won't be able to open files with more than // 4GB (actually much less), so 4 byte are enough. Even a 64-bit host would // have some trouble with allocating 8GB of RAM just for our buffers ;-) //NOTE If your compiler doesn't accept this cast, your size_t is smaller than // uint32_t. In that case, you are in trouble... size_t outputSize = le32toh( *(uint32_t*) dataIn ); return outputSize; } bool doProcess( const char* dataIn, size_t availIn, char* dataOut, size_t availOut, size_t& outputSize ) { if ( availIn < 2*sizeof( uint64_t ) ) return false; //NOTE We skip 8 bytes. If we later decide to drop compatibility with 32-bit // hosts, we can save a 64-bit size. Well, that will be much later, when // we can easily hold two copies of a 4GB file in main memory :-D size_t neededOutputSize = le32toh( *(uint32_t*) dataIn ); dataIn += sizeof(uint64_t); size_t inputSize = le32toh( *(uint32_t*) dataIn ); dataIn += sizeof(uint64_t); /*if ( outputSize < neededOutputSize ) return false;*/ outputSize = neededOutputSize; availIn -= 2*sizeof( uint64_t ); // We might not need all of our input data. setUnusedInput( availIn - inputSize ); availIn = inputSize; size_t reportedOutputSize = neededOutputSize; if ( !doProcessNoSize( dataIn, availIn, dataOut, availOut, reportedOutputSize ) ) return false; CHECK( reportedOutputSize == neededOutputSize, "Size of decoded data is different than expected" ); return true; } }; // encoder for NoStreamAndUnknownSizeDecoder class NoStreamAndUnknownSizeEncoder : public NoStreamEnDecoder { protected: // You implement this one: virtual bool doProcessNoSize( const char* dataIn, size_t availIn, char* dataOut, size_t availOut, size_t& outputSize ) =0; bool shouldTryWith( const char*, size_t, size_t availOut ) { // If the compression doesn't use any spaces... return availOut > sizeof( uint64_t ); } bool isCompleteInput( const char* dataIn, size_t availIn ) { // We cannot know whether the user wants to send more data. // -> return false; user must use finish=true to signal end of data return false; } size_t getOverhead() { return 2*sizeof( uint64_t ); } size_t suggestOutputSize( const char*, size_t availIn ) { // We assume that the compression won't make the data any bigger. return availIn + getOverhead(); } bool doProcess( const char* dataIn, size_t availIn, char* dataOut, size_t availOut, size_t& outputSize ) { CHECK( availIn <= UINT32_MAX, "You want to compress more than 4GB of data?! Sorry, we don't support that, yet." ); memcpy(dataOut, "ABCDEFGHIJKLMNOP", 16); // store size *(uint32_t*)dataOut = htole32( availIn ); uint32_t* compressedSize = (uint32_t*) ( dataOut + sizeof( uint64_t ) ); // compressed data goes after the size // We skip more than we actually use; see NoStreamAndUnknownSizeDecoder::doProcess(...). dataOut += getOverhead(); availOut -= getOverhead(); if ( !doProcessNoSize( dataIn, availIn, dataOut, availOut, outputSize ) ) return false; CHECK( outputSize <= UINT32_MAX, "The compressed data is more than 4GB?! Sorry, we don't support that, yet." ); *compressedSize = htole32( (uint32_t) outputSize ); outputSize += getOverhead(); return true; } }; #ifdef HAVE_LIBLZO #include // finally, we can implement lzo class LZO1X_1_Decoder : public NoStreamAndUnknownSizeDecoder { protected: bool doProcessNoSize( const char* dataIn, size_t availIn, char* dataOut, size_t availOut, size_t& outputSize ) { // same argument is used for available output size and size of decompressed data outputSize = availOut; int ret = lzo1x_decompress_safe( (const lzo_bytep) dataIn, availIn, (lzo_bytep) dataOut, (lzo_uintp) &outputSize, NULL ); if ( ret == LZO_E_OUTPUT_OVERRUN ) return false; CHECK( ret >= LZO_E_OK, "lzo1x_decompress_safe failed (code %d)", ret ); return true; } }; class LZO1X_1_Compression; class LZO1X_1_Encoder : public NoStreamAndUnknownSizeEncoder { const LZO1X_1_Compression* compression; static size_t calcMaxCompressedSize( size_t availIn ); public: LZO1X_1_Encoder( const LZO1X_1_Compression* compression ) { this->compression = compression; } protected: bool doProcessNoSize( const char* dataIn, size_t availIn, char* dataOut, size_t availOut, size_t& outputSize ); bool shouldTryWith( const char*, size_t, size_t availOut ); size_t suggestOutputSize( const char*, size_t availIn ); }; class LZO1X_1_Compression : public CompressionMethod { static bool initialized; static void init() { //TODO This is not thread-safe. Does it have to be? if (!initialized) { int ret = lzo_init(); CHECK( ret == LZO_E_OK, "lzo_init failed (%d)", ret ); initialized = true; } } public: sptr< EnDecoder > createEncoder( Config const & config ) const { init(); return new LZO1X_1_Encoder(this); } sptr< EnDecoder > createEncoder() const { init(); return new LZO1X_1_Encoder(this); } sptr< EnDecoder > createDecoder() const { init(); return new LZO1X_1_Decoder(); } std::string getName() const { return "lzo1x_1"; } lzo_voidp getWorkmem( size_t size ) const { return new char[size]; } void giveBackWorkmem( lzo_voidp wrkmem ) const { //TODO I think we should keep the memory around and reuse it. After all // it is only a few kilobytes and we will need it a lot. However, I // won't risk anything here because I don't know whether this will be // called by more than one thread. delete[] (char*)wrkmem; } }; bool LZO1X_1_Compression::initialized = false; size_t LZO1X_1_Encoder::calcMaxCompressedSize( size_t availIn ) { // It seems that lzo1x_1_compress does NOT check whether the buffer is big enough. // The documentation refers to example/simple.c which says: // "Because the input block may be incompressible, we must provide a little more // output space in case that compression is not possible." // -> We use the same formula. return (availIn + availIn / 16 + 64 + 3); } bool LZO1X_1_Encoder::shouldTryWith( const char* dataIn, size_t availIn, size_t availOut ) { return availOut >= suggestOutputSize( dataIn, availIn ); } size_t LZO1X_1_Encoder::suggestOutputSize( const char*, size_t availIn ) { // It seems that lzo1x_1_compress does NOT check whether the buffer is big enough. // The documentation refers to example/simple.c which says: // "Because the input block may be incompressible, we must provide a little more // output space in case that compression is not possible." // -> We use the same formula. return calcMaxCompressedSize( availIn ) + getOverhead(); } bool LZO1X_1_Encoder::doProcessNoSize( const char* dataIn, size_t availIn, char* dataOut, size_t availOut, size_t& outputSize ) { // It seems that lzo1x_1_compress does NOT check whether the buffer is big enough. // Therefore, we won't try it unless we are sure that the buffer is big enough. if ( availOut < calcMaxCompressedSize( availIn ) ) return false; // same argument is used for available output size (haha, see above) // and size of decompressed data outputSize = availOut; lzo_voidp wrkmem = compression->getWorkmem(LZO1X_1_MEM_COMPRESS); int ret = lzo1x_1_compress( (const lzo_bytep) dataIn, availIn, (lzo_bytep) dataOut, (lzo_uintp) &outputSize, wrkmem ); compression->giveBackWorkmem(wrkmem); if ( ret == LZO_E_OUTPUT_OVERRUN ) return false; CHECK( ret >= LZO_E_OK, "lzo1x_1_compress failed (code %d)", ret ); return true; } #endif // HAVE_LIBLZO // register them const_sptr< CompressionMethod > const CompressionMethod::compressions[] = { new LZMACompression(), # ifdef HAVE_LIBLZO new LZO1X_1_Compression(), # endif // NULL entry marks end of list. Don't remove it! NULL }; const_sptr< CompressionMethod > CompressionMethod::selectedCompression = compressions[ 0 ]; const_sptr< CompressionMethod > CompressionMethod::findCompression( const std::string& name, bool optional ) { for ( const const_sptr* c = compressions + 0; *c; ++c ) { if ( (*c)->getName() == name ) { return (*c); } } if ( !optional ) { throw exUnsupportedCompressionMethod( name ); } return NULL; } // iterator over compressions CompressionMethod::iterator::iterator( const const_sptr< CompressionMethod > * ptr ): ptr( ptr ) { } CompressionMethod::iterator::iterator( const iterator & it ): ptr( it.ptr ) { } CompressionMethod::iterator& CompressionMethod::iterator::operator =( const iterator& it ) { this->ptr = it.ptr; return *this; } bool CompressionMethod::iterator::operator ==( const iterator& other ) const { // special case: one has ptr==NULL (end iterator returned by end()) and the // other has *ptr==NULL (end iterator obtained by calling ++) if ( !ptr && ( !other.ptr || !*other.ptr ) ) return true; else if ( !other.ptr && ( !ptr || !*ptr ) ) return true; else return (ptr == other.ptr); } bool CompressionMethod::iterator::operator !=( const iterator& other ) const { return !( *this == other ); } bool CompressionMethod::iterator::atEnd() const { return !ptr || !*ptr; } CompressionMethod::iterator& CompressionMethod::iterator::operator ++() { CHECK( ptr && *ptr, "Cannot increment the end iterator" ); ++ptr; return *this; } const_sptr CompressionMethod::iterator::operator *() { CHECK( ptr && *ptr, "Cannot dereference the end iterator" ); return *ptr; } CompressionMethod::iterator CompressionMethod::begin() { return iterator(compressions); } CompressionMethod::iterator CompressionMethod::end() { return iterator(NULL); } }