// zbackup/compression.cc

// Copyright (c) 2012-2013 Konstantin Isakov <ikm@zbackup.org>
// Part of ZBackup. Licensed under GNU GPLv2 or later
#include <string>
#include "compression.hh"
#include "check.hh"
// Default constructor; the body is empty, nothing is set up here.
EnDecoder::EnDecoder()
{
}
// Copy constructor with an empty body: nothing is taken over from the
// source object.
EnDecoder::EnDecoder( const EnDecoder& )
{
}
// Destructor; defined out of line, does nothing itself.
EnDecoder::~EnDecoder()
{
}
// Destructor; defined out of line, does nothing itself.
Compression::~Compression()
{
}
// LZMA
#include <lzma.h>
class LZMAEnDecoder : public EnDecoder
{
protected:
static lzma_stream init_value;
lzma_stream strm;
public:
LZMAEnDecoder()
{
strm = init_value;
}
void setInput( const void* data, size_t size )
{
strm.next_in = (const uint8_t *) data;
strm.avail_in = size;
}
void setOutput( void* data, size_t size )
{
strm.next_out = (uint8_t *) data;
strm.avail_out = size;
}
size_t getAvailableInput()
{
return strm.avail_in;
}
size_t getAvailableOutput()
{
return strm.avail_out;
}
bool process( bool finish )
{
lzma_ret ret = lzma_code( &strm, ( finish ? LZMA_FINISH : LZMA_RUN ) );
CHECK( ret == LZMA_OK || ret == LZMA_STREAM_END, "lzma_code error: %d", (int) ret );
return ( ret == LZMA_STREAM_END );
}
};
lzma_stream LZMAEnDecoder::init_value = LZMA_STREAM_INIT;
// LZMA compressor: initialises the inherited stream as an "easy" encoder
// with a CRC64 integrity check.
class LZMAEncoder : public LZMAEnDecoder
{
public:
  LZMAEncoder()
  {
    // TODO: make this customizable, although 6 seems to be the best option
    const uint32_t preset = 6;

    lzma_ret ret = lzma_easy_encoder( &strm, preset, LZMA_CHECK_CRC64 );
    CHECK( ret == LZMA_OK, "lzma_easy_encoder error: %d", (int) ret );
  }
};
// LZMA decompressor: initialises the inherited stream as a stream decoder.
class LZMADecoder : public LZMAEnDecoder
{
public:
  LZMADecoder()
  {
    // Second argument is the memory limit, third one the decoder flags
    const uint64_t memory_limit = UINT64_MAX;
    const uint32_t decoder_flags = 0;

    lzma_ret ret = lzma_stream_decoder( &strm, memory_limit, decoder_flags );
    CHECK( ret == LZMA_OK,"lzma_stream_decoder error: %d", (int) ret );
  }
};
// Compression method "lzma": hands out LZMA encoders and decoders.
class LZMACompression : public Compression
{
public:
  // Creates a fresh encoder; the returned smart pointer owns it.
  sptr<EnDecoder> createEncoder() const
  {
    return sptr<EnDecoder>( new LZMAEncoder() );
  }

  // Creates a fresh decoder; the returned smart pointer owns it.
  sptr<EnDecoder> createDecoder() const
  {
    return sptr<EnDecoder>( new LZMADecoder() );
  }

  // Name under which findCompression() finds this method.
  std::string getName() const
  {
    return std::string( "lzma" );
  }
};
// LZO
// liblzo implements a lot of algorithms "for unlimited backward compatibility"
// The web site says:
// "My experiments have shown that LZO1B is good with a large blocksize
// or with very redundant data, LZO1F is good with a small blocksize or
// with binary data and that LZO1X is often the best choice of all.
// LZO1Y and LZO1Z are almost identical to LZO1X - they can achieve a
// better compression ratio on some files.
// Beware, your mileage may vary."
// => I'm using LZO1X, as suggested
#include <string.h>
// Unfortunately, liblzo always works with the whole data, so it doesn't support
// the streaming approach that most other libraries use. This means that we have
// to use a big buffer for the data. The class NoStreamEnDecoder implements this
// so we can use it, if there is another library like liblzo.
// Collect all data and process it in one pass
class NoStreamEnDecoder : public EnDecoder
{
std::string acc_data_in, acc_data_out;
const char* data_in;
char* data_out;
size_t avail_in, avail_out;
bool processed;
size_t pos_in_acc_data_out;
protected:
// you must implement these:
// Should we try with the existing output buffer which has avail_out
// bytes of free space? If you know that this will fail, return false.
// You may peek into data_in which contains the complete compressed data.
virtual bool shouldTryWith( const char* data_in, size_t avail_in, size_t avail_out ) =0;
// We will allocate a buffer for the output data. How big should it be?
// You may peek into data_in which contains the complete compressed data.
virtual size_t suggestOutputSize( const char* data_in, size_t avail_in ) =0;
// Is this input complete?
// An encoder should return false.
virtual bool isCompleteInput( const char* data_in, size_t avail_in ) =0;
// Process the data in data_in and put the result into data_out. You musn't
// write more than avail_out bytes! If the output buffer is big enough,
// process the data and store the output size in output_size. If the output
// buffer is too small, return false and we will give you a bigger one. If
// any other error occurrs, abort the program. We don't have any better
// error handling. Sorry. Do NOT return false for errors that won't be
// remedied by a bigger buffer!
virtual bool do_process( const char* data_in, size_t avail_in,
char* data_out, size_t avail_out, size_t& output_size ) =0;
void setUnusedInput( size_t unused )
{
this->data_in += avail_in - unused;
this->avail_in = unused;
}
public:
NoStreamEnDecoder()
{
data_in = data_out = NULL;
avail_in = avail_out = pos_in_acc_data_out = 0;
processed = false;
}
void setInput( const void* data, size_t size )
{
data_in = (const char *) data;
avail_in = size;
}
void setOutput( void* data, size_t size )
{
data_out = (char *) data;
avail_out = size;
}
size_t getAvailableInput()
{
return avail_in;
}
size_t getAvailableOutput()
{
return avail_out;
}
bool process( bool finish )
{
// try to process the input, if we haven't done it, yet
if ( !processed )
{
// data has not been encoded
if ( acc_data_in.empty() )
{
// this is the first piece of data
if ( finish || isCompleteInput( data_in, avail_in ) )
{
// special case: all the data has been passed at once
// -> process it without using acc_data_in
process_finish( data_in, avail_in );
}
}
// if we didn't process the data, put it into accumulator
if ( !processed )
{
// accumulate data in acc_data_in
acc_data_in.append( data_in, avail_in );
// If this was the last bit of data, we process it, now.
if ( finish || isCompleteInput( acc_data_in.data(), acc_data_in.size() ) )
{
process_finish( acc_data_in.data(), acc_data_in.size() );
}
}
}
// If the input has been processed, try to copy some of it to the output buffer.
if ( processed )
{
// data has been encoded or decoded, remaining output is in acc_data_out
// -> copy to output
if (avail_out > 0 && acc_data_out.size() - pos_in_acc_data_out > 0)
{
size_t sz = avail_out;
if ( sz > acc_data_out.size() - pos_in_acc_data_out )
sz = acc_data_out.size() - pos_in_acc_data_out;
memcpy( data_out, acc_data_out.data() + pos_in_acc_data_out, sz );
data_out += sz;
avail_out -= sz;
pos_in_acc_data_out += sz;
}
// no more data left? -> return true
return ( acc_data_out.size() - pos_in_acc_data_out == 0 );
} else {
// not yet processed, so we cannot be done
return false;
}
}
private:
void process_finish( const char* data_in, size_t avail_in )
{
// should we try with the existing output buffer?
if ( shouldTryWith( data_in, avail_in, avail_out ) )
{
size_t output_size;
if ( do_process( data_in, avail_in, data_out, avail_out, output_size ) )
{
// it worked :-)
processed = true;
avail_out -= output_size;
return ;
}
}
// we use our own buffer
size_t buffer_size = suggestOutputSize( data_in, avail_in );
do {
acc_data_out.resize(buffer_size);
size_t output_size;
//TODO doc says we mustn't modify the pointer returned by data()...
if ( do_process( data_in, avail_in,
(char*) acc_data_out.data(), buffer_size, output_size ) )
{
// buffer is big enough
acc_data_out.resize( output_size );
processed = true;
return ;
}
// try a bigger one
buffer_size *= 2;
} while (true);
}
};
#include <endian.h>
// like NoStreamEnDecoder, but also adds the uncompressed size before the stream
//NOTE You should make sure that the compression function doesn't overwrite any
// memory, if this information is corrupted! This could be exploited by a
// malicious person and there is nothing I can do about it. I could check for
// an overflow, but when control gets back to this class, it is already too
// late, as one 'ret' instruction is enough to do harm.
// Decoder for data that has a 16-byte header in front of the compressed
// stream: the uncompressed size at offset 0 and the compressed size at
// offset 8. Both are 32-bit little-endian values; the remaining header bytes
// are ignored.
class NoStreamAndUnknownSizeDecoder : public NoStreamEnDecoder
{
  // Reads a 32-bit little-endian value. memcpy is used because the buffer
  // doesn't have to be aligned.
  static size_t readLE32( const char* p )
  {
    uint32_t raw;
    memcpy( &raw, p, sizeof( raw ) );
    return le32toh( raw );
  }

protected:
  // You implement this one:
  // If you don't know the real decoded size, don't change output_size.
  virtual bool do_process_no_size( const char* data_in, size_t avail_in,
      char* data_out, size_t avail_out, size_t& output_size ) =0;

  // The caller's buffer is good enough, if it can hold the decoded data.
  bool shouldTryWith( const char* data_in, size_t avail_in, size_t avail_out )
  {
    return suggestOutputSize( data_in, avail_in ) <= avail_out;
  }

  // Is this input complete?
  // It is, if we have the header and as many bytes as the header announces.
  bool isCompleteInput( const char* data_in, size_t avail_in )
  {
    if ( avail_in < 2*sizeof(uint64_t) )
      return false;

    size_t input_size = readLE32( data_in + sizeof(uint64_t) );

    // subtract on the left side: input_size + 16 could wrap around, if
    // size_t has 32 bits
    return ( avail_in - 2*sizeof(uint64_t) >= input_size );
  }

  // The output buffer must hold the uncompressed size stored in the header.
  size_t suggestOutputSize( const char* data_in, size_t avail_in )
  {
    CHECK( avail_in >= sizeof(uint64_t), "not enough input data" );

    // We're not using size_t because we need a type that has the same size on all
    // architectures. A 32-bit host won't be able to open files with more than
    // 4GB (actually much less), so 4 byte are enough. Even a 64-bit host would
    // have some trouble with allocating 8GB of RAM just for our buffers ;-)
    return readLE32( data_in );
  }

  // Checks the header and decodes the payload with do_process_no_size().
  // Returns false only if avail_out is too small. Truncated input and a wrong
  // decoded size fail a CHECK because a bigger buffer wouldn't help.
  bool do_process( const char* data_in, size_t avail_in,
      char* data_out, size_t avail_out, size_t& output_size )
  {
    // The caller retries with a bigger buffer as long as we return false, so
    // a missing header must not be reported that way.
    CHECK( avail_in >= 2*sizeof( uint64_t ), "not enough input data" );

    //NOTE We skip 8 bytes. If we later decide to drop compatibility with 32-bit
    //     hosts, we can save a 64-bit size. Well, that will be much later, when
    //     we can easily hold two copies of a 4GB file in main memory :-D
    const size_t needed_output_size = readLE32( data_in );
    const size_t input_size = readLE32( data_in + sizeof(uint64_t) );
    data_in += 2*sizeof( uint64_t );
    avail_in -= 2*sizeof( uint64_t );

    // Without this check the subtraction below would wrap around and the
    // decoder would read past the end of the input.
    CHECK( input_size <= avail_in, "Compressed data is truncated" );

    // Compare with the available space. output_size is an out-parameter and
    // doesn't have a meaningful value, yet.
    if ( avail_out < needed_output_size )
      return false;

    // We might not need all of our input data.
    setUnusedInput( avail_in - input_size );

    size_t reported_output_size = needed_output_size;
    if ( !do_process_no_size( data_in, input_size, data_out, avail_out, reported_output_size ) )
      return false;

    CHECK( reported_output_size == needed_output_size,
      "Size of decoded data is different than expected" );

    output_size = needed_output_size;
    return true;
  }
};
// encoder for NoStreamAndUnknownSizeDecoder
// encoder for NoStreamAndUnknownSizeDecoder
// Puts a 16-byte header in front of the compressed data: the uncompressed
// size at offset 0 and the compressed size at offset 8, both as 32-bit
// little-endian values. The header bytes 4-7 and 12-15 keep the filler
// letters ("EFGH" and "MNOP").
class NoStreamAndUnknownSizeEncoder : public NoStreamEnDecoder
{
protected:
  // You implement this one:
  virtual bool do_process_no_size( const char* data_in, size_t avail_in,
      char* data_out, size_t avail_out, size_t& output_size ) =0;

  // The buffer must at least have room for the header because do_process()
  // writes all of it.
  bool shouldTryWith( const char*, size_t, size_t avail_out )
  {
    return avail_out >= getOverhead();
  }

  bool isCompleteInput( const char* data_in, size_t avail_in )
  {
    // We cannot know whether the user wants to send more data.
    // -> return false; user must use finish=true to signal end of data
    return false;
  }

  // Size of the header that we put in front of the compressed data.
  size_t getOverhead()
  {
    return 2*sizeof( uint64_t );
  }

  size_t suggestOutputSize( const char*, size_t avail_in )
  {
    // We assume that the compression won't make the data any bigger.
    return avail_in + getOverhead();
  }

  // Writes the header and lets do_process_no_size() put the compressed data
  // behind it. Returns false if the output buffer is too small. On success
  // output_size is the size of header and compressed data.
  bool do_process( const char* data_in, size_t avail_in,
      char* data_out, size_t avail_out, size_t& output_size )
  {
    CHECK( avail_in <= UINT32_MAX,
      "You want to compress more than 4GB of data?! Sorry, we don't support that, yet." );

    // Not even the header fits. Writing it anyway would overflow the buffer
    // and the subtraction below would wrap around.
    if ( avail_out < getOverhead() )
      return false;

    // filler for the header bytes that we don't use
    memcpy(data_out, "ABCDEFGHIJKLMNOP", 16);

    // store size (memcpy because data_out doesn't have to be aligned)
    const uint32_t size_le = htole32( avail_in );
    memcpy( data_out, &size_le, sizeof( size_le ) );
    char* const compressed_size_pos = data_out + sizeof( uint64_t );

    // compressed data goes after the size
    // We skip more than we actually use; see NoStreamAndUnknownSizeDecoder::do_process(...).
    data_out += getOverhead();
    avail_out -= getOverhead();

    if ( !do_process_no_size( data_in, avail_in, data_out, avail_out, output_size ) )
      return false;

    CHECK( output_size <= UINT32_MAX,
      "The compressed data is more than 4GB?! Sorry, we don't support that, yet." );
    const uint32_t compressed_size_le = htole32( (uint32_t) output_size );
    memcpy( compressed_size_pos, &compressed_size_le, sizeof( compressed_size_le ) );

    output_size += getOverhead();
    return true;
  }
};
#ifdef HAVE_LIBLZO
#include <lzo/lzo1x.h>
// finally, we can implement lzo
class LZO1X_1_Decoder : public NoStreamAndUnknownSizeDecoder
{
protected:
bool do_process_no_size( const char* data_in, size_t avail_in,
char* data_out, size_t avail_out, size_t& output_size )
{
// same argument is used for available output size and size of decompressed data
output_size = avail_out;
int ret = lzo1x_decompress_safe( (const lzo_bytep) data_in, avail_in,
(lzo_bytep) data_out, &output_size, NULL );
if ( ret == LZO_E_OUTPUT_OVERRUN )
return false;
CHECK( ret >= LZO_E_OK, "lzo1x_decompress_safe failed (code %d)", ret );
return true;
}
};
class LZO1X_1_Compression;

// Encoder for LZO1X-1. The size header is written by the base class; the
// member functions are defined after LZO1X_1_Compression.
class LZO1X_1_Encoder : public NoStreamAndUnknownSizeEncoder
{
  // The compression object gives us the work memory for liblzo.
  const LZO1X_1_Compression* compression;

  // Output space that we provide for avail_in bytes of input.
  static size_t calcMaxCompressedSize(size_t avail_in);

public:
  LZO1X_1_Encoder(const LZO1X_1_Compression* compression)
    : compression( compression )
  {
  }

protected:
  bool do_process_no_size( const char* data_in, size_t avail_in,
      char* data_out, size_t avail_out, size_t& output_size );
  bool shouldTryWith( const char*, size_t, size_t avail_out );
  size_t suggestOutputSize( const char*, size_t avail_in );
};
// Compression method "lzo1x_1": hands out LZO1X-1 encoders and decoders and
// provides the work memory for the encoder.
class LZO1X_1_Compression : public Compression
{
  // Has lzo_init() been called successfully?
  static bool initialized;

  // Calls lzo_init() the first time that it is used.
  static void init()
  {
    //TODO This is not thread-safe. Does it have to be?
    if ( initialized )
      return;

    int ret = lzo_init();
    CHECK( ret == LZO_E_OK, "lzo_init failed (%d)", ret );
    initialized = true;
  }

public:
  // Creates a fresh encoder which gets its work memory from this object.
  sptr<EnDecoder> createEncoder() const
  {
    init();
    return sptr<EnDecoder>( new LZO1X_1_Encoder(this) );
  }

  // Creates a fresh decoder.
  sptr<EnDecoder> createDecoder() const
  {
    init();
    return sptr<EnDecoder>( new LZO1X_1_Decoder() );
  }

  // Name under which findCompression() finds this method.
  std::string getName() const
  {
    return std::string( "lzo1x_1" );
  }

  // Allocates size bytes of work memory; hand it back with giveBackWorkmem().
  lzo_voidp getWorkmem( size_t size ) const
  {
    char* const block = new char[size];
    return block;
  }

  // Frees memory that has been obtained from getWorkmem().
  void giveBackWorkmem( lzo_voidp wrkmem ) const
  {
    //TODO I think we should keep the memory around and reuse it. After all
    //     it is only a few kilobytes and we will need it a lot. However, I
    //     won't risk anything here because I don't know whether this will be
    //     called by more than one thread.
    char* const block = (char*)wrkmem;
    delete[] block;
  }
};

bool LZO1X_1_Compression::initialized = false;
// Output space that we must provide for avail_in bytes of input.
size_t LZO1X_1_Encoder::calcMaxCompressedSize( size_t avail_in )
{
  // It seems that lzo1x_1_compress does NOT check whether the buffer is big enough.
  // The documentation refers to example/simple.c which says:
  // "Because the input block may be incompressible, we must provide a little more
  //  output space in case that compression is not possible."
  // -> We use the same formula.
  const size_t extra_space = avail_in / 16 + 64 + 3;
  return avail_in + extra_space;
}
// The caller's buffer is only used, if it has as much space as we would
// allocate for a buffer of our own.
bool LZO1X_1_Encoder::shouldTryWith( const char* data_in, size_t avail_in, size_t avail_out )
{
  const size_t required = suggestOutputSize( data_in, avail_in );
  return !( avail_out < required );
}
// Size of the output buffer: the header plus the space that liblzo may need
// for the compressed data.
size_t LZO1X_1_Encoder::suggestOutputSize( const char*, size_t avail_in )
{
  // It seems that lzo1x_1_compress does NOT check whether the buffer is big enough.
  // The documentation refers to example/simple.c which says:
  // "Because the input block may be incompressible, we must provide a little more
  //  output space in case that compression is not possible."
  // -> We use the same formula.
  const size_t payload_space = calcMaxCompressedSize( avail_in );
  return payload_space + getOverhead();
}
bool LZO1X_1_Encoder::do_process_no_size( const char* data_in, size_t avail_in,
char* data_out, size_t avail_out, size_t& output_size )
{
// It seems that lzo1x_1_compress does NOT check whether the buffer is big enough.
// Therefore, we won't try it unless we are sure that the buffer is big enough.
if ( avail_out < calcMaxCompressedSize( avail_in ) )
return false;
// same argument is used for available output size (haha, see above)
// and size of decompressed data
output_size = avail_out;
lzo_voidp wrkmem = compression->getWorkmem(LZO1X_1_MEM_COMPRESS);
2013-10-03 12:24:47 +04:00
int ret = lzo1x_1_compress( (const lzo_bytep) data_in, avail_in,
(lzo_bytep) data_out, &output_size, wrkmem );
compression->giveBackWorkmem(wrkmem);
if ( ret == LZO_E_OUTPUT_OVERRUN )
return false;
CHECK( ret >= LZO_E_OK, "lzo1x_1_compress failed (code %d)", ret );
return true;
}
#endif // HAVE_LIBLZO
// register them
// Table of all the compression methods that are compiled in. findCompression()
// and the iterator walk through it until they reach the NULL entry.
static const_sptr<Compression> const compressions[] = {
new LZMACompression(),
# ifdef HAVE_LIBLZO
new LZO1X_1_Compression(),
# endif
// NULL entry marks end of list. Don't remove it!
NULL
};
// The default is the first entry of the table, i.e. LZMA.
const_sptr<Compression> Compression::default_compression = compressions[0];
// Looks for the compression method with the given name (see getName()).
// Returns the method if it is registered. If it isn't: returns NULL when
// optional is true, otherwise the CHECK fails.
const_sptr<Compression> Compression::findCompression( const std::string& name, bool optional )
{
  for ( const const_sptr<Compression>* c = compressions; *c; ++c )
  {
    if ( (*c)->getName() == name )
      return *c;
  }

  // Not found. That is only acceptable, if the caller said that the method is
  // optional. (The condition used to be inverted: optional lookups aborted
  // and mandatory ones silently got NULL.)
  CHECK( optional, "Couldn't find compression method '%s'", name.c_str() );

  return NULL;
}
// iterator over compressions
// Creates an iterator that points to the given entry of the table (or NULL
// for the end iterator).
Compression::iterator::iterator( const const_sptr<Compression>* ptr )
  : ptr( ptr )
{
}
// Copy constructor: the new iterator points to the same entry.
Compression::iterator::iterator( const iterator& it )
  : ptr( it.ptr )
{
}
// Assignment: afterwards this iterator points to the same entry as 'it'.
Compression::iterator& Compression::iterator::operator =( const iterator& it )
{
  ptr = it.ptr;
  return *this;
}
// Two iterators are equal, if they point to the same entry. In addition, the
// end iterator of end() (ptr==NULL) is equal to an iterator that has reached
// the NULL entry of the table (*ptr==NULL).
bool Compression::iterator::operator ==( const iterator& other ) const
{
  // this is the end iterator returned by end(); the other one is at the end, too
  if ( !ptr && other.at_end() )
    return true;

  // the same, with swapped roles
  if ( !other.ptr && at_end() )
    return true;

  return ptr == other.ptr;
}
// The opposite of operator ==
bool Compression::iterator::operator !=( const iterator& other ) const
{
  const bool same = ( *this == other );
  return !same;
}
// Is this an end iterator? That's either ptr==NULL or a ptr that points to
// a NULL entry.
bool Compression::iterator::at_end() const
{
  if ( !ptr )
    return true;

  return !*ptr;
}
// Pre-increment: moves on to the next entry. The CHECK fails, if this is
// already an end iterator.
Compression::iterator& Compression::iterator::operator ++()
{
  CHECK( ptr && *ptr, "Cannot increment the end iterator" );

  ptr += 1;
  return *this;
}
// Returns the compression method that the iterator points to. The CHECK
// fails for an end iterator.
const_sptr<Compression> Compression::iterator::operator *()
{
  CHECK( ptr && *ptr, "Cannot dereference the end iterator" );

  return ptr[0];
}
// Iterator that points to the first registered compression method.
Compression::iterator Compression::begin()
{
  iterator first( compressions );
  return first;
}
// End iterator; it holds a NULL pointer instead of pointing into the table.
Compression::iterator Compression::end()
{
  iterator past_the_end( NULL );
  return past_the_end;
}