From 3ff24945a2bb50a87ba7fc2111f39e6444ad93fc Mon Sep 17 00:00:00 2001 From: Vladimir Stackov Date: Thu, 6 Aug 2015 14:02:55 +0300 Subject: [PATCH 1/4] Added boilerplates for index GC Fixed misspelling: s/indicies/indexes/g --- backup_collector.cc | 11 ----------- backup_collector.hh | 7 ------- backup_exchanger.hh | 2 +- config.cc | 9 +++++---- zbackup.cc | 41 +++++++++++++++++++++++++++++++++++------ zutils.cc | 14 +++++++++----- zutils.hh | 3 ++- 7 files changed, 52 insertions(+), 35 deletions(-) diff --git a/backup_collector.cc b/backup_collector.cc index 147f874..5c99c4d 100644 --- a/backup_collector.cc +++ b/backup_collector.cc @@ -3,17 +3,6 @@ #include "backup_collector.hh" -#include -#include - -#include "bundle.hh" -#include "chunk_index.hh" -#include "backup_restorer.hh" -#include "backup_file.hh" -#include "backup_exchanger.hh" - -#include "debug.hh" - using std::string; void BundleCollector::startIndex( string const & indexFn ) diff --git a/backup_collector.hh b/backup_collector.hh index 96669e7..92b1d6c 100644 --- a/backup_collector.hh +++ b/backup_collector.hh @@ -4,18 +4,11 @@ #ifndef BACKUP_COLLECTOR_HH_INCLUDED #define BACKUP_COLLECTOR_HH_INCLUDED -#include "zbackup_base.hh" -#include "chunk_storage.hh" - #include #include -#include -#include "bundle.hh" -#include "chunk_index.hh" #include "backup_restorer.hh" #include "backup_file.hh" -#include "backup_exchanger.hh" #include "debug.hh" diff --git a/backup_exchanger.hh b/backup_exchanger.hh index 457c48f..d32ae73 100644 --- a/backup_exchanger.hh +++ b/backup_exchanger.hh @@ -17,7 +17,7 @@ using std::pair; enum { backups, bundles, - index, + indexes, Flags }; diff --git a/config.cc b/config.cc index d29195c..cd37f00 100644 --- a/config.cc +++ b/config.cc @@ -119,7 +119,7 @@ void Config::prefillKeywords() "Valid values:\n" "backups - exchange backup instructions (files in backups/ directory)\n" "bundles - exchange bundles with data (files in bunles/ directory)\n" - "index - exchange indicies of chunks (files in index/ directory)\n" + "indexes - exchange indexes of chunks (files in index/ directory)\n" "No default value, you should specify it explicitly" }, @@ -439,12 +439,13 @@ bool Config::parseOrValidate( const string & option, const OptionType type, if ( strcmp( optionValue, "bundles" ) == 0 ) runtime.exchange.set( BackupExchanger::bundles ); else - if ( strcmp( optionValue, "index" ) == 0 ) - runtime.exchange.set( BackupExchanger::index ); + if ( strcmp( optionValue, "indexes" ) == 0 || + strcmp( optionValue, "index" ) == 0 ) + runtime.exchange.set( BackupExchanger::indexes ); else { fprintf( stderr, "Invalid exchange value specified: %s\n" - "Must be one of the following: backups, bundles, index.\n", + "Must be one of the following: backups, bundles, indexes.\n", optionValue ); return false; } diff --git a/zbackup.cc b/zbackup.cc index 04479a2..1180977 100644 --- a/zbackup.cc +++ b/zbackup.cc @@ -172,7 +172,8 @@ invalid_option: " performs import from source to destination storage,\n" " for export/import storage path must be\n" " a valid (initialized) storage\n" -" gc - performs chunk garbage collection\n" +" gc [chunks|indexes] - performs garbage\n" +" collection (default is chunks)\n" " passwd - changes repo info file passphrase\n" //" info - shows repo information\n" " config [show|edit|set|reset] - performs\n" @@ -278,15 +279,43 @@ invalid_option: else if ( strcmp( args[ 0 ], "gc" ) == 0 ) { - // Perform the restore - if ( args.size() != 2 ) + // Perform the garbage collection + if ( args.size() < 2 || args.size() > 3 ) { - fprintf( stderr, "Usage: %s %s \n", + fprintf( stderr, "Usage: %s %s [chunks|indexes] \n", *argv, args[ 0 ] ); return EXIT_FAILURE; } - ZCollector zc( args[ 1 ], passwords[ 0 ], config ); - zc.gc(); + + int fieldStorage = 1; + int fieldAction = 2; + + if ( args.size() == 3 ) + { + fieldStorage = 2; + fieldAction = 1; + } + + if ( args.size() > 2 && strcmp( args[ fieldAction ], "chunks" ) == 0 ) + { + ZCollector zc( ZBackupBase::deriveStorageDirFromBackupsFile( args[ fieldStorage ], true ), + passwords[ 0 ], config ); + zc.gcChunks(); + } + else + if ( args.size() > 2 && strcmp( args[ fieldAction ], "indexes" ) == 0 ) + { + ZCollector zc( ZBackupBase::deriveStorageDirFromBackupsFile( args[ fieldStorage ], true ), + passwords[ 0 ], config ); + fprintf( stderr, "NOT IMPLEMENTED YET!\n" ); + zc.gcIndexes(); + } + else + { + ZCollector zc( ZBackupBase::deriveStorageDirFromBackupsFile( args[ fieldStorage ], true ), + passwords[ 0 ], config ); + zc.gcChunks(); + } } else if ( strcmp( args[ 0 ], "passwd" ) == 0 ) diff --git a/zutils.cc b/zutils.cc index d57a524..9818e86 100644 --- a/zutils.cc +++ b/zutils.cc @@ -207,13 +207,13 @@ void ZExchange::exchange() verbosePrintf( "Bundle exchange completed.\n" ); } - if ( config.runtime.exchange.test( BackupExchanger::index ) ) + if ( config.runtime.exchange.test( BackupExchanger::indexes ) ) { - verbosePrintf( "Searching for indicies...\n" ); - vector< string > indicies = BackupExchanger::findOrRebuild( + verbosePrintf( "Searching for indexes...\n" ); + vector< string > indexes = BackupExchanger::findOrRebuild( srcZBackupBase.getIndexPath(), dstZBackupBase.getIndexPath() ); - for ( std::vector< string >::iterator it = indicies.begin(); it != indicies.end(); ++it ) + for ( std::vector< string >::iterator it = indexes.begin(); it != indexes.end(); ++it ) { verbosePrintf( "Processing index file %s... ", it->c_str() ); string outputFileName ( Dir::addPath( dstZBackupBase.getIndexPath(), *it ) ); @@ -307,7 +307,7 @@ ZCollector::ZCollector( string const & storageDir, string const & password, { } -void ZCollector::gc() +void ZCollector::gcChunks() { ChunkIndex chunkReindex( encryptionkey, tmpMgr, getIndexPath(), true ); @@ -366,3 +366,7 @@ void ZCollector::gc() verbosePrintf( "Garbage collection complete\n" ); } + +void ZCollector::gcIndexes() +{ +} diff --git a/zutils.hh b/zutils.hh index 59d23d4..7201ff2 100644 --- a/zutils.hh +++ b/zutils.hh @@ -55,7 +55,8 @@ public: ZCollector( std::string const & storageDir, std::string const & password, Config & configIn ); - void gc(); + void gcChunks(); + void gcIndexes(); }; #endif From a064d9a1d196ca5a68e7a0dd0823f94c5e578538 Mon Sep 17 00:00:00 2001 From: Vladimir Stackov Date: Wed, 12 Aug 2015 16:27:54 +0300 Subject: [PATCH 2/4] Index pseudo-GC implementation --- backup_collector.cc | 21 ++++++++++++++++----- backup_collector.hh | 4 +++- chunk_index.hh | 2 +- config.cc | 19 +++++++++++++++++++ config.hh | 5 ++++- zbackup.cc | 36 +++++------------------------------- zutils.cc | 7 ++----- zutils.hh | 3 +-- 8 files changed, 51 insertions(+), 46 deletions(-) diff --git a/backup_collector.cc b/backup_collector.cc index 5c99c4d..dcb9fb9 100644 --- a/backup_collector.cc +++ b/backup_collector.cc @@ -7,7 +7,7 @@ using std::string; void BundleCollector::startIndex( string const & indexFn ) { - indexModified = false; + indexModified = indexNecessary = false; indexTotalChunks = indexUsedChunks = 0; indexModifiedBundles = indexKeptBundles = indexRemovedBundles = 0; } @@ -25,6 +25,8 @@ void BundleCollector::finishIndex( string const & indexFn ) else { chunkStorageWriter->reset(); + if ( indexGC && !indexNecessary ) + filesToUnlink.push_back( indexFn ); } } @@ -37,10 +39,18 @@ void BundleCollector::startBundle( Bundle::Id const & bundleId ) void BundleCollector::processChunk( ChunkId const & chunkId ) { + if ( indexGC ) + { + if ( overallChunkSet.find ( chunkId ) == overallChunkSet.end() ) + overallChunkSet.insert( chunkId ); + else + return; + } totalChunks++; if ( usedChunkSet.find( chunkId ) != usedChunkSet.end() ) { usedChunks++; + indexNecessary = true; } } @@ -49,16 +59,16 @@ void BundleCollector::finishBundle( Bundle::Id const & bundleId, BundleInfo cons string i = Bundle::generateFileName( savedId, "", false ); indexTotalChunks += totalChunks; indexUsedChunks += usedChunks; - if ( usedChunks == 0 ) + if ( 0 == usedChunks && 0 != totalChunks ) { - verbosePrintf( "Deleting %s bundle\n", i.c_str() ); + dPrintf( "Deleting %s bundle\n", i.c_str() ); filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) ); indexModified = true; indexRemovedBundles++; } else if ( usedChunks < totalChunks ) { - verbosePrintf( "%s: used %d/%d chunks\n", i.c_str(), usedChunks, totalChunks ); + dPrintf( "%s: used %d/%d chunks\n", i.c_str(), usedChunks, totalChunks ); filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) ); indexModified = true; // Copy used chunks to the new index @@ -79,7 +89,7 @@ void BundleCollector::finishBundle( Bundle::Id const & bundleId, BundleInfo cons else { chunkStorageWriter->addBundle( info, savedId ); - verbosePrintf( "Keeping %s bundle\n", i.c_str() ); + dPrintf( "Keeping %s bundle\n", i.c_str() ); indexKeptBundles++; } } @@ -88,6 +98,7 @@ void BundleCollector::commit() { for ( int i = filesToUnlink.size(); i--; ) { + dPrintf( "Unlinking %s\n", filesToUnlink[i].c_str() ); unlink( filesToUnlink[i].c_str() ); } filesToUnlink.clear(); diff --git a/backup_collector.hh b/backup_collector.hh index 92b1d6c..3d6a50e 100644 --- a/backup_collector.hh +++ b/backup_collector.hh @@ -20,14 +20,16 @@ private: Bundle::Id savedId; int totalChunks, usedChunks, indexTotalChunks, indexUsedChunks; int indexModifiedBundles, indexKeptBundles, indexRemovedBundles; - bool indexModified; + bool indexModified, indexNecessary; vector< string > filesToUnlink; + BackupRestorer::ChunkSet overallChunkSet; public: string bundlesPath; ChunkStorage::Reader *chunkStorageReader; ChunkStorage::Writer *chunkStorageWriter; BackupRestorer::ChunkSet usedChunkSet; + bool indexGC; void startIndex( string const & indexFn ); diff --git a/chunk_index.hh b/chunk_index.hh index bd34a18..09108a4 100644 --- a/chunk_index.hh +++ b/chunk_index.hh @@ -87,7 +87,7 @@ public: DEF_EX( Ex, "Chunk index exception", std::exception ) DEF_EX( exIncorrectChunkIdSize, "Incorrect chunk id size encountered", Ex ) - ChunkIndex( EncryptionKey const &, TmpMgr &, string const & indexPath, bool prohibitChunkIndexLoading ); + ChunkIndex( EncryptionKey const &, TmpMgr &, string const & indexPath, bool ); struct ChunkInfoInterface { diff --git a/config.cc b/config.cc index cd37f00..4454a87 100644 --- a/config.cc +++ b/config.cc @@ -123,6 +123,16 @@ void Config::prefillKeywords() "No default value, you should specify it explicitly" }, + { + "gc-indexes", + Config::oRuntime_gcIndexes, + Config::Runtime, + "Purge duplicated indexes from repo during\n" + "garbage collection\n" + "Normally you would not need this\n" + "No value, specify to enable" + }, + { "", Config::oBadOption, Config::None } }; @@ -456,6 +466,15 @@ bool Config::parseOrValidate( const string & option, const OptionType type, /* NOTREACHED */ break; + case oRuntime_gcIndexes: + runtime.gcIndexes = true; + + dPrintf( "runtime[gcIndexes] = true\n" ); + + return true; + /* NOTREACHED */ + break; + case oBadOption: default: return false; diff --git a/config.hh b/config.hh index 871501d..0003008 100644 --- a/config.hh +++ b/config.hh @@ -29,11 +29,13 @@ public: size_t threads; size_t cacheSize; bitset< BackupExchanger::Flags > exchange; + bool gcIndexes; // Default runtime config RuntimeConfig(): threads( getNumberOfCpus() ), - cacheSize( 40 * 1024 * 1024 ) // 40 MB + cacheSize( 40 * 1024 * 1024 ), // 40 MB + gcIndexes ( false ) { } }; @@ -58,6 +60,7 @@ public: oRuntime_threads, oRuntime_cacheSize, oRuntime_exchange, + oRuntime_gcIndexes, oDeprecated, oUnsupported } OpCodes; diff --git a/zbackup.cc b/zbackup.cc index 1180977..59a7eb2 100644 --- a/zbackup.cc +++ b/zbackup.cc @@ -280,42 +280,16 @@ invalid_option: if ( strcmp( args[ 0 ], "gc" ) == 0 ) { // Perform the garbage collection - if ( args.size() < 2 || args.size() > 3 ) + if ( args.size() != 2 ) { - fprintf( stderr, "Usage: %s %s [chunks|indexes] \n", + fprintf( stderr, "Usage: %s %s \n", *argv, args[ 0 ] ); return EXIT_FAILURE; } - int fieldStorage = 1; - int fieldAction = 2; - - if ( args.size() == 3 ) - { - fieldStorage = 2; - fieldAction = 1; - } - - if ( args.size() > 2 && strcmp( args[ fieldAction ], "chunks" ) == 0 ) - { - ZCollector zc( ZBackupBase::deriveStorageDirFromBackupsFile( args[ fieldStorage ], true ), - passwords[ 0 ], config ); - zc.gcChunks(); - } - else - if ( args.size() > 2 && strcmp( args[ fieldAction ], "indexes" ) == 0 ) - { - ZCollector zc( ZBackupBase::deriveStorageDirFromBackupsFile( args[ fieldStorage ], true ), - passwords[ 0 ], config ); - fprintf( stderr, "NOT IMPLEMENTED YET!\n" ); - zc.gcIndexes(); - } - else - { - ZCollector zc( ZBackupBase::deriveStorageDirFromBackupsFile( args[ fieldStorage ], true ), - passwords[ 0 ], config ); - zc.gcChunks(); - } + ZCollector zc( ZBackupBase::deriveStorageDirFromBackupsFile( args[ 1 ], true ), + passwords[ 0 ], config ); + zc.gc(); } else if ( strcmp( args[ 0 ], "passwd" ) == 0 ) diff --git a/zutils.cc b/zutils.cc index 9818e86..5ed6529 100644 --- a/zutils.cc +++ b/zutils.cc @@ -307,7 +307,7 @@ ZCollector::ZCollector( string const & storageDir, string const & password, { } -void ZCollector::gcChunks() +void ZCollector::gc() { ChunkIndex chunkReindex( encryptionkey, tmpMgr, getIndexPath(), true ); @@ -322,6 +322,7 @@ void ZCollector::gcChunks() collector.bundlesPath = getBundlesPath(); collector.chunkStorageReader = &this->chunkStorageReader; collector.chunkStorageWriter = &chunkStorageWriter; + collector.indexGC = config.runtime.gcIndexes; verbosePrintf( "Checking used chunks...\n" ); @@ -366,7 +367,3 @@ void ZCollector::gcChunks() verbosePrintf( "Garbage collection complete\n" ); } - -void ZCollector::gcIndexes() -{ -} diff --git a/zutils.hh b/zutils.hh index 7201ff2..59d23d4 100644 --- a/zutils.hh +++ b/zutils.hh @@ -55,8 +55,7 @@ public: ZCollector( std::string const & storageDir, std::string const & password, Config & configIn ); - void gcChunks(); - void gcIndexes(); + void gc(); }; #endif From 0a042c4bd2f4b5cdb9dab01f781fe3f1267d3fa6 Mon Sep 17 00:00:00 2001 From: Vladimir Stackov Date: Tue, 18 Aug 2015 17:27:27 +0300 Subject: [PATCH 3/4] Initial implementation of deep GC --- backup_collector.cc | 64 +++++++++++++++++++++++++++------------------ backup_collector.hh | 4 ++- config.cc | 20 +++++++------- config.hh | 6 ++--- zutils.cc | 2 +- 5 files changed, 57 insertions(+), 39 deletions(-) diff --git a/backup_collector.cc b/backup_collector.cc index dcb9fb9..79cc478 100644 --- a/backup_collector.cc +++ b/backup_collector.cc @@ -18,14 +18,15 @@ void BundleCollector::finishIndex( string const & indexFn ) { verbosePrintf( "Chunks used: %d/%d, bundles: %d kept, %d modified, %d removed\n", indexUsedChunks, indexTotalChunks, indexKeptBundles, - indexModifiedBundles, indexRemovedBundles); + indexModifiedBundles, indexRemovedBundles ); filesToUnlink.push_back( indexFn ); commit(); } else { chunkStorageWriter->reset(); - if ( indexGC && !indexNecessary ) + if ( !indexNecessary ) + // this index was a complete copy so we don't need it filesToUnlink.push_back( indexFn ); } } @@ -39,13 +40,11 @@ void BundleCollector::startBundle( Bundle::Id const & bundleId ) void BundleCollector::processChunk( ChunkId const & chunkId ) { - if ( indexGC ) - { - if ( overallChunkSet.find ( chunkId ) == overallChunkSet.end() ) - overallChunkSet.insert( chunkId ); - else - return; - } + if ( overallChunkSet.find ( chunkId ) == overallChunkSet.end() ) + overallChunkSet.insert( chunkId ); + else + return; + totalChunks++; if ( usedChunkSet.find( chunkId ) != usedChunkSet.end() ) { @@ -71,26 +70,41 @@ void BundleCollector::finishBundle( Bundle::Id const & bundleId, BundleInfo cons dPrintf( "%s: used %d/%d chunks\n", i.c_str(), usedChunks, totalChunks ); filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) ); indexModified = true; - // Copy used chunks to the new index - string chunk; - size_t chunkSize; - for ( int x = info.chunk_record_size(); x--; ) - { - BundleInfo_ChunkRecord const & record = info.chunk_record( x ); - ChunkId id( record.id() ); - if ( usedChunkSet.find( id ) != usedChunkSet.end() ) - { - chunkStorageReader->get( id, chunk, chunkSize ); - chunkStorageWriter->add( id, chunk.data(), chunkSize ); - } - } + copyUsedChunks( info ); indexModifiedBundles++; } else { - chunkStorageWriter->addBundle( info, savedId ); - dPrintf( "Keeping %s bundle\n", i.c_str() ); - indexKeptBundles++; + if ( !deepGC ) + { + chunkStorageWriter->addBundle( info, savedId ); + dPrintf( "Keeping %s bundle\n", i.c_str() ); + indexKeptBundles++; + } + else + { + filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) ); + indexModified = true; + copyUsedChunks( info ); + indexModifiedBundles++; + } + } +} + +void BundleCollector::copyUsedChunks( BundleInfo const & info ) +{ + // Copy used chunks to the new index + string chunk; + size_t chunkSize; + for ( int x = info.chunk_record_size(); x--; ) + { + BundleInfo_ChunkRecord const & record = info.chunk_record( x ); + ChunkId id( record.id() ); + if ( usedChunkSet.find( id ) != usedChunkSet.end() ) + { + chunkStorageReader->get( id, chunk, chunkSize ); + chunkStorageWriter->add( id, chunk.data(), chunkSize ); + } } } diff --git a/backup_collector.hh b/backup_collector.hh index 3d6a50e..5e1a93d 100644 --- a/backup_collector.hh +++ b/backup_collector.hh @@ -24,12 +24,14 @@ private: vector< string > filesToUnlink; BackupRestorer::ChunkSet overallChunkSet; + void copyUsedChunks( BundleInfo const & info ); + public: string bundlesPath; ChunkStorage::Reader *chunkStorageReader; ChunkStorage::Writer *chunkStorageWriter; BackupRestorer::ChunkSet usedChunkSet; - bool indexGC; + bool deepGC; void startIndex( string const & indexFn ); diff --git a/config.cc b/config.cc index 4454a87..967d566 100644 --- a/config.cc +++ b/config.cc @@ -124,13 +124,15 @@ void Config::prefillKeywords() }, { - "gc-indexes", - Config::oRuntime_gcIndexes, + "gc-deep", + Config::oRuntime_gcDeep, Config::Runtime, - "Purge duplicated indexes from repo during\n" - "garbage collection\n" - "Normally you would not need this\n" - "No value, specify to enable" + "Perform inter-bundle and inter-index deduplication\n" + "during garbage collection\n" + "You would probably need it after exchange operation\n" + "You could also use this switch to repack all bundles\n" + "Beware that this switch causes very intensive IO!\n" + "This switch is not used by default, specify to enable" }, { "", Config::oBadOption, Config::None } @@ -466,10 +468,10 @@ bool Config::parseOrValidate( const string & option, const OptionType type, /* NOTREACHED */ break; - case oRuntime_gcIndexes: - runtime.gcIndexes = true; + case oRuntime_gcDeep: + runtime.gcDeep = true; - dPrintf( "runtime[gcIndexes] = true\n" ); + dPrintf( "runtime[gcDeep] = true\n" ); return true; /* NOTREACHED */ diff --git a/config.hh b/config.hh index 0003008..31cd5be 100644 --- a/config.hh +++ b/config.hh @@ -29,13 +29,13 @@ public: size_t threads; size_t cacheSize; bitset< BackupExchanger::Flags > exchange; - bool gcIndexes; + bool gcDeep; // Default runtime config RuntimeConfig(): threads( getNumberOfCpus() ), cacheSize( 40 * 1024 * 1024 ), // 40 MB - gcIndexes ( false ) + gcDeep ( false ) { } }; @@ -60,7 +60,7 @@ public: oRuntime_threads, oRuntime_cacheSize, oRuntime_exchange, - oRuntime_gcIndexes, + oRuntime_gcDeep, oDeprecated, oUnsupported } OpCodes; diff --git a/zutils.cc b/zutils.cc index 5ed6529..a31bad1 100644 --- a/zutils.cc +++ b/zutils.cc @@ -322,7 +322,7 @@ void ZCollector::gc() collector.bundlesPath = getBundlesPath(); collector.chunkStorageReader = &this->chunkStorageReader; collector.chunkStorageWriter = &chunkStorageWriter; - collector.indexGC = config.runtime.gcIndexes; + collector.deepGC = config.runtime.gcDeep; verbosePrintf( "Checking used chunks...\n" ); From ff13dd72ad8adf8f1e3bb8efa9dea30be772387f Mon Sep 17 00:00:00 2001 From: Vladimir Stackov Date: Tue, 18 Aug 2015 19:02:12 +0300 Subject: [PATCH 4/4] GC collects duplicates among all repo data --- backup_collector.cc | 41 +++++++++++++++++++++++++++++++---------- backup_collector.hh | 3 ++- bundle.hh | 2 ++ config.cc | 30 ++++++++++++++---------------- config.hh | 6 +++--- zutils.cc | 7 +++---- 6 files changed, 55 insertions(+), 34 deletions(-) diff --git a/backup_collector.cc b/backup_collector.cc index 79cc478..0a2ee08 100644 --- a/backup_collector.cc +++ b/backup_collector.cc @@ -14,11 +14,11 @@ void BundleCollector::startIndex( string const & indexFn ) void BundleCollector::finishIndex( string const & indexFn ) { + verbosePrintf( "Chunks used: %d/%d, bundles: %d kept, %d modified, %d removed\n", + indexUsedChunks, indexTotalChunks, indexKeptBundles, + indexModifiedBundles, indexRemovedBundles ); if ( indexModified ) { - verbosePrintf( "Chunks used: %d/%d, bundles: %d kept, %d modified, %d removed\n", - indexUsedChunks, indexTotalChunks, indexKeptBundles, - indexModifiedBundles, indexRemovedBundles ); filesToUnlink.push_back( indexFn ); commit(); } @@ -75,19 +75,40 @@ void BundleCollector::finishBundle( Bundle::Id const & bundleId, BundleInfo cons } else { - if ( !deepGC ) - { - chunkStorageWriter->addBundle( info, savedId ); - dPrintf( "Keeping %s bundle\n", i.c_str() ); - indexKeptBundles++; - } - else + if ( gcRepack ) { filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) ); indexModified = true; copyUsedChunks( info ); indexModifiedBundles++; } + else + { + if ( 0 == totalChunks ) + { + if ( overallBundleSet.find ( bundleId ) == overallBundleSet.end() ) + { + overallBundleSet.insert( bundleId ); + dPrintf( "Deleting %s bundle\n", i.c_str() ); + filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) ); + indexModified = true; + indexRemovedBundles++; + } + else + { + // trigger index update + indexModified = true; + } + } + else + { + if ( overallBundleSet.find ( bundleId ) == overallBundleSet.end() ) + overallBundleSet.insert( bundleId ); + chunkStorageWriter->addBundle( info, savedId ); + dPrintf( "Keeping %s bundle\n", i.c_str() ); + indexKeptBundles++; + } + } } } diff --git a/backup_collector.hh b/backup_collector.hh index 5e1a93d..8144051 100644 --- a/backup_collector.hh +++ b/backup_collector.hh @@ -23,6 +23,7 @@ private: bool indexModified, indexNecessary; vector< string > filesToUnlink; BackupRestorer::ChunkSet overallChunkSet; + std::set< Bundle::Id > overallBundleSet; void copyUsedChunks( BundleInfo const & info ); @@ -31,7 +32,7 @@ public: ChunkStorage::Reader *chunkStorageReader; ChunkStorage::Writer *chunkStorageWriter; BackupRestorer::ChunkSet usedChunkSet; - bool deepGC; + bool gcRepack; void startIndex( string const & indexFn ); diff --git a/bundle.hh b/bundle.hh index 709a126..f800774 100644 --- a/bundle.hh +++ b/bundle.hh @@ -42,6 +42,8 @@ struct Id { return memcmp( blob, other.blob, sizeof( blob ) ) == 0; } bool operator != ( Id const & other ) const { return ! operator == ( other ); } + bool operator < ( Id const & other ) const + { return memcmp( blob, other.blob, sizeof( blob ) ) < 0; } }; STATIC_ASSERT( sizeof( Id ) == IdSize ); diff --git a/config.cc b/config.cc index 967d566..cdeba8c 100644 --- a/config.cc +++ b/config.cc @@ -104,8 +104,8 @@ void Config::prefillKeywords() "cache-size", Config::oRuntime_cacheSize, Config::Runtime, - "Cache size to use in restore process\n" - "Affects restore process speed directly\n" + "Cache size to use in restore process.\n" + "Affects restore process speed directly.\n" VALID_SUFFIXES "Default is %sMiB", Utils::numberToString( runtime.cacheSize / 1024 / 1024 ) @@ -114,25 +114,23 @@ void Config::prefillKeywords() "exchange", Config::oRuntime_exchange, Config::Runtime, - "Data to exchange between repositories in import/export process\n" - "Can be specified multiple times\n" + "Data to exchange between repositories in import/export process.\n" + "Can be specified multiple times.\n" "Valid values:\n" "backups - exchange backup instructions (files in backups/ directory)\n" "bundles - exchange bundles with data (files in bunles/ directory)\n" "indexes - exchange indexes of chunks (files in index/ directory)\n" - "No default value, you should specify it explicitly" + "No default value, you should specify it explicitly." }, { - "gc-deep", - Config::oRuntime_gcDeep, + "gc-repack", + Config::oRuntime_gcRepack, Config::Runtime, - "Perform inter-bundle and inter-index deduplication\n" - "during garbage collection\n" - "You would probably need it after exchange operation\n" - "You could also use this switch to repack all bundles\n" - "Beware that this switch causes very intensive IO!\n" - "This switch is not used by default, specify to enable" + "Repack indexes and bundles during garbage collection.\n" + "Normally you would not need this.\n" + "Beware that this options causes very intensive IO!\n" + "Not default, you should specify it explicitly." }, { "", Config::oBadOption, Config::None } @@ -468,10 +466,10 @@ bool Config::parseOrValidate( const string & option, const OptionType type, /* NOTREACHED */ break; - case oRuntime_gcDeep: - runtime.gcDeep = true; + case oRuntime_gcRepack: + runtime.gcRepack = true; - dPrintf( "runtime[gcDeep] = true\n" ); + dPrintf( "runtime[gcRepack] = true\n" ); return true; /* NOTREACHED */ diff --git a/config.hh b/config.hh index 31cd5be..a721055 100644 --- a/config.hh +++ b/config.hh @@ -29,13 +29,13 @@ public: size_t threads; size_t cacheSize; bitset< BackupExchanger::Flags > exchange; - bool gcDeep; + bool gcRepack; // Default runtime config RuntimeConfig(): threads( getNumberOfCpus() ), cacheSize( 40 * 1024 * 1024 ), // 40 MB - gcDeep ( false ) + gcRepack ( false ) { } }; @@ -60,7 +60,7 @@ public: oRuntime_threads, oRuntime_cacheSize, oRuntime_exchange, - oRuntime_gcDeep, + oRuntime_gcRepack, oDeprecated, oUnsupported } OpCodes; diff --git a/zutils.cc b/zutils.cc index a31bad1..fbffce5 100644 --- a/zutils.cc +++ b/zutils.cc @@ -316,15 +316,13 @@ void ZCollector::gc() string fileName; - Dir::Entry entry; - BundleCollector collector; collector.bundlesPath = getBundlesPath(); collector.chunkStorageReader = &this->chunkStorageReader; collector.chunkStorageWriter = &chunkStorageWriter; - collector.deepGC = config.runtime.gcDeep; + collector.gcRepack = config.runtime.gcRepack; - verbosePrintf( "Checking used chunks...\n" ); + verbosePrintf( "Performing garbage collection...\n" ); verbosePrintf( "Searching for backups...\n" ); vector< string > backups = BackupExchanger::findOrRebuild( getBackupsPath() ); @@ -356,6 +354,7 @@ void ZCollector::gc() string bundlesPath = getBundlesPath(); Dir::Listing bundleLst( bundlesPath ); + Dir::Entry entry; while( bundleLst.getNext( entry ) ) { const string dirPath = Dir::addPath( bundlesPath, entry.getFileName());