From 0a042c4bd2f4b5cdb9dab01f781fe3f1267d3fa6 Mon Sep 17 00:00:00 2001 From: Vladimir Stackov Date: Tue, 18 Aug 2015 17:27:27 +0300 Subject: [PATCH] Initial implementation of deep GC --- backup_collector.cc | 64 +++++++++++++++++++++++++++------------------ backup_collector.hh | 4 ++- config.cc | 20 +++++++------- config.hh | 6 ++--- zutils.cc | 2 +- 5 files changed, 57 insertions(+), 39 deletions(-) diff --git a/backup_collector.cc b/backup_collector.cc index dcb9fb9..79cc478 100644 --- a/backup_collector.cc +++ b/backup_collector.cc @@ -18,14 +18,15 @@ void BundleCollector::finishIndex( string const & indexFn ) { verbosePrintf( "Chunks used: %d/%d, bundles: %d kept, %d modified, %d removed\n", indexUsedChunks, indexTotalChunks, indexKeptBundles, - indexModifiedBundles, indexRemovedBundles); + indexModifiedBundles, indexRemovedBundles ); filesToUnlink.push_back( indexFn ); commit(); } else { chunkStorageWriter->reset(); - if ( indexGC && !indexNecessary ) + if ( !indexNecessary ) + // this index was a complete copy so we don't need it filesToUnlink.push_back( indexFn ); } } @@ -39,13 +40,11 @@ void BundleCollector::startBundle( Bundle::Id const & bundleId ) void BundleCollector::processChunk( ChunkId const & chunkId ) { - if ( indexGC ) - { - if ( overallChunkSet.find ( chunkId ) == overallChunkSet.end() ) - overallChunkSet.insert( chunkId ); - else - return; - } + if ( overallChunkSet.find ( chunkId ) == overallChunkSet.end() ) + overallChunkSet.insert( chunkId ); + else + return; + totalChunks++; if ( usedChunkSet.find( chunkId ) != usedChunkSet.end() ) { @@ -71,26 +70,41 @@ void BundleCollector::finishBundle( Bundle::Id const & bundleId, BundleInfo cons dPrintf( "%s: used %d/%d chunks\n", i.c_str(), usedChunks, totalChunks ); filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) ); indexModified = true; - // Copy used chunks to the new index - string chunk; - size_t chunkSize; - for ( int x = info.chunk_record_size(); x--; ) - { - BundleInfo_ChunkRecord const & record = info.chunk_record( x ); - ChunkId id( record.id() ); - if ( usedChunkSet.find( id ) != usedChunkSet.end() ) - { - chunkStorageReader->get( id, chunk, chunkSize ); - chunkStorageWriter->add( id, chunk.data(), chunkSize ); - } - } + copyUsedChunks( info ); indexModifiedBundles++; } else { - chunkStorageWriter->addBundle( info, savedId ); - dPrintf( "Keeping %s bundle\n", i.c_str() ); - indexKeptBundles++; + if ( !deepGC ) + { + chunkStorageWriter->addBundle( info, savedId ); + dPrintf( "Keeping %s bundle\n", i.c_str() ); + indexKeptBundles++; + } + else + { + filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) ); + indexModified = true; + copyUsedChunks( info ); + indexModifiedBundles++; + } + } +} + +void BundleCollector::copyUsedChunks( BundleInfo const & info ) +{ + // Copy used chunks to the new index + string chunk; + size_t chunkSize; + for ( int x = info.chunk_record_size(); x--; ) + { + BundleInfo_ChunkRecord const & record = info.chunk_record( x ); + ChunkId id( record.id() ); + if ( usedChunkSet.find( id ) != usedChunkSet.end() ) + { + chunkStorageReader->get( id, chunk, chunkSize ); + chunkStorageWriter->add( id, chunk.data(), chunkSize ); + } } } diff --git a/backup_collector.hh b/backup_collector.hh index 3d6a50e..5e1a93d 100644 --- a/backup_collector.hh +++ b/backup_collector.hh @@ -24,12 +24,14 @@ private: vector< string > filesToUnlink; BackupRestorer::ChunkSet overallChunkSet; + void copyUsedChunks( BundleInfo const & info ); + public: string bundlesPath; ChunkStorage::Reader *chunkStorageReader; ChunkStorage::Writer *chunkStorageWriter; BackupRestorer::ChunkSet usedChunkSet; - bool indexGC; + bool deepGC; void startIndex( string const & indexFn ); diff --git a/config.cc b/config.cc index 4454a87..967d566 100644 --- a/config.cc +++ b/config.cc @@ -124,13 +124,15 @@ void Config::prefillKeywords() }, { - "gc-indexes", - Config::oRuntime_gcIndexes, + "gc-deep", + Config::oRuntime_gcDeep, Config::Runtime, - "Purge duplicated indexes from repo during\n" - "garbage collection\n" - "Normally you would not need this\n" - "No value, specify to enable" + "Perform inter-bundle and inter-index deduplication\n" + "during garbage collection\n" + "You would probably need it after exchange operation\n" + "You could also use this switch to repack all bundles\n" + "Beware that this switch causes very intensive IO!\n" + "This switch is not used by default, specify to enable" }, { "", Config::oBadOption, Config::None } @@ -466,10 +468,10 @@ bool Config::parseOrValidate( const string & option, const OptionType type, /* NOTREACHED */ break; - case oRuntime_gcIndexes: - runtime.gcIndexes = true; + case oRuntime_gcDeep: + runtime.gcDeep = true; - dPrintf( "runtime[gcIndexes] = true\n" ); + dPrintf( "runtime[gcDeep] = true\n" ); return true; /* NOTREACHED */ diff --git a/config.hh b/config.hh index 0003008..31cd5be 100644 --- a/config.hh +++ b/config.hh @@ -29,13 +29,13 @@ public: size_t threads; size_t cacheSize; bitset< BackupExchanger::Flags > exchange; - bool gcIndexes; + bool gcDeep; // Default runtime config RuntimeConfig(): threads( getNumberOfCpus() ), cacheSize( 40 * 1024 * 1024 ), // 40 MB - gcIndexes ( false ) + gcDeep ( false ) { } }; @@ -60,7 +60,7 @@ public: oRuntime_threads, oRuntime_cacheSize, oRuntime_exchange, - oRuntime_gcIndexes, + oRuntime_gcDeep, oDeprecated, oUnsupported } OpCodes; diff --git a/zutils.cc b/zutils.cc index 5ed6529..a31bad1 100644 --- a/zutils.cc +++ b/zutils.cc @@ -322,7 +322,7 @@ void ZCollector::gc() collector.bundlesPath = getBundlesPath(); collector.chunkStorageReader = &this->chunkStorageReader; collector.chunkStorageWriter = &chunkStorageWriter; - collector.indexGC = config.runtime.gcIndexes; + collector.deepGC = config.runtime.gcDeep; verbosePrintf( "Checking used chunks...\n" );