GC collects duplicates among all repo data

master
Vladimir Stackov 2015-08-18 19:02:12 +03:00
parent 0a042c4bd2
commit ff13dd72ad
6 changed files with 55 additions and 34 deletions

View File

@ -14,11 +14,11 @@ void BundleCollector::startIndex( string const & indexFn )
void BundleCollector::finishIndex( string const & indexFn ) void BundleCollector::finishIndex( string const & indexFn )
{ {
verbosePrintf( "Chunks used: %d/%d, bundles: %d kept, %d modified, %d removed\n",
indexUsedChunks, indexTotalChunks, indexKeptBundles,
indexModifiedBundles, indexRemovedBundles );
if ( indexModified ) if ( indexModified )
{ {
verbosePrintf( "Chunks used: %d/%d, bundles: %d kept, %d modified, %d removed\n",
indexUsedChunks, indexTotalChunks, indexKeptBundles,
indexModifiedBundles, indexRemovedBundles );
filesToUnlink.push_back( indexFn ); filesToUnlink.push_back( indexFn );
commit(); commit();
} }
@ -75,19 +75,40 @@ void BundleCollector::finishBundle( Bundle::Id const & bundleId, BundleInfo cons
} }
else else
{ {
if ( !deepGC ) if ( gcRepack )
{
chunkStorageWriter->addBundle( info, savedId );
dPrintf( "Keeping %s bundle\n", i.c_str() );
indexKeptBundles++;
}
else
{ {
filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) ); filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) );
indexModified = true; indexModified = true;
copyUsedChunks( info ); copyUsedChunks( info );
indexModifiedBundles++; indexModifiedBundles++;
} }
else
{
if ( 0 == totalChunks )
{
if ( overallBundleSet.find ( bundleId ) == overallBundleSet.end() )
{
overallBundleSet.insert( bundleId );
dPrintf( "Deleting %s bundle\n", i.c_str() );
filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) );
indexModified = true;
indexRemovedBundles++;
}
else
{
// trigger index update
indexModified = true;
}
}
else
{
if ( overallBundleSet.find ( bundleId ) == overallBundleSet.end() )
overallBundleSet.insert( bundleId );
chunkStorageWriter->addBundle( info, savedId );
dPrintf( "Keeping %s bundle\n", i.c_str() );
indexKeptBundles++;
}
}
} }
} }

View File

@ -23,6 +23,7 @@ private:
bool indexModified, indexNecessary; bool indexModified, indexNecessary;
vector< string > filesToUnlink; vector< string > filesToUnlink;
BackupRestorer::ChunkSet overallChunkSet; BackupRestorer::ChunkSet overallChunkSet;
std::set< Bundle::Id > overallBundleSet;
void copyUsedChunks( BundleInfo const & info ); void copyUsedChunks( BundleInfo const & info );
@ -31,7 +32,7 @@ public:
ChunkStorage::Reader *chunkStorageReader; ChunkStorage::Reader *chunkStorageReader;
ChunkStorage::Writer *chunkStorageWriter; ChunkStorage::Writer *chunkStorageWriter;
BackupRestorer::ChunkSet usedChunkSet; BackupRestorer::ChunkSet usedChunkSet;
bool deepGC; bool gcRepack;
void startIndex( string const & indexFn ); void startIndex( string const & indexFn );

View File

@ -42,6 +42,8 @@ struct Id
{ return memcmp( blob, other.blob, sizeof( blob ) ) == 0; } { return memcmp( blob, other.blob, sizeof( blob ) ) == 0; }
bool operator != ( Id const & other ) const bool operator != ( Id const & other ) const
{ return ! operator == ( other ); } { return ! operator == ( other ); }
// Less-than comparison: true when memcmp orders this Id's blob before
// other's blob, comparing sizeof( blob ) bytes.
// NOTE(review): presumably added so Id can serve as a key in ordered
// containers such as std::set -- confirm against callers.
bool operator < ( Id const & other ) const
{ return memcmp( blob, other.blob, sizeof( blob ) ) < 0; }
}; };
STATIC_ASSERT( sizeof( Id ) == IdSize ); STATIC_ASSERT( sizeof( Id ) == IdSize );

View File

@ -104,8 +104,8 @@ void Config::prefillKeywords()
"cache-size", "cache-size",
Config::oRuntime_cacheSize, Config::oRuntime_cacheSize,
Config::Runtime, Config::Runtime,
"Cache size to use in restore process\n" "Cache size to use in restore process.\n"
"Affects restore process speed directly\n" "Affects restore process speed directly.\n"
VALID_SUFFIXES VALID_SUFFIXES
"Default is %sMiB", "Default is %sMiB",
Utils::numberToString( runtime.cacheSize / 1024 / 1024 ) Utils::numberToString( runtime.cacheSize / 1024 / 1024 )
@ -114,25 +114,23 @@ void Config::prefillKeywords()
"exchange", "exchange",
Config::oRuntime_exchange, Config::oRuntime_exchange,
Config::Runtime, Config::Runtime,
"Data to exchange between repositories in import/export process\n" "Data to exchange between repositories in import/export process.\n"
"Can be specified multiple times\n" "Can be specified multiple times.\n"
"Valid values:\n" "Valid values:\n"
"backups - exchange backup instructions (files in backups/ directory)\n" "backups - exchange backup instructions (files in backups/ directory)\n"
"bundles - exchange bundles with data (files in bunles/ directory)\n" "bundles - exchange bundles with data (files in bunles/ directory)\n"
"indexes - exchange indexes of chunks (files in index/ directory)\n" "indexes - exchange indexes of chunks (files in index/ directory)\n"
"No default value, you should specify it explicitly" "No default value, you should specify it explicitly."
}, },
{ {
"gc-deep", "gc-repack",
Config::oRuntime_gcDeep, Config::oRuntime_gcRepack,
Config::Runtime, Config::Runtime,
"Perform inter-bundle and inter-index deduplication\n" "Repack indexes and bundles during garbage collection.\n"
"during garbage collection\n" "Normally you would not need this.\n"
"You would probably need it after exchange operation\n" "Beware that this options causes very intensive IO!\n"
"You could also use this switch to repack all bundles\n" "Not default, you should specify it explicitly."
"Beware that this switch causes very intensive IO!\n"
"This switch is not used by default, specify to enable"
}, },
{ "", Config::oBadOption, Config::None } { "", Config::oBadOption, Config::None }
@ -468,10 +466,10 @@ bool Config::parseOrValidate( const string & option, const OptionType type,
/* NOTREACHED */ /* NOTREACHED */
break; break;
case oRuntime_gcDeep: case oRuntime_gcRepack:
runtime.gcDeep = true; runtime.gcRepack = true;
dPrintf( "runtime[gcDeep] = true\n" ); dPrintf( "runtime[gcRepack] = true\n" );
return true; return true;
/* NOTREACHED */ /* NOTREACHED */

View File

@ -29,13 +29,13 @@ public:
size_t threads; size_t threads;
size_t cacheSize; size_t cacheSize;
bitset< BackupExchanger::Flags > exchange; bitset< BackupExchanger::Flags > exchange;
bool gcDeep; bool gcRepack;
// Default runtime config // Default runtime config
RuntimeConfig(): RuntimeConfig():
threads( getNumberOfCpus() ), threads( getNumberOfCpus() ),
cacheSize( 40 * 1024 * 1024 ), // 40 MB cacheSize( 40 * 1024 * 1024 ), // 40 MB
gcDeep ( false ) gcRepack ( false )
{ {
} }
}; };
@ -60,7 +60,7 @@ public:
oRuntime_threads, oRuntime_threads,
oRuntime_cacheSize, oRuntime_cacheSize,
oRuntime_exchange, oRuntime_exchange,
oRuntime_gcDeep, oRuntime_gcRepack,
oDeprecated, oUnsupported oDeprecated, oUnsupported
} OpCodes; } OpCodes;

View File

@ -316,15 +316,13 @@ void ZCollector::gc()
string fileName; string fileName;
Dir::Entry entry;
BundleCollector collector; BundleCollector collector;
collector.bundlesPath = getBundlesPath(); collector.bundlesPath = getBundlesPath();
collector.chunkStorageReader = &this->chunkStorageReader; collector.chunkStorageReader = &this->chunkStorageReader;
collector.chunkStorageWriter = &chunkStorageWriter; collector.chunkStorageWriter = &chunkStorageWriter;
collector.deepGC = config.runtime.gcDeep; collector.gcRepack = config.runtime.gcRepack;
verbosePrintf( "Checking used chunks...\n" ); verbosePrintf( "Performing garbage collection...\n" );
verbosePrintf( "Searching for backups...\n" ); verbosePrintf( "Searching for backups...\n" );
vector< string > backups = BackupExchanger::findOrRebuild( getBackupsPath() ); vector< string > backups = BackupExchanger::findOrRebuild( getBackupsPath() );
@ -356,6 +354,7 @@ void ZCollector::gc()
string bundlesPath = getBundlesPath(); string bundlesPath = getBundlesPath();
Dir::Listing bundleLst( bundlesPath ); Dir::Listing bundleLst( bundlesPath );
Dir::Entry entry;
while( bundleLst.getNext( entry ) ) while( bundleLst.getNext( entry ) )
{ {
const string dirPath = Dir::addPath( bundlesPath, entry.getFileName()); const string dirPath = Dir::addPath( bundlesPath, entry.getFileName());