GC collects duplicates among all repo data

master
Vladimir Stackov 2015-08-18 19:02:12 +03:00
parent 0a042c4bd2
commit ff13dd72ad
6 changed files with 55 additions and 34 deletions

View File

@ -14,11 +14,11 @@ void BundleCollector::startIndex( string const & indexFn )
void BundleCollector::finishIndex( string const & indexFn )
{
verbosePrintf( "Chunks used: %d/%d, bundles: %d kept, %d modified, %d removed\n",
indexUsedChunks, indexTotalChunks, indexKeptBundles,
indexModifiedBundles, indexRemovedBundles );
if ( indexModified )
{
verbosePrintf( "Chunks used: %d/%d, bundles: %d kept, %d modified, %d removed\n",
indexUsedChunks, indexTotalChunks, indexKeptBundles,
indexModifiedBundles, indexRemovedBundles );
filesToUnlink.push_back( indexFn );
commit();
}
@ -75,19 +75,40 @@ void BundleCollector::finishBundle( Bundle::Id const & bundleId, BundleInfo cons
}
else
{
if ( !deepGC )
{
chunkStorageWriter->addBundle( info, savedId );
dPrintf( "Keeping %s bundle\n", i.c_str() );
indexKeptBundles++;
}
else
if ( gcRepack )
{
filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) );
indexModified = true;
copyUsedChunks( info );
indexModifiedBundles++;
}
else
{
if ( 0 == totalChunks )
{
if ( overallBundleSet.find ( bundleId ) == overallBundleSet.end() )
{
overallBundleSet.insert( bundleId );
dPrintf( "Deleting %s bundle\n", i.c_str() );
filesToUnlink.push_back( Dir::addPath( bundlesPath, i ) );
indexModified = true;
indexRemovedBundles++;
}
else
{
// trigger index update
indexModified = true;
}
}
else
{
if ( overallBundleSet.find ( bundleId ) == overallBundleSet.end() )
overallBundleSet.insert( bundleId );
chunkStorageWriter->addBundle( info, savedId );
dPrintf( "Keeping %s bundle\n", i.c_str() );
indexKeptBundles++;
}
}
}
}

View File

@ -23,6 +23,7 @@ private:
bool indexModified, indexNecessary;
vector< string > filesToUnlink;
BackupRestorer::ChunkSet overallChunkSet;
std::set< Bundle::Id > overallBundleSet;
void copyUsedChunks( BundleInfo const & info );
@ -31,7 +32,7 @@ public:
ChunkStorage::Reader *chunkStorageReader;
ChunkStorage::Writer *chunkStorageWriter;
BackupRestorer::ChunkSet usedChunkSet;
bool deepGC;
bool gcRepack;
void startIndex( string const & indexFn );

View File

@ -42,6 +42,8 @@ struct Id
{ return memcmp( blob, other.blob, sizeof( blob ) ) == 0; }
/// Inequality: true exactly when operator == reports the two ids as different.
bool operator != ( Id const & other ) const
{
  bool const same = operator == ( other );
  return !same;
}
/// Ordering: true when this id's blob compares bytewise lower than other's
/// (memcmp over the whole blob), giving a consistent order for sorted containers.
bool operator < ( Id const & other ) const
{
  int const order = memcmp( blob, other.blob, sizeof( blob ) );
  return order < 0;
}
};
STATIC_ASSERT( sizeof( Id ) == IdSize );

View File

@ -104,8 +104,8 @@ void Config::prefillKeywords()
"cache-size",
Config::oRuntime_cacheSize,
Config::Runtime,
"Cache size to use in restore process\n"
"Affects restore process speed directly\n"
"Cache size to use in restore process.\n"
"Affects restore process speed directly.\n"
VALID_SUFFIXES
"Default is %sMiB",
Utils::numberToString( runtime.cacheSize / 1024 / 1024 )
@ -114,25 +114,23 @@ void Config::prefillKeywords()
"exchange",
Config::oRuntime_exchange,
Config::Runtime,
"Data to exchange between repositories in import/export process\n"
"Can be specified multiple times\n"
"Data to exchange between repositories in import/export process.\n"
"Can be specified multiple times.\n"
"Valid values:\n"
"backups - exchange backup instructions (files in backups/ directory)\n"
"bundles - exchange bundles with data (files in bunles/ directory)\n"
"indexes - exchange indexes of chunks (files in index/ directory)\n"
"No default value, you should specify it explicitly"
"No default value, you should specify it explicitly."
},
{
"gc-deep",
Config::oRuntime_gcDeep,
"gc-repack",
Config::oRuntime_gcRepack,
Config::Runtime,
"Perform inter-bundle and inter-index deduplication\n"
"during garbage collection\n"
"You would probably need it after exchange operation\n"
"You could also use this switch to repack all bundles\n"
"Beware that this switch causes very intensive IO!\n"
"This switch is not used by default, specify to enable"
"Repack indexes and bundles during garbage collection.\n"
"Normally you would not need this.\n"
"Beware that this options causes very intensive IO!\n"
"Not default, you should specify it explicitly."
},
{ "", Config::oBadOption, Config::None }
@ -468,10 +466,10 @@ bool Config::parseOrValidate( const string & option, const OptionType type,
/* NOTREACHED */
break;
case oRuntime_gcDeep:
runtime.gcDeep = true;
case oRuntime_gcRepack:
runtime.gcRepack = true;
dPrintf( "runtime[gcDeep] = true\n" );
dPrintf( "runtime[gcRepack] = true\n" );
return true;
/* NOTREACHED */

View File

@ -29,13 +29,13 @@ public:
size_t threads;
size_t cacheSize;
bitset< BackupExchanger::Flags > exchange;
bool gcDeep;
bool gcRepack;
// Default runtime config
RuntimeConfig():
threads( getNumberOfCpus() ),
cacheSize( 40 * 1024 * 1024 ), // 40 MB
gcDeep ( false )
gcRepack ( false )
{
}
};
@ -60,7 +60,7 @@ public:
oRuntime_threads,
oRuntime_cacheSize,
oRuntime_exchange,
oRuntime_gcDeep,
oRuntime_gcRepack,
oDeprecated, oUnsupported
} OpCodes;

View File

@ -316,15 +316,13 @@ void ZCollector::gc()
string fileName;
Dir::Entry entry;
BundleCollector collector;
collector.bundlesPath = getBundlesPath();
collector.chunkStorageReader = &this->chunkStorageReader;
collector.chunkStorageWriter = &chunkStorageWriter;
collector.deepGC = config.runtime.gcDeep;
collector.gcRepack = config.runtime.gcRepack;
verbosePrintf( "Checking used chunks...\n" );
verbosePrintf( "Performing garbage collection...\n" );
verbosePrintf( "Searching for backups...\n" );
vector< string > backups = BackupExchanger::findOrRebuild( getBackupsPath() );
@ -356,6 +354,7 @@ void ZCollector::gc()
string bundlesPath = getBundlesPath();
Dir::Listing bundleLst( bundlesPath );
Dir::Entry entry;
while( bundleLst.getNext( entry ) )
{
const string dirPath = Dir::addPath( bundlesPath, entry.getFileName());