Check MD5 only when it is needed (fixes #101)

This makes grive check MD5 sums only when:
1) a local rename is suspected (there is both a new file and a deleted file of the same size)
2) the local ctime has changed, but the file size hasn't
pull/126/head
Vitaliy Filippov 2016-09-28 01:19:13 +03:00
parent 59d02a65cb
commit dd77c99872
11 changed files with 128 additions and 55 deletions

View File

@ -89,6 +89,8 @@ Grive uses cmake to build. Basic install sequence is
- ignore regexp does not persist anymore (note that Grive will still track it to not - ignore regexp does not persist anymore (note that Grive will still track it to not
accidentally delete remote files when changing ignore regexp) accidentally delete remote files when changing ignore regexp)
- added options to limit upload and download speed - added options to limit upload and download speed
- faster upload of new and changed files. Grive now uploads files without first calculating
the MD5 checksum when a file is created locally or when its size changes.
### Grive2 v0.5 ### Grive2 v0.5

View File

@ -36,7 +36,8 @@ Entry::Entry( ) :
m_is_dir ( true ), m_is_dir ( true ),
m_resource_id ( "folder:root" ), m_resource_id ( "folder:root" ),
m_change_stamp ( -1 ), m_change_stamp ( -1 ),
m_is_removed ( false ) m_is_removed ( false ),
m_size ( 0 )
{ {
} }
@ -65,6 +66,11 @@ std::string Entry::MD5() const
return m_md5 ; return m_md5 ;
} }
/// Returns the size stored in this entry (m_size). The default constructor
/// initializes it to 0.
u64_t Entry::Size() const
{
return m_size ;
}
DateTime Entry::MTime() const DateTime Entry::MTime() const
{ {
return m_mtime ; return m_mtime ;

View File

@ -19,6 +19,7 @@
#pragma once #pragma once
#include "util/Types.hh"
#include "util/DateTime.hh" #include "util/DateTime.hh"
#include "util/FileSystem.hh" #include "util/FileSystem.hh"
@ -44,6 +45,7 @@ public :
bool IsDir() const ; bool IsDir() const ;
std::string MD5() const ; std::string MD5() const ;
DateTime MTime() const ; DateTime MTime() const ;
u64_t Size() const ;
std::string Name() const ; std::string Name() const ;
@ -80,6 +82,7 @@ protected :
DateTime m_mtime ; DateTime m_mtime ;
bool m_is_removed ; bool m_is_removed ;
u64_t m_size ;
} ; } ;
} // end of namespace gr } // end of namespace gr

View File

@ -47,6 +47,7 @@ namespace gr {
Resource::Resource( const fs::path& root_folder ) : Resource::Resource( const fs::path& root_folder ) :
m_name ( root_folder.string() ), m_name ( root_folder.string() ),
m_kind ( "folder" ), m_kind ( "folder" ),
m_size ( 0 ),
m_id ( "folder:root" ), m_id ( "folder:root" ),
m_href ( "root" ), m_href ( "root" ),
m_is_editable( true ), m_is_editable( true ),
@ -60,6 +61,7 @@ Resource::Resource( const fs::path& root_folder ) :
Resource::Resource( const std::string& name, const std::string& kind ) : Resource::Resource( const std::string& name, const std::string& kind ) :
m_name ( name ), m_name ( name ),
m_kind ( kind ), m_kind ( kind ),
m_size ( 0 ),
m_is_editable( true ), m_is_editable( true ),
m_parent ( 0 ), m_parent ( 0 ),
m_state ( unknown ), m_state ( unknown ),
@ -149,6 +151,7 @@ void Resource::AssignIDs( const Entry& remote )
m_content = remote.ContentSrc() ; m_content = remote.ContentSrc() ;
m_is_editable = remote.IsEditable() ; m_is_editable = remote.IsEditable() ;
m_etag = remote.ETag() ; m_etag = remote.ETag() ;
m_md5 = remote.MD5() ;
} }
} }
@ -193,7 +196,7 @@ void Resource::FromRemoteFile( const Entry& remote )
m_state = local_deleted ; m_state = local_deleted ;
} }
} }
// remote checksum unknown, assume the file is not changed in remote // remote checksum unknown, assume the file is not changed in remote
else if ( remote.MD5().empty() ) else if ( remote.MD5().empty() )
{ {
@ -201,16 +204,9 @@ void Resource::FromRemoteFile( const Entry& remote )
Path(), log::verbose ) ; Path(), log::verbose ) ;
m_state = sync ; m_state = sync ;
} }
// if checksum is equal, no need to compare the mtime
else if ( remote.MD5() == m_md5 )
{
Log( "file %1% is already in sync", Path(), log::verbose ) ;
m_state = sync ;
}
// use mtime to check which one is more recent // use mtime to check which one is more recent
else else if ( remote.Size() != m_size || remote.MD5() != GetMD5() )
{ {
assert( m_state != unknown ) ; assert( m_state != unknown ) ;
@ -230,6 +226,13 @@ void Resource::FromRemoteFile( const Entry& remote )
else else
Trace( "file %1% state is %2%", m_name, m_state ) ; Trace( "file %1% state is %2%", m_name, m_state ) ;
} }
// if checksum is equal, no need to compare the mtime
else
{
Log( "file %1% is already in sync", Path(), log::verbose ) ;
m_state = sync ;
}
} }
void Resource::FromDeleted( Val& state ) void Resource::FromDeleted( Val& state )
@ -242,6 +245,8 @@ void Resource::FromDeleted( Val& state )
m_md5 = state["md5"]; m_md5 = state["md5"];
if ( state.Has( "srv_time" ) ) if ( state.Has( "srv_time" ) )
m_mtime.Assign( state[ "srv_time" ].U64(), 0 ) ; m_mtime.Assign( state[ "srv_time" ].U64(), 0 ) ;
if ( state.Has( "size" ) )
m_size = state[ "size" ].U64();
m_state = both_deleted; m_state = both_deleted;
} }
@ -259,7 +264,7 @@ void Resource::FromLocal( Val& state )
bool is_dir; bool is_dir;
try try
{ {
os::Stat( path, &m_ctime, NULL, &is_dir ) ; os::Stat( path, &m_ctime, (off64_t*)&m_size, &is_dir ) ;
} }
catch ( os::Error &e ) catch ( os::Error &e )
{ {
@ -287,9 +292,9 @@ void Resource::FromLocal( Val& state )
{ {
if ( !is_dir ) if ( !is_dir )
{ {
m_md5 = crypt::MD5::Get( path );
// File is changed locally. TODO: Detect conflicts // File is changed locally. TODO: Detect conflicts
is_changed = !state.Has( "md5" ) || m_md5 != state["md5"].Str(); is_changed = ( state.Has( "size" ) && m_size != state["size"].U64() ) ||
!state.Has( "md5" ) || GetMD5() != state["md5"].Str();
} }
else else
is_changed = true; is_changed = true;
@ -479,30 +484,33 @@ void Resource::Sync( Syncer *syncer, ResourceTree *res_tree, const Val& options
} }
} }
void Resource::SyncSelf( Syncer* syncer, ResourceTree *res_tree, const Val& options ) bool Resource::CheckRename( Syncer* syncer, ResourceTree *res_tree )
{ {
assert( !IsRoot() || m_state == sync ) ; // root is always sync if ( !IsFolder() && ( m_state == local_new || m_state == remote_new ) )
assert( IsRoot() || !syncer || m_parent->IsFolder() ) ;
assert( IsRoot() || m_parent->m_state != remote_deleted ) ;
assert( IsRoot() || m_parent->m_state != local_deleted ) ;
const fs::path path = Path() ;
// Detect renames
if ( !IsFolder() && ( m_state == local_new || m_state == local_deleted ||
m_state == remote_new || m_state == remote_deleted ) )
{ {
details::MD5Range moved = res_tree->FindByMD5( m_md5 ); bool is_local = m_state == local_new;
bool is_local = m_state == local_new || m_state == local_deleted; State other = is_local ? local_deleted : remote_deleted;
State other; if ( is_local )
if ( m_state == local_new ) {
other = local_deleted; // First check size index for locally added files
else if ( m_state == local_deleted ) details::SizeRange moved = res_tree->FindBySize( m_size );
other = local_new; bool found = false;
else if ( m_state == remote_new ) for ( details::SizeMap::iterator i = moved.first ; i != moved.second; i++ )
other = remote_deleted; {
else Resource *m = *i;
other = remote_new; if ( m->m_state == other )
{
found = true;
break;
}
}
if ( !found )
{
// Don't check md5 sums if there are no deleted files with same size
return false;
}
}
details::MD5Range moved = res_tree->FindByMD5( GetMD5() );
for ( details::MD5Map::iterator i = moved.first ; i != moved.second; i++ ) for ( details::MD5Map::iterator i = moved.first ; i != moved.second; i++ )
{ {
Resource *m = *i; Resource *m = *i;
@ -530,10 +538,25 @@ void Resource::SyncSelf( Syncer* syncer, ResourceTree *res_tree, const Val& opti
} }
from->m_state = both_deleted; from->m_state = both_deleted;
to->m_state = sync; to->m_state = sync;
return; return true;
} }
} }
} }
return false;
}
void Resource::SyncSelf( Syncer* syncer, ResourceTree *res_tree, const Val& options )
{
assert( !IsRoot() || m_state == sync ) ; // root is always sync
assert( IsRoot() || !syncer || m_parent->IsFolder() ) ;
assert( IsRoot() || m_parent->m_state != remote_deleted ) ;
assert( IsRoot() || m_parent->m_state != local_deleted ) ;
const fs::path path = Path() ;
// Detect renames
if ( CheckRename( syncer, res_tree ) )
return;
switch ( m_state ) switch ( m_state )
{ {
@ -688,6 +711,7 @@ void Resource::SetIndex( bool re_stat )
if ( !is_dir ) if ( !is_dir )
{ {
m_json->Set( "md5", Val( m_md5 ) ); m_json->Set( "md5", Val( m_md5 ) );
m_json->Set( "size", Val( m_size ) );
m_json->Del( "tree" ); m_json->Del( "tree" );
} }
else else
@ -695,6 +719,7 @@ void Resource::SetIndex( bool re_stat )
// add tree item if it does not exist // add tree item if it does not exist
m_json->Item( "tree" ); m_json->Item( "tree" );
m_json->Del( "md5" ); m_json->Del( "md5" );
m_json->Del( "size" );
} }
} }
@ -731,11 +756,28 @@ std::string Resource::StateStr() const
return ss.str() ; return ss.str() ;
} }
/// Returns the size currently recorded for this resource (m_size).
/// This accessor is const because it is also used as the key extractor of the
/// BySize index in ResourceTree.
u64_t Resource::Size() const
{
return m_size ;
}
std::string Resource::MD5() const std::string Resource::MD5() const
{ {
return m_md5 ; return m_md5 ;
} }
/// Returns the MD5 checksum of this resource, computing it on first use.
///
/// The checksum is hashed from the local file only if none is cached yet, the
/// resource is not a folder and it exists locally. Otherwise the cached value
/// is returned unchanged, and it may still be empty.
///
/// Hashing is deferred because it is only needed in two situations:
/// 1) a local rename is suspected (a new file and a deleted file share a size)
/// 2) the local ctime has changed while the file size has not
///
/// NOTE(review): m_md5 is also the key of ResourceTree's ByMD5 index. Filling
/// it in here for a resource that is already inserted presumably requires a
/// ReInsert() to keep that index consistent - confirm against callers.
std::string Resource::GetMD5()
{
	// Evaluation order matters: the cheap emptiness test short-circuits first.
	const bool must_hash = m_md5.empty() && !IsFolder() && m_local_exists ;
	if ( must_hash )
		m_md5 = crypt::MD5::Get( Path() );

	return m_md5 ;
}
bool Resource::IsRoot() const bool Resource::IsRoot() const
{ {
// Root entry does not show up in file feeds, so we check for empty parent (and self-href) // Root entry does not show up in file feeds, so we check for empty parent (and self-href)

View File

@ -19,6 +19,7 @@
#pragma once #pragma once
#include "util/Types.hh"
#include "util/DateTime.hh" #include "util/DateTime.hh"
#include "util/Exception.hh" #include "util/Exception.hh"
#include "util/FileSystem.hh" #include "util/FileSystem.hh"
@ -108,7 +109,9 @@ public :
bool IsInRootTree() const ; bool IsInRootTree() const ;
bool IsRoot() const ; bool IsRoot() const ;
bool HasID() const ; bool HasID() const ;
u64_t Size() const;
std::string MD5() const ; std::string MD5() const ;
std::string GetMD5() ;
void FromRemote( const Entry& remote ) ; void FromRemote( const Entry& remote ) ;
void FromDeleted( Val& state ) ; void FromDeleted( Val& state ) ;
@ -141,6 +144,7 @@ private :
void DeleteIndex() ; void DeleteIndex() ;
void SetIndex( bool ) ; void SetIndex( bool ) ;
bool CheckRename( Syncer* syncer, ResourceTree *res_tree ) ;
void SyncSelf( Syncer* syncer, ResourceTree *res_tree, const Val& options ) ; void SyncSelf( Syncer* syncer, ResourceTree *res_tree, const Val& options ) ;
private : private :
@ -149,6 +153,7 @@ private :
std::string m_md5 ; std::string m_md5 ;
DateTime m_mtime ; DateTime m_mtime ;
DateTime m_ctime ; DateTime m_ctime ;
u64_t m_size ;
std::string m_id ; std::string m_id ;
std::string m_href ; std::string m_href ;

View File

@ -105,7 +105,13 @@ MD5Range ResourceTree::FindByMD5( const std::string& md5 )
return MD5Range( map.end(), map.end() ) ; return MD5Range( map.end(), map.end() ) ;
} }
/// Reinsert should be called when the ID/HREF were updated SizeRange ResourceTree::FindBySize( u64_t size )
{
SizeMap& map = m_set.get<BySize>() ;
return map.equal_range( size );
}
/// Reinsert should be called when the ID/HREF/MD5 were updated
bool ResourceTree::ReInsert( Resource *coll ) bool ResourceTree::ReInsert( Resource *coll )
{ {
Set& s = m_set.get<ByIdentity>() ; Set& s = m_set.get<ByIdentity>() ;

View File

@ -36,19 +36,23 @@ namespace details
struct ByMD5 {} ; struct ByMD5 {} ;
struct ByHref {} ; struct ByHref {} ;
struct ByIdentity {} ; struct ByIdentity {} ;
struct BySize {} ;
typedef multi_index_container< typedef multi_index_container<
Resource*, Resource*,
indexed_by< indexed_by<
hashed_non_unique<tag<ByHref>, const_mem_fun<Resource, std::string, &Resource::SelfHref> >, hashed_non_unique<tag<ByHref>, const_mem_fun<Resource, std::string, &Resource::SelfHref> >,
hashed_non_unique<tag<ByMD5>, const_mem_fun<Resource, std::string, &Resource::MD5> >, hashed_non_unique<tag<ByMD5>, const_mem_fun<Resource, std::string, &Resource::MD5> >,
hashed_non_unique<tag<BySize>, const_mem_fun<Resource, u64_t, &Resource::Size> >,
hashed_unique<tag<ByIdentity>, identity<Resource*> > hashed_unique<tag<ByIdentity>, identity<Resource*> >
> >
> Folders ; > Folders ;
typedef Folders::index<ByMD5>::type MD5Map ; typedef Folders::index<ByMD5>::type MD5Map ;
typedef Folders::index<ByHref>::type HrefMap ; typedef Folders::index<ByHref>::type HrefMap ;
typedef Folders::index<BySize>::type SizeMap ;
typedef Folders::index<ByIdentity>::type Set ; typedef Folders::index<ByIdentity>::type Set ;
typedef std::pair<SizeMap::iterator, SizeMap::iterator> SizeRange ;
typedef std::pair<MD5Map::iterator, MD5Map::iterator> MD5Range ; typedef std::pair<MD5Map::iterator, MD5Map::iterator> MD5Range ;
} }
@ -70,6 +74,7 @@ public :
Resource* FindByHref( const std::string& href ) ; Resource* FindByHref( const std::string& href ) ;
const Resource* FindByHref( const std::string& href ) const ; const Resource* FindByHref( const std::string& href ) const ;
details::MD5Range FindByMD5( const std::string& md5 ) ; details::MD5Range FindByMD5( const std::string& md5 ) ;
details::SizeRange FindBySize( u64_t size ) ;
bool ReInsert( Resource *coll ) ; bool ReInsert( Resource *coll ) ;

View File

@ -104,20 +104,21 @@ void State::FromLocal( const fs::path& p, Resource* folder, Val& tree )
{ {
// if the Resource object of the child already exists, it should // if the Resource object of the child already exists, it should
// have been so no need to do anything here // have been so no need to do anything here
Resource *c = folder->FindChild( fname ) ; Resource *c = folder->FindChild( fname ), *c2 = c ;
if ( !c ) if ( !c )
{ {
c = new Resource( fname, "" ) ; c2 = new Resource( fname, "" ) ;
folder->AddChild( c ) ; folder->AddChild( c2 ) ;
m_res.Insert( c ) ;
} }
leftover.erase( fname ); leftover.erase( fname );
Val& rec = tree.Item( fname ); Val& rec = tree.Item( fname );
if ( m_force ) if ( m_force )
rec.Del( "srv_time" ); rec.Del( "srv_time" );
c->FromLocal( rec ) ; c2->FromLocal( rec ) ;
if ( c->IsFolder() ) if ( !c )
FromLocal( *i, c, rec.Item( "tree" ) ) ; m_res.Insert( c2 ) ;
if ( c2->IsFolder() )
FromLocal( *i, c2, rec.Item( "tree" ) ) ;
} }
} }
@ -129,17 +130,18 @@ void State::FromLocal( const fs::path& p, Resource* folder, Val& tree )
else else
{ {
// Restore state of locally deleted files // Restore state of locally deleted files
Resource *c = folder->FindChild( i->first ) ; Resource *c = folder->FindChild( i->first ), *c2 ;
if ( !c ) if ( !c )
{ {
c = new Resource( i->first, i->second.Has( "tree" ) ? "folder" : "file" ) ; c2 = new Resource( i->first, i->second.Has( "tree" ) ? "folder" : "file" ) ;
folder->AddChild( c ) ; folder->AddChild( c2 ) ;
m_res.Insert( c ) ;
} }
Val& rec = tree.Item( i->first ); Val& rec = tree.Item( i->first );
if ( m_force || m_ign_changed ) if ( m_force || m_ign_changed )
rec.Del( "srv_time" ); rec.Del( "srv_time" );
c->FromDeleted( rec ); c2->FromDeleted( rec );
if ( !c )
m_res.Insert( c2 ) ;
} }
} }
} }

View File

@ -44,6 +44,7 @@ void Entry2::Update( const Val& item )
// changestamp only appears in change feed entries // changestamp only appears in change feed entries
m_change_stamp = is_chg ? item["id"].Int() : -1 ; m_change_stamp = is_chg ? item["id"].Int() : -1 ;
m_is_removed = is_chg && item["deleted"].Bool() ; m_is_removed = is_chg && item["deleted"].Bool() ;
m_size = 0 ;
const Val& file = is_chg && !m_is_removed ? item["file"] : item; const Val& file = is_chg && !m_is_removed ? item["file"] : item;
@ -75,6 +76,7 @@ void Entry2::Update( const Val& item )
else else
{ {
m_md5 = file["md5Checksum"] ; m_md5 = file["md5Checksum"] ;
m_size = file["fileSize"].U64() ;
m_content_src = file["downloadUrl"] ; m_content_src = file["downloadUrl"] ;
// convert to lower case for easy comparison // convert to lower case for easy comparison
std::transform( m_md5.begin(), m_md5.end(), m_md5.begin(), tolower ) ; std::transform( m_md5.begin(), m_md5.end(), m_md5.begin(), tolower ) ;

View File

@ -44,7 +44,7 @@ void Stat( const fs::path& filename, DateTime *t, off_t *size, bool *is_dir )
Stat( filename.string(), t, size, is_dir ) ; Stat( filename.string(), t, size, is_dir ) ;
} }
void Stat( const std::string& filename, DateTime *t, off_t *size, bool *is_dir ) void Stat( const std::string& filename, DateTime *t, off64_t *size, bool *is_dir )
{ {
struct stat s = {} ; struct stat s = {} ;
if ( ::stat( filename.c_str(), &s ) != 0 ) if ( ::stat( filename.c_str(), &s ) != 0 )
@ -65,10 +65,10 @@ void Stat( const std::string& filename, DateTime *t, off_t *size, bool *is_dir )
*t = DateTime( s.st_ctim.tv_sec, s.st_ctim.tv_nsec); *t = DateTime( s.st_ctim.tv_sec, s.st_ctim.tv_nsec);
#endif #endif
} }
if (size) if ( size )
*size = s.st_size; *size = s.st_size;
if (is_dir) if ( is_dir )
*is_dir = S_ISDIR(s.st_mode) ? true : false; *is_dir = S_ISDIR( s.st_mode ) ? true : false;
} }
void SetFileTime( const fs::path& filename, const DateTime& t ) void SetFileTime( const fs::path& filename, const DateTime& t )

View File

@ -33,8 +33,8 @@ namespace os
{ {
struct Error : virtual Exception {} ; struct Error : virtual Exception {} ;
void Stat( const std::string& filename, DateTime *t, off_t *size, bool *is_dir ) ; void Stat( const std::string& filename, DateTime *t, off64_t *size, bool *is_dir ) ;
void Stat( const fs::path& filename, DateTime *t, off_t *size, bool *is_dir ) ; void Stat( const fs::path& filename, DateTime *t, off64_t *size, bool *is_dir ) ;
void SetFileTime( const std::string& filename, const DateTime& t ) ; void SetFileTime( const std::string& filename, const DateTime& t ) ;
void SetFileTime( const fs::path& filename, const DateTime& t ) ; void SetFileTime( const fs::path& filename, const DateTime& t ) ;