From dd77c998728cd0f72ddc5fc2a4f1cfa6c5f139c3 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Wed, 28 Sep 2016 01:19:13 +0300 Subject: [PATCH] Check MD5 only when it is needed (fixes #101) This makes grive check md5 sums only when: 1) a local rename is supposed (when there are a new file and a deleted file of the same size) 2) local ctime is changed, but file size isn't --- README.md | 2 + libgrive/src/base/Entry.cc | 8 ++- libgrive/src/base/Entry.hh | 3 + libgrive/src/base/Resource.cc | 112 ++++++++++++++++++++---------- libgrive/src/base/Resource.hh | 5 ++ libgrive/src/base/ResourceTree.cc | 8 ++- libgrive/src/base/ResourceTree.hh | 5 ++ libgrive/src/base/State.cc | 26 +++---- libgrive/src/drive2/Entry2.cc | 2 + libgrive/src/util/OS.cc | 8 +-- libgrive/src/util/OS.hh | 4 +- 11 files changed, 128 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 9cbc4e8..89bfec1 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,8 @@ Grive uses cmake to build. Basic install sequence is - ignore regexp does not persist anymore (note that Grive will still track it to not accidentally delete remote files when changing ignore regexp) - added options to limit upload and download speed +- faster upload of new and changed files. now Grive uploads files without first calculating + md5 checksum when file is created locally or when its size changes. ### Grive2 v0.5 diff --git a/libgrive/src/base/Entry.cc b/libgrive/src/base/Entry.cc index ed25a10..05f346b 100644 --- a/libgrive/src/base/Entry.cc +++ b/libgrive/src/base/Entry.cc @@ -36,7 +36,8 @@ Entry::Entry( ) : m_is_dir ( true ), m_resource_id ( "folder:root" ), m_change_stamp ( -1 ), - m_is_removed ( false ) + m_is_removed ( false ), + m_size ( 0 ) { } @@ -65,6 +66,11 @@ std::string Entry::MD5() const return m_md5 ; } +u64_t Entry::Size() const +{ + return m_size ; +} + DateTime Entry::MTime() const { return m_mtime ; diff --git a/libgrive/src/base/Entry.hh b/libgrive/src/base/Entry.hh index 6472e89..8324f6d 100644 --- a/libgrive/src/base/Entry.hh +++ b/libgrive/src/base/Entry.hh @@ -19,6 +19,7 @@ #pragma once +#include "util/Types.hh" #include "util/DateTime.hh" #include "util/FileSystem.hh" @@ -44,6 +45,7 @@ public : bool IsDir() const ; std::string MD5() const ; DateTime MTime() const ; + u64_t Size() const ; std::string Name() const ; @@ -80,6 +82,7 @@ protected : DateTime m_mtime ; bool m_is_removed ; + u64_t m_size ; } ; } // end of namespace gr diff --git a/libgrive/src/base/Resource.cc b/libgrive/src/base/Resource.cc index 72c3a26..b03aa5c 100644 --- a/libgrive/src/base/Resource.cc +++ b/libgrive/src/base/Resource.cc @@ -47,6 +47,7 @@ namespace gr { Resource::Resource( const fs::path& root_folder ) : m_name ( root_folder.string() ), m_kind ( "folder" ), + m_size ( 0 ), m_id ( "folder:root" ), m_href ( "root" ), m_is_editable( true ), @@ -60,6 +61,7 @@ Resource::Resource( const fs::path& root_folder ) : Resource::Resource( const std::string& name, const std::string& kind ) : m_name ( name ), m_kind ( kind ), + m_size ( 0 ), m_is_editable( true ), m_parent ( 0 ), m_state ( unknown ), @@ -149,6 +151,7 @@ void Resource::AssignIDs( const Entry& remote ) m_content = remote.ContentSrc() ; m_is_editable = remote.IsEditable() ; m_etag = remote.ETag() ; + m_md5 = remote.MD5() ; } } @@ -193,7 +196,7 @@ void Resource::FromRemoteFile( const Entry& remote ) m_state = local_deleted ; } } - + // remote checksum unknown, assume the file is not changed in remote else if ( remote.MD5().empty() ) { @@ -201,16 +204,9 @@ void Resource::FromRemoteFile( const Entry& remote ) Path(), log::verbose ) ; m_state = sync ; } - - // if checksum is equal, no need to compare the mtime - else if ( remote.MD5() == m_md5 ) - { - Log( "file %1% is already in sync", Path(), log::verbose ) ; - m_state = sync ; - } // use mtime to check which one is more recent - else + else if ( remote.Size() != m_size || remote.MD5() != GetMD5() ) { assert( m_state != unknown ) ; @@ -230,6 +226,13 @@ void Resource::FromRemoteFile( const Entry& remote ) else Trace( "file %1% state is %2%", m_name, m_state ) ; } + + // if checksum is equal, no need to compare the mtime + else + { + Log( "file %1% is already in sync", Path(), log::verbose ) ; + m_state = sync ; + } } void Resource::FromDeleted( Val& state ) @@ -242,6 +245,8 @@ void Resource::FromDeleted( Val& state ) m_md5 = state["md5"]; if ( state.Has( "srv_time" ) ) m_mtime.Assign( state[ "srv_time" ].U64(), 0 ) ; + if ( state.Has( "size" ) ) + m_size = state[ "size" ].U64(); m_state = both_deleted; } @@ -259,7 +264,7 @@ void Resource::FromLocal( Val& state ) bool is_dir; try { - os::Stat( path, &m_ctime, NULL, &is_dir ) ; + os::Stat( path, &m_ctime, (off64_t*)&m_size, &is_dir ) ; } catch ( os::Error &e ) { @@ -287,9 +292,9 @@ void Resource::FromLocal( Val& state ) { if ( !is_dir ) { - m_md5 = crypt::MD5::Get( path ); // File is changed locally. TODO: Detect conflicts - is_changed = !state.Has( "md5" ) || m_md5 != state["md5"].Str(); + is_changed = ( state.Has( "size" ) && m_size != state["size"].U64() ) || + !state.Has( "md5" ) || GetMD5() != state["md5"].Str(); } else is_changed = true; @@ -479,30 +484,33 @@ void Resource::Sync( Syncer *syncer, ResourceTree *res_tree, const Val& options } } -void Resource::SyncSelf( Syncer* syncer, ResourceTree *res_tree, const Val& options ) +bool Resource::CheckRename( Syncer* syncer, ResourceTree *res_tree ) { - assert( !IsRoot() || m_state == sync ) ; // root is always sync - assert( IsRoot() || !syncer || m_parent->IsFolder() ) ; - assert( IsRoot() || m_parent->m_state != remote_deleted ) ; - assert( IsRoot() || m_parent->m_state != local_deleted ) ; - - const fs::path path = Path() ; - - // Detect renames - if ( !IsFolder() && ( m_state == local_new || m_state == local_deleted || - m_state == remote_new || m_state == remote_deleted ) ) + if ( !IsFolder() && ( m_state == local_new || m_state == remote_new ) ) { - details::MD5Range moved = res_tree->FindByMD5( m_md5 ); - bool is_local = m_state == local_new || m_state == local_deleted; - State other; - if ( m_state == local_new ) - other = local_deleted; - else if ( m_state == local_deleted ) - other = local_new; - else if ( m_state == remote_new ) - other = remote_deleted; - else - other = remote_new; + bool is_local = m_state == local_new; + State other = is_local ? local_deleted : remote_deleted; + if ( is_local ) + { + // First check size index for locally added files + details::SizeRange moved = res_tree->FindBySize( m_size ); + bool found = false; + for ( details::SizeMap::iterator i = moved.first ; i != moved.second; i++ ) + { + Resource *m = *i; + if ( m->m_state == other ) + { + found = true; + break; + } + } + if ( !found ) + { + // Don't check md5 sums if there are no deleted files with same size + return false; + } + } + details::MD5Range moved = res_tree->FindByMD5( GetMD5() ); for ( details::MD5Map::iterator i = moved.first ; i != moved.second; i++ ) { Resource *m = *i; @@ -530,10 +538,25 @@ void Resource::SyncSelf( Syncer* syncer, ResourceTree *res_tree, const Val& opti } from->m_state = both_deleted; to->m_state = sync; - return; + return true; } } } + return false; +} + +void Resource::SyncSelf( Syncer* syncer, ResourceTree *res_tree, const Val& options ) +{ + assert( !IsRoot() || m_state == sync ) ; // root is always sync + assert( IsRoot() || !syncer || m_parent->IsFolder() ) ; + assert( IsRoot() || m_parent->m_state != remote_deleted ) ; + assert( IsRoot() || m_parent->m_state != local_deleted ) ; + + const fs::path path = Path() ; + + // Detect renames + if ( CheckRename( syncer, res_tree ) ) + return; switch ( m_state ) { @@ -688,6 +711,7 @@ void Resource::SetIndex( bool re_stat ) if ( !is_dir ) { m_json->Set( "md5", Val( m_md5 ) ); + m_json->Set( "size", Val( m_size ) ); m_json->Del( "tree" ); } else @@ -695,6 +719,7 @@ void Resource::SetIndex( bool re_stat ) // add tree item if it does not exist m_json->Item( "tree" ); m_json->Del( "md5" ); + m_json->Del( "size" ); } } @@ -731,11 +756,28 @@ std::string Resource::StateStr() const return ss.str() ; } +u64_t Resource::Size() const +{ + return m_size ; +} + std::string Resource::MD5() const { return m_md5 ; } +std::string Resource::GetMD5() +{ + if ( m_md5.empty() && !IsFolder() && m_local_exists ) + { + // MD5 checksum is calculated lazily and only when really needed: + // 1) when a local rename is supposed (when there are a new file and a deleted file of the same size) + // 2) when local ctime is changed, but file size isn't + m_md5 = crypt::MD5::Get( Path() ); + } + return m_md5 ; +} + bool Resource::IsRoot() const { // Root entry does not show up in file feeds, so we check for empty parent (and self-href) diff --git a/libgrive/src/base/Resource.hh b/libgrive/src/base/Resource.hh index e7ee0ac..57d7319 100644 --- a/libgrive/src/base/Resource.hh +++ b/libgrive/src/base/Resource.hh @@ -19,6 +19,7 @@ #pragma once +#include "util/Types.hh" #include "util/DateTime.hh" #include "util/Exception.hh" #include "util/FileSystem.hh" @@ -108,7 +109,9 @@ public : bool IsInRootTree() const ; bool IsRoot() const ; bool HasID() const ; + u64_t Size() const; std::string MD5() const ; + std::string GetMD5() ; void FromRemote( const Entry& remote ) ; void FromDeleted( Val& state ) ; @@ -141,6 +144,7 @@ private : void DeleteIndex() ; void SetIndex( bool ) ; + bool CheckRename( Syncer* syncer, ResourceTree *res_tree ) ; void SyncSelf( Syncer* syncer, ResourceTree *res_tree, const Val& options ) ; private : @@ -149,6 +153,7 @@ private : std::string m_md5 ; DateTime m_mtime ; DateTime m_ctime ; + u64_t m_size ; std::string m_id ; std::string m_href ; diff --git a/libgrive/src/base/ResourceTree.cc b/libgrive/src/base/ResourceTree.cc index 16ac534..e6eb619 100644 --- a/libgrive/src/base/ResourceTree.cc +++ b/libgrive/src/base/ResourceTree.cc @@ -105,7 +105,13 @@ MD5Range ResourceTree::FindByMD5( const std::string& md5 ) return MD5Range( map.end(), map.end() ) ; } -/// Reinsert should be called when the ID/HREF were updated +SizeRange ResourceTree::FindBySize( u64_t size ) +{ + SizeMap& map = m_set.get() ; + return map.equal_range( size ); +} + +/// Reinsert should be called when the ID/HREF/MD5 were updated bool ResourceTree::ReInsert( Resource *coll ) { Set& s = m_set.get() ; diff --git a/libgrive/src/base/ResourceTree.hh b/libgrive/src/base/ResourceTree.hh index b1fc728..b28c4a3 100644 --- a/libgrive/src/base/ResourceTree.hh +++ b/libgrive/src/base/ResourceTree.hh @@ -36,19 +36,23 @@ namespace details struct ByMD5 {} ; struct ByHref {} ; struct ByIdentity {} ; + struct BySize {} ; typedef multi_index_container< Resource*, indexed_by< hashed_non_unique, const_mem_fun >, hashed_non_unique, const_mem_fun >, + hashed_non_unique, const_mem_fun >, hashed_unique, identity > > > Folders ; typedef Folders::index::type MD5Map ; typedef Folders::index::type HrefMap ; + typedef Folders::index::type SizeMap ; typedef Folders::index::type Set ; + typedef std::pair SizeRange ; typedef std::pair MD5Range ; } @@ -70,6 +74,7 @@ public : Resource* FindByHref( const std::string& href ) ; const Resource* FindByHref( const std::string& href ) const ; details::MD5Range FindByMD5( const std::string& md5 ) ; + details::SizeRange FindBySize( u64_t size ) ; bool ReInsert( Resource *coll ) ; diff --git a/libgrive/src/base/State.cc b/libgrive/src/base/State.cc index 44d8bf9..9fddf4d 100644 --- a/libgrive/src/base/State.cc +++ b/libgrive/src/base/State.cc @@ -104,20 +104,21 @@ void State::FromLocal( const fs::path& p, Resource* folder, Val& tree ) { // if the Resource object of the child already exists, it should // have been so no need to do anything here - Resource *c = folder->FindChild( fname ) ; + Resource *c = folder->FindChild( fname ), *c2 = c ; if ( !c ) { - c = new Resource( fname, "" ) ; - folder->AddChild( c ) ; - m_res.Insert( c ) ; + c2 = new Resource( fname, "" ) ; + folder->AddChild( c2 ) ; } leftover.erase( fname ); Val& rec = tree.Item( fname ); if ( m_force ) rec.Del( "srv_time" ); - c->FromLocal( rec ) ; - if ( c->IsFolder() ) - FromLocal( *i, c, rec.Item( "tree" ) ) ; + c2->FromLocal( rec ) ; + if ( !c ) + m_res.Insert( c2 ) ; + if ( c2->IsFolder() ) + FromLocal( *i, c2, rec.Item( "tree" ) ) ; } } @@ -129,17 +130,18 @@ void State::FromLocal( const fs::path& p, Resource* folder, Val& tree ) else { // Restore state of locally deleted files - Resource *c = folder->FindChild( i->first ) ; + Resource *c = folder->FindChild( i->first ), *c2 ; if ( !c ) { - c = new Resource( i->first, i->second.Has( "tree" ) ? "folder" : "file" ) ; - folder->AddChild( c ) ; - m_res.Insert( c ) ; + c2 = new Resource( i->first, i->second.Has( "tree" ) ? "folder" : "file" ) ; + folder->AddChild( c2 ) ; } Val& rec = tree.Item( i->first ); if ( m_force || m_ign_changed ) rec.Del( "srv_time" ); - c->FromDeleted( rec ); + c2->FromDeleted( rec ); + if ( !c ) + m_res.Insert( c2 ) ; } } } diff --git a/libgrive/src/drive2/Entry2.cc b/libgrive/src/drive2/Entry2.cc index 75f7d96..b445c1b 100644 --- a/libgrive/src/drive2/Entry2.cc +++ b/libgrive/src/drive2/Entry2.cc @@ -44,6 +44,7 @@ void Entry2::Update( const Val& item ) // changestamp only appears in change feed entries m_change_stamp = is_chg ? item["id"].Int() : -1 ; m_is_removed = is_chg && item["deleted"].Bool() ; + m_size = 0 ; const Val& file = is_chg && !m_is_removed ? item["file"] : item; @@ -75,6 +76,7 @@ void Entry2::Update( const Val& item ) else { m_md5 = file["md5Checksum"] ; + m_size = file["fileSize"].U64() ; m_content_src = file["downloadUrl"] ; // convert to lower case for easy comparison std::transform( m_md5.begin(), m_md5.end(), m_md5.begin(), tolower ) ; diff --git a/libgrive/src/util/OS.cc b/libgrive/src/util/OS.cc index 3569aa1..9095e73 100644 --- a/libgrive/src/util/OS.cc +++ b/libgrive/src/util/OS.cc @@ -44,7 +44,7 @@ void Stat( const fs::path& filename, DateTime *t, off_t *size, bool *is_dir ) Stat( filename.string(), t, size, is_dir ) ; } -void Stat( const std::string& filename, DateTime *t, off_t *size, bool *is_dir ) +void Stat( const std::string& filename, DateTime *t, off64_t *size, bool *is_dir ) { struct stat s = {} ; if ( ::stat( filename.c_str(), &s ) != 0 ) @@ -65,10 +65,10 @@ void Stat( const std::string& filename, DateTime *t, off_t *size, bool *is_dir ) *t = DateTime( s.st_ctim.tv_sec, s.st_ctim.tv_nsec); #endif } - if (size) + if ( size ) *size = s.st_size; - if (is_dir) - *is_dir = S_ISDIR(s.st_mode) ? true : false; + if ( is_dir ) + *is_dir = S_ISDIR( s.st_mode ) ? true : false; } void SetFileTime( const fs::path& filename, const DateTime& t ) diff --git a/libgrive/src/util/OS.hh b/libgrive/src/util/OS.hh index edb9a12..4349783 100644 --- a/libgrive/src/util/OS.hh +++ b/libgrive/src/util/OS.hh @@ -33,8 +33,8 @@ namespace os { struct Error : virtual Exception {} ; - void Stat( const std::string& filename, DateTime *t, off_t *size, bool *is_dir ) ; - void Stat( const fs::path& filename, DateTime *t, off_t *size, bool *is_dir ) ; + void Stat( const std::string& filename, DateTime *t, off64_t *size, bool *is_dir ) ; + void Stat( const fs::path& filename, DateTime *t, off64_t *size, bool *is_dir ) ; void SetFileTime( const std::string& filename, const DateTime& t ) ; void SetFileTime( const fs::path& filename, const DateTime& t ) ;