2015-10-02 12:11:10 -04:00
import configparser
2010-10-20 15:08:46 -04:00
import os
2010-12-21 15:29:09 -05:00
import shutil
2017-05-02 13:05:27 -04:00
import stat
2016-05-30 19:18:03 -04:00
from binascii import unhexlify
from collections import namedtuple
2017-06-10 11:59:41 -04:00
from time import perf_counter
2016-05-30 19:18:03 -04:00
2015-10-06 12:33:55 -04:00
from . logger import create_logger
2017-05-02 13:05:27 -04:00
2015-10-06 12:33:55 -04:00
logger = create_logger ( )
2016-05-30 19:18:03 -04:00
implement files cache mode control, fixes #911
You can now control the files cache mode using this option:
--files-cache={ctime,mtime,size,inode,rechunk,disabled}*
(only some combinations are supported)
Previously, only these modes were supported:
- mtime,size,inode (default of borg < 1.1.0rc4)
- mtime,size (by using --ignore-inode)
- disabled (by using --no-files-cache)
Now, you additionally get:
- ctime alternatively to mtime (more safe), e.g.:
ctime,size,inode (this is the new default of borg >= 1.1.0rc4)
- rechunk (consider all files as changed, rechunk them)
Deprecated:
- --ignore-inodes (use modes without "inode")
- --no-files-cache (use "disabled" mode)
The tests needed some changes:
- previously, we use os.utime() to set a files mtime (atime) to specific
values, but that does not work for ctime.
- now use time.sleep() to create the "latest file" that usually does
not end up in the files cache (see FAQ)
2017-09-10 20:54:52 -04:00
from . constants import CACHE_README , DEFAULT_FILES_CACHE_MODE
2017-03-07 09:13:59 -05:00
from . hashindex import ChunkIndex , ChunkIndexEntry , CacheSynchronizer
2016-10-31 00:53:01 -04:00
from . helpers import Location
2016-05-30 18:33:13 -04:00
from . helpers import Error
2017-05-28 12:04:33 -04:00
from . helpers import Manifest
2016-11-27 06:39:49 -05:00
from . helpers import get_cache_dir , get_security_dir
2017-05-28 12:04:33 -04:00
from . helpers import int_to_bigint , bigint_to_int , bin_to_hex , parse_stringified_list
2016-05-30 18:33:13 -04:00
from . helpers import format_file_size
2017-03-15 13:54:34 -04:00
from . helpers import safe_ns
2017-04-01 15:28:41 -04:00
from . helpers import yes , hostname_is_unique
2016-11-26 15:15:59 -05:00
from . helpers import remove_surrogates
from . helpers import ProgressIndicatorPercent , ProgressIndicatorMessage
2017-05-25 09:54:38 -04:00
from . helpers import set_ec , EXIT_WARNING
2017-07-24 04:45:57 -04:00
from . helpers import truncate_and_unlink
2018-06-30 20:34:48 -04:00
from . helpers import msgpack
2017-05-02 13:05:27 -04:00
from . item import ArchiveItem , ChunkListEntry
from . crypto . key import PlaintextKey
2017-05-25 08:00:03 -04:00
from . crypto . file_integrity import IntegrityCheckedFile , DetachedIntegrityCheckedFile , FileIntegrityError
2016-07-23 07:56:06 -04:00
from . locking import Lock
2016-07-09 15:10:46 -04:00
from . platform import SaveFile
2016-05-30 19:18:03 -04:00
from . remote import cache_if_remote
2017-06-10 11:59:41 -04:00
from . repository import LIST_SCAN_LIMIT
2015-10-08 17:03:35 -04:00
implement files cache mode control, fixes #911
You can now control the files cache mode using this option:
--files-cache={ctime,mtime,size,inode,rechunk,disabled}*
(only some combinations are supported)
Previously, only these modes were supported:
- mtime,size,inode (default of borg < 1.1.0rc4)
- mtime,size (by using --ignore-inode)
- disabled (by using --no-files-cache)
Now, you additionally get:
- ctime alternatively to mtime (more safe), e.g.:
ctime,size,inode (this is the new default of borg >= 1.1.0rc4)
- rechunk (consider all files as changed, rechunk them)
Deprecated:
- --ignore-inodes (use modes without "inode")
- --no-files-cache (use "disabled" mode)
The tests needed some changes:
- previously, we use os.utime() to set a files mtime (atime) to specific
values, but that does not work for ctime.
- now use time.sleep() to create the "latest file" that usually does
not end up in the files cache (see FAQ)
2017-09-10 20:54:52 -04:00
# note: cmtime might be either a ctime or a mtime timestamp
FileCacheEntry = namedtuple('FileCacheEntry', ['age', 'inode', 'size', 'cmtime', 'chunk_ids'])
2016-04-16 11:48:47 -04:00
2010-03-06 12:25:35 -05:00
2016-11-27 06:39:49 -05:00
class SecurityManager:
    """
    Tracks repositories. Ensures that nothing bad happens (repository swaps,
    replay attacks, unknown repositories etc.).

    This is complicated by the Cache being initially used for this, while
    only some commands actually use the Cache, which meant that other commands
    did not perform these checks.

    Further complications were created by the Cache being a cache, so it
    could be legitimately deleted, which is annoying because Borg didn't
    recognize repositories after that.

    Therefore a second location, the security database (see get_security_dir),
    was introduced which stores this information. However, this means that
    the code has to deal with a cache existing but no security DB entry,
    or inconsistencies between the security DB and the cache which have to
    be reconciled, and also with no cache existing but a security DB entry.
    """

    def __init__(self, repository):
        self.repository = repository
        self.dir = get_security_dir(repository.id_str)
        self.cache_dir = cache_dir(repository)
        self.key_type_file = os.path.join(self.dir, 'key-type')
        self.location_file = os.path.join(self.dir, 'location')
        self.manifest_ts_file = os.path.join(self.dir, 'manifest-timestamp')

    @staticmethod
    def destroy(repository, path=None):
        """destroy the security dir for ``repository`` or at ``path``"""
        path = path or get_security_dir(repository.id_str)
        if os.path.exists(path):
            shutil.rmtree(path)

    def known(self):
        """return True if we have a security dir entry for this repository"""
        return os.path.exists(self.key_type_file)

    def key_matches(self, key):
        """return True if *key* has the same type as the one recorded in the security dir"""
        if not self.known():
            return False
        try:
            with open(self.key_type_file, 'r') as fd:
                key_type = fd.read()  # renamed from ``type`` - do not shadow the builtin
                return key_type == str(key.TYPE)
        except OSError as exc:
            logger.warning('Could not read/parse key type file: %s', exc)
            return False  # explicit now; previously an implicit (falsy) None

    def save(self, manifest, key):
        """persist current location, key type and manifest timestamp to the security dir"""
        logger.debug('security: saving state for %s to %s', self.repository.id_str, self.dir)
        current_location = self.repository._location.canonical_path()
        logger.debug('security: current location %s', current_location)
        logger.debug('security: key type %s', str(key.TYPE))
        logger.debug('security: manifest timestamp %s', manifest.timestamp)
        with SaveFile(self.location_file) as fd:
            fd.write(current_location)
        with SaveFile(self.key_type_file) as fd:
            fd.write(str(key.TYPE))
        with SaveFile(self.manifest_ts_file) as fd:
            fd.write(manifest.timestamp)

    def assert_location_matches(self, cache_config=None):
        """Compare the current repository location with the stored one; query the user on mismatch."""
        # Warn user before sending data to a relocated repository
        try:
            with open(self.location_file) as fd:
                previous_location = fd.read()
            logger.debug('security: read previous location %r', previous_location)
        except FileNotFoundError:
            logger.debug('security: previous location file %s not found', self.location_file)
            previous_location = None
        except OSError as exc:
            logger.warning('Could not read previous location file: %s', exc)
            previous_location = None
        if cache_config and cache_config.previous_location and previous_location != cache_config.previous_location:
            # Reconcile cache and security dir; we take the cache location.
            previous_location = cache_config.previous_location
            logger.debug('security: using previous_location of cache: %r', previous_location)

        repository_location = self.repository._location.canonical_path()
        if previous_location and previous_location != repository_location:
            msg = ("Warning: The repository at location {} was previously located at {}\n".format(
                repository_location, previous_location) +
                "Do you want to continue? [yN] ")
            if not yes(msg, false_msg="Aborting.", invalid_msg="Invalid answer, aborting.",
                       retry=False, env_var_override='BORG_RELOCATED_REPO_ACCESS_IS_OK'):
                raise Cache.RepositoryAccessAborted()
            # adapt on-disk config immediately if the new location was accepted
            logger.debug('security: updating location stored in cache and security dir')
            with SaveFile(self.location_file) as fd:
                fd.write(repository_location)
            if cache_config:
                cache_config.save()

    def assert_no_manifest_replay(self, manifest, key, cache_config=None):
        """Guard against a repository that is older than the newest manifest timestamp we know about."""
        try:
            with open(self.manifest_ts_file) as fd:
                timestamp = fd.read()
            logger.debug('security: read manifest timestamp %r', timestamp)
        except FileNotFoundError:
            logger.debug('security: manifest timestamp file %s not found', self.manifest_ts_file)
            timestamp = ''
        except OSError as exc:
            # fixed copy-paste error: this message used to claim the "previous location file" failed
            logger.warning('Could not read manifest timestamp file: %s', exc)
            timestamp = ''
        if cache_config:
            timestamp = max(timestamp, cache_config.timestamp or '')
        logger.debug('security: determined newest manifest timestamp as %s', timestamp)
        # If repository is older than the cache or security dir something fishy is going on
        if timestamp and timestamp > manifest.timestamp:
            if isinstance(key, PlaintextKey):
                raise Cache.RepositoryIDNotUnique()
            else:
                raise Cache.RepositoryReplay()

    def assert_key_type(self, key, cache_config=None):
        """raise EncryptionMethodMismatch if the key type changed since last access"""
        # Make sure an encrypted repository has not been swapped for an unencrypted repository
        if cache_config and cache_config.key_type is not None and cache_config.key_type != str(key.TYPE):
            raise Cache.EncryptionMethodMismatch()
        if self.known() and not self.key_matches(key):
            raise Cache.EncryptionMethodMismatch()

    def assert_secure(self, manifest, key, *, cache_config=None, warn_if_unencrypted=True, lock_wait=None):
        """Run all repository security checks, remembering the repository if everything is fine."""
        # warn_if_unencrypted=False is only used for initializing a new repository.
        # Thus, avoiding asking about a repository that's currently initializing.
        self.assert_access_unknown(warn_if_unencrypted, manifest, key)
        if cache_config:
            self._assert_secure(manifest, key, cache_config)
        else:
            cache_config = CacheConfig(self.repository, lock_wait=lock_wait)
            if cache_config.exists():
                with cache_config:
                    self._assert_secure(manifest, key, cache_config)
            else:
                self._assert_secure(manifest, key)
        logger.debug('security: repository checks ok, allowing access')

    def _assert_secure(self, manifest, key, cache_config=None):
        self.assert_location_matches(cache_config)
        self.assert_key_type(key, cache_config)
        self.assert_no_manifest_replay(manifest, key, cache_config)
        if not self.known():
            logger.debug('security: remembering previously unknown repository')
            self.save(manifest, key)

    def assert_access_unknown(self, warn_if_unencrypted, manifest, key):
        """For an unknown, unencrypted repository: ask the user before allowing access."""
        # warn_if_unencrypted=False is only used for initializing a new repository.
        # Thus, avoiding asking about a repository that's currently initializing.
        if not key.logically_encrypted and not self.known():
            msg = ("Warning: Attempting to access a previously unknown unencrypted repository!\n" +
                   "Do you want to continue? [yN] ")
            allow_access = not warn_if_unencrypted or yes(msg, false_msg="Aborting.",
                                                         invalid_msg="Invalid answer, aborting.",
                                                         retry=False,
                                                         env_var_override='BORG_UNKNOWN_UNENCRYPTED_REPO_ACCESS_IS_OK')
            if allow_access:
                if warn_if_unencrypted:
                    logger.debug('security: remembering unknown unencrypted repository (explicitly allowed)')
                else:
                    logger.debug('security: initializing unencrypted repository')
                self.save(manifest, key)
            else:
                raise Cache.CacheInitAbortedError()
2018-07-15 04:46:14 -04:00
def assert_secure(repository, manifest, lock_wait):
    """Convenience wrapper: run all SecurityManager checks for *repository*."""
    SecurityManager(repository).assert_secure(manifest, manifest.key, lock_wait=lock_wait)
2017-05-10 09:30:51 -04:00
def recanonicalize_relative_location(cache_location, repository):
    """Return the repo's canonical location if *cache_location* only differs by the
    old relative-path canonicalization, else return *cache_location* unchanged."""
    # borg < 1.0.8rc1 had different canonicalization for the repo location (see #1655 and #1741).
    repo_location = repository._location.canonical_path()
    rl = Location(repo_location)
    cl = Location(cache_location)
    same_endpoint = (cl.proto == rl.proto and cl.user == rl.user and
                     cl.host == rl.host and cl.port == rl.port)
    old_style_relative = bool(cl.path and rl.path and
                              cl.path.startswith('/~/') and rl.path.startswith('/./') and
                              cl.path[3:] == rl.path[3:])
    if same_endpoint and old_style_relative:
        # everything is same except the expected change in relative path canonicalization,
        # update previous_location to avoid warning / user query about changed location:
        return repo_location
    return cache_location
def cache_dir(repository, path=None):
    """Return the cache directory for *repository*, or *path* if that is given (truthy)."""
    if path:
        return path
    return os.path.join(get_cache_dir(), repository.id_str)
class CacheConfig:
    """
    Manage the local cache's ``config`` file and the lock protecting the cache directory.

    Usable as a context manager: entering acquires the lock and loads the config,
    exiting releases the lock.
    """

    def __init__(self, repository, path=None, lock_wait=None):
        self.repository = repository
        self.path = cache_dir(repository, path)
        self.config_path = os.path.join(self.path, 'config')
        # lock is held while the cache config is open (see open/close)
        self.lock = None
        self.lock_wait = lock_wait

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def exists(self):
        """return True if the cache config file exists on disk"""
        return os.path.exists(self.config_path)

    def create(self):
        """Write a fresh, empty cache config. Must not already exist."""
        assert not self.exists()
        config = configparser.ConfigParser(interpolation=None)
        config.add_section('cache')
        config.set('cache', 'version', '1')
        config.set('cache', 'repository', self.repository.id_str)
        config.set('cache', 'manifest', '')
        config.add_section('integrity')
        config.set('integrity', 'manifest', '')
        with SaveFile(self.config_path) as fd:
            config.write(fd)

    def open(self):
        """Acquire the exclusive cache lock, then load the config."""
        self.lock = Lock(os.path.join(self.path, 'lock'), exclusive=True, timeout=self.lock_wait,
                         kill_stale_locks=hostname_is_unique()).acquire()
        self.load()

    def load(self):
        """Parse the config file and populate id/manifest_id/timestamp/key_type etc."""
        self._config = configparser.ConfigParser(interpolation=None)
        with open(self.config_path) as fd:
            self._config.read_file(fd)
        self._check_upgrade(self.config_path)
        self.id = self._config.get('cache', 'repository')
        self.manifest_id = unhexlify(self._config.get('cache', 'manifest'))
        self.timestamp = self._config.get('cache', 'timestamp', fallback=None)
        self.key_type = self._config.get('cache', 'key_type', fallback=None)
        self.ignored_features = set(parse_stringified_list(self._config.get('cache', 'ignored_features', fallback='')))
        self.mandatory_features = set(parse_stringified_list(self._config.get('cache', 'mandatory_features', fallback='')))
        try:
            self.integrity = dict(self._config.items('integrity'))
            if self._config.get('cache', 'manifest') != self.integrity.pop('manifest'):
                # The cache config file is updated (parsed with ConfigParser, the state of the ConfigParser
                # is modified and then written out.), not re-created.
                # Thus, older versions will leave our [integrity] section alone, making the section's data invalid.
                # Therefore, we also add the manifest ID to this section and
                # can discern whether an older version interfered by comparing the manifest IDs of this section
                # and the main [cache] section.
                self.integrity = {}
                logger.warning('Cache integrity data not available: old Borg version modified the cache.')
        except configparser.NoSectionError:
            logger.debug('Cache integrity: No integrity data found (files, chunks). Cache is from old version.')
            self.integrity = {}
        previous_location = self._config.get('cache', 'previous_location', fallback=None)
        if previous_location:
            self.previous_location = recanonicalize_relative_location(previous_location, self.repository)
        else:
            self.previous_location = None
        # remember the current location, so the next open() can detect a relocation
        self._config.set('cache', 'previous_location', self.repository._location.canonical_path())

    def save(self, manifest=None, key=None):
        """Write the (possibly updated) config back to disk, optionally recording
        the given manifest (id/timestamp/features/integrity) and key type."""
        if manifest:
            self._config.set('cache', 'manifest', manifest.id_str)
            self._config.set('cache', 'timestamp', manifest.timestamp)
            self._config.set('cache', 'ignored_features', ','.join(self.ignored_features))
            self._config.set('cache', 'mandatory_features', ','.join(self.mandatory_features))
            if not self._config.has_section('integrity'):
                self._config.add_section('integrity')
            for file, integrity_data in self.integrity.items():
                self._config.set('integrity', file, integrity_data)
            # see the comment in load() for why the manifest ID is duplicated here
            self._config.set('integrity', 'manifest', manifest.id_str)
        if key:
            self._config.set('cache', 'key_type', str(key.TYPE))
        with SaveFile(self.config_path) as fd:
            self._config.write(fd)

    def close(self):
        """Release the cache lock; safe to call more than once."""
        if self.lock is not None:
            self.lock.release()
            self.lock = None

    def _check_upgrade(self, config_path):
        """Validate the cache version; raise (after releasing the lock) on mismatch."""
        try:
            cache_version = self._config.getint('cache', 'version')
            wanted_version = 1
            if cache_version != wanted_version:
                self.close()
                raise Exception('%s has unexpected cache version %d (wanted: %d).' %
                                (config_path, cache_version, wanted_version))
        except configparser.NoSectionError:
            self.close()
            raise Exception('%s does not look like a Borg cache.' % config_path) from None
2015-03-17 18:03:36 -04:00
class Cache:
    """Client Side cache
    """

    class RepositoryIDNotUnique(Error):
        """Cache is newer than repository - do you have multiple, independently updated repos with same ID?"""

    class RepositoryReplay(Error):
        """Cache, or information obtained from the security directory is newer than repository - this is either an attack or unsafe (multiple repos with same ID)"""

    class CacheInitAbortedError(Error):
        """Cache initialization aborted"""

    class RepositoryAccessAborted(Error):
        """Repository access aborted"""

    class EncryptionMethodMismatch(Error):
        """Repository encryption method changed since last access, refusing to continue"""

    @staticmethod
    def break_lock(repository, path=None):
        """break a stale lock of the cache for ``repository`` or at ``path``"""
        path = cache_dir(repository, path)
        Lock(os.path.join(path, 'lock'), exclusive=True).break_lock()

    @staticmethod
    def destroy(repository, path=None):
        """destroy the cache for ``repository`` or at ``path``"""
        # use the cache_dir() helper for consistency with break_lock()
        # (previously this duplicated the os.path.join(get_cache_dir(), ...) logic inline)
        path = cache_dir(repository, path)
        config = os.path.join(path, 'config')
        if os.path.exists(config):
            os.remove(config)  # kill config first
            shutil.rmtree(path)

    def __new__(cls, repository, key, manifest, path=None, sync=True, warn_if_unencrypted=True,
                progress=False, lock_wait=None, permit_adhoc_cache=False, cache_mode=DEFAULT_FILES_CACHE_MODE):
        """Factory: return a LocalCache, or an AdHocCache when that is permitted
        and the local cache does not exist or is out of sync."""

        def local():
            return LocalCache(repository=repository, key=key, manifest=manifest, path=path, sync=sync,
                              warn_if_unencrypted=warn_if_unencrypted, progress=progress,
                              lock_wait=lock_wait, cache_mode=cache_mode)

        def adhoc():
            return AdHocCache(repository=repository, key=key, manifest=manifest, lock_wait=lock_wait)

        if not permit_adhoc_cache:
            return local()

        # ad-hoc cache may be permitted, but if the local cache is in sync it'd be stupid to invalidate
        # it by needlessly using the ad-hoc cache.
        # Check if the local cache exists and is in sync.
        cache_config = CacheConfig(repository, path, lock_wait)
        if cache_config.exists():
            with cache_config:
                cache_in_sync = cache_config.manifest_id == manifest.id
            # Don't nest cache locks
            if cache_in_sync:
                # Local cache is in sync, use it
                logger.debug('Cache: choosing local cache (in sync)')
                return local()
        logger.debug('Cache: choosing ad-hoc cache (local cache does not exist or is not in sync)')
        return adhoc()
class CacheStatsMixin:
    """Mixin adding chunk-index statistics and a formatted summary to cache classes.

    Requires the host class to provide ``self.chunks`` with a ``summarize()`` method.
    """

    str_format = """\
All archives:   {0.total_size:>20s} {0.total_csize:>20s} {0.unique_csize:>20s}

                       Unique chunks         Total chunks
Chunk index:    {0.total_unique_chunks:20d} {0.total_chunks:20d}"""

    Summary = namedtuple('Summary', ['total_size', 'total_csize', 'unique_size', 'unique_csize',
                                     'total_unique_chunks', 'total_chunks'])

    def __str__(self):
        return self.str_format.format(self.format_tuple())

    def stats(self):
        # XXX: this should really be moved down to `hashindex.pyx`
        return self.Summary(*self.chunks.summarize())._asdict()

    def format_tuple(self):
        stats = self.stats()
        for field in ('total_size', 'total_csize', 'unique_csize'):
            stats[field] = format_file_size(stats[field])
        return self.Summary(**stats)

    def chunks_stored_size(self):
        return self.stats()['unique_csize']
class LocalCache ( CacheStatsMixin ) :
"""
Persistent , local ( client - side ) cache .
"""
2018-03-07 21:39:38 -05:00
def __init__ ( self , repository , key , manifest , path = None , sync = True , warn_if_unencrypted = True ,
2018-03-07 21:20:56 -05:00
progress = False , lock_wait = None , cache_mode = DEFAULT_FILES_CACHE_MODE ) :
2016-03-15 10:38:55 -04:00
"""
: param warn_if_unencrypted : print warning if accessing unknown unencrypted repository
2018-07-15 04:46:14 -04:00
: param lock_wait : timeout for lock acquisition ( int [ s ] or None [ wait forever ] )
2016-03-15 10:38:55 -04:00
: param sync : do : meth : ` . sync `
2018-03-07 21:20:56 -05:00
: param cache_mode : what shall be compared in the file stat infos vs . cached stat infos comparison
2016-03-15 10:38:55 -04:00
"""
2013-06-20 06:44:58 -04:00
self . repository = repository
2011-07-30 15:13:48 -04:00
self . key = key
2011-09-04 17:02:47 -04:00
self . manifest = manifest
2016-11-26 15:15:59 -05:00
self . progress = progress
2018-03-07 21:20:56 -05:00
self . cache_mode = cache_mode
2017-05-10 09:30:51 -04:00
self . timestamp = None
self . txn_active = False
self . path = cache_dir ( repository , path )
self . security_manager = SecurityManager ( repository )
self . cache_config = CacheConfig ( self . repository , self . path , lock_wait )
2015-04-13 16:35:09 -04:00
# Warn user before sending data to a never seen before unencrypted repository
2010-12-21 15:29:09 -05:00
if not os . path . exists ( self . path ) :
2017-05-10 09:30:51 -04:00
self . security_manager . assert_access_unknown ( warn_if_unencrypted , manifest , key )
2010-12-21 15:29:09 -05:00
self . create ( )
2017-05-10 09:30:51 -04:00
self . open ( )
2016-02-04 17:19:35 -05:00
try :
2017-05-10 09:30:51 -04:00
self . security_manager . assert_secure ( manifest , key , cache_config = self . cache_config )
2017-05-28 12:04:33 -04:00
if not self . check_cache_compatibility ( ) :
self . wipe_cache ( )
self . update_compatibility ( )
2017-05-10 09:30:51 -04:00
if sync and self . manifest . id != self . cache_config . manifest_id :
2016-02-04 17:19:35 -05:00
self . sync ( )
self . commit ( )
except :
self . close ( )
raise
2010-03-06 12:25:35 -05:00
2016-01-16 19:09:13 -05:00
    def __enter__(self):
        # context manager entry; the cache was already opened by __init__
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        # always release the cache (config + lock), regardless of exceptions
        self.close()
2010-12-21 15:29:09 -05:00
def create ( self ) :
2015-03-09 11:02:06 -04:00
""" Create a new empty cache at `self.path`
2010-12-21 15:29:09 -05:00
"""
2011-01-04 17:00:39 -05:00
os . makedirs ( self . path )
2013-06-03 07:45:48 -04:00
with open ( os . path . join ( self . path , ' README ' ) , ' w ' ) as fd :
2016-11-11 15:24:16 -05:00
fd . write ( CACHE_README )
2017-05-10 09:30:51 -04:00
self . cache_config . create ( )
2017-05-25 07:43:15 -04:00
ChunkIndex ( ) . write ( os . path . join ( self . path , ' chunks ' ) )
2015-08-29 21:03:48 -04:00
os . makedirs ( os . path . join ( self . path , ' chunks.archive.d ' ) )
2017-10-14 15:57:58 -04:00
with SaveFile ( os . path . join ( self . path , ' files ' ) , binary = True ) :
2012-10-17 05:40:23 -04:00
pass # empty file
2010-12-21 15:29:09 -05:00
2016-10-18 15:36:23 -04:00
    def _do_open(self):
        # (re)load the cache config and the integrity-checked chunks index;
        # load the files cache unless it is disabled by cache_mode
        self.cache_config.load()
        with IntegrityCheckedFile(path=os.path.join(self.path, 'chunks'), write=False,
                                  integrity_data=self.cache_config.integrity.get('chunks')) as fd:
            self.chunks = ChunkIndex.read(fd)
        if 'd' in self.cache_mode:  # d(isabled)
            self.files = None
        else:
            self._read_files()
2011-07-02 14:39:35 -04:00
2017-05-10 09:30:51 -04:00
def open ( self ) :
2015-04-19 17:45:05 -04:00
if not os . path . isdir ( self . path ) :
2015-05-09 12:40:55 -04:00
raise Exception ( ' %s Does not look like a Borg cache ' % self . path )
2017-05-10 09:30:51 -04:00
self . cache_config . open ( )
2015-04-19 17:45:05 -04:00
self . rollback ( )
2013-06-24 16:41:05 -04:00
def close ( self ) :
2017-05-10 09:30:51 -04:00
if self . cache_config is not None :
self . cache_config . close ( )
self . cache_config = None
2013-06-24 16:41:05 -04:00
2011-07-02 14:39:35 -04:00
    def _read_files(self):
        """Load the on-disk files cache into self.files; on any problem, log a
        warning and continue with an empty files cache (slower, but safe)."""
        self.files = {}
        self._newest_cmtime = None
        logger.debug('Reading files cache ...')
        msg = None
        try:
            with IntegrityCheckedFile(path=os.path.join(self.path, 'files'), write=False,
                                      integrity_data=self.cache_config.integrity.get('files')) as fd:
                u = msgpack.Unpacker(use_list=True)
                while True:
                    # stream the file in chunks through the msgpack unpacker
                    data = fd.read(64 * 1024)
                    if not data:
                        break
                    u.feed(data)
                    try:
                        for path_hash, item in u:
                            entry = FileCacheEntry(*item)
                            # in the end, this takes about 240 Bytes per file
                            # age is bumped on load; entries that get too old are expired later
                            self.files[path_hash] = msgpack.packb(entry._replace(age=entry.age + 1))
                    except (TypeError, ValueError) as exc:
                        msg = "The files cache seems invalid. [%s]" % str(exc)
                        break
        except OSError as exc:
            msg = "The files cache can't be read. [%s]" % str(exc)
        except FileIntegrityError as fie:
            msg = "The files cache is corrupted. [%s]" % str(fie)
        if msg is not None:
            logger.warning(msg)
            logger.warning('Continuing without files cache - expect lower performance.')
            self.files = {}
2010-12-21 15:29:09 -05:00
    def begin_txn(self):
        """Start a cache transaction: snapshot config, chunks and files into txn.tmp,
        then atomically rename it to txn.active (rollback restores from it)."""
        # Initialize transaction snapshot
        pi = ProgressIndicatorMessage(msgid='cache.begin_transaction')
        txn_dir = os.path.join(self.path, 'txn.tmp')
        os.mkdir(txn_dir)
        pi.output('Initializing cache transaction: Reading config')
        shutil.copy(os.path.join(self.path, 'config'), txn_dir)
        pi.output('Initializing cache transaction: Reading chunks')
        shutil.copy(os.path.join(self.path, 'chunks'), txn_dir)
        pi.output('Initializing cache transaction: Reading files')
        shutil.copy(os.path.join(self.path, 'files'), txn_dir)
        # the rename marks the transaction as active in one atomic step
        os.rename(os.path.join(self.path, 'txn.tmp'),
                  os.path.join(self.path, 'txn.active'))
        self.txn_active = True
        pi.finish()
2010-12-21 15:29:09 -05:00
def commit(self):
    """Commit the active transaction: persist files cache, chunks index and
    cache config to disk, then discard the snapshot directory.

    A no-op if no transaction is active.
    """
    if not self.txn_active:
        return
    self.security_manager.save(self.manifest, self.key)
    pi = ProgressIndicatorMessage(msgid='cache.commit')
    if self.files is not None:
        if self._newest_cmtime is None:
            # was never set because no files were modified/added
            self._newest_cmtime = 2 ** 63 - 1  # nanoseconds, good until y2262
        ttl = int(os.environ.get('BORG_FILES_CACHE_TTL', 20))
        pi.output('Saving files cache')
        with IntegrityCheckedFile(path=os.path.join(self.path, 'files'), write=True) as fd:
            for path_hash, item in self.files.items():
                entry = FileCacheEntry(*msgpack.unpackb(item))
                # Keep an entry if it was seen in this backup (age == 0) and is
                # older than the newest cmtime observed in this backup - this
                # avoids trouble with filesystem snapshots and cmtime granularity.
                # Also keep entries from older backups while they are younger
                # than BORG_FILES_CACHE_TTL.
                seen_and_settled = entry.age == 0 and bigint_to_int(entry.cmtime) < self._newest_cmtime
                still_within_ttl = 0 < entry.age < ttl
                if seen_and_settled or still_within_ttl:
                    msgpack.pack((path_hash, entry), fd)
        self.cache_config.integrity['files'] = fd.integrity_data
    pi.output('Saving chunks cache')
    with IntegrityCheckedFile(path=os.path.join(self.path, 'chunks'), write=True) as fd:
        self.chunks.write(fd)
    self.cache_config.integrity['chunks'] = fd.integrity_data
    pi.output('Saving cache config')
    self.cache_config.save(self.manifest, self.key)
    # Demote the snapshot back to txn.tmp, then remove it.
    os.rename(os.path.join(self.path, 'txn.active'),
              os.path.join(self.path, 'txn.tmp'))
    shutil.rmtree(os.path.join(self.path, 'txn.tmp'))
    self.txn_active = False
    pi.finish()
def rollback(self):
    """Roll back partial and aborted transactions.

    Any incomplete snapshot (txn.tmp) is discarded; a completed snapshot
    (txn.active) is copied back over the live cache files before being
    discarded as well. Finally the cache is re-opened from disk.
    """
    tmp_dir = os.path.join(self.path, 'txn.tmp')
    # A txn.tmp directory is an unfinished snapshot - just drop it.
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    # A txn.active directory is a complete snapshot - restore from it.
    active_dir = os.path.join(self.path, 'txn.active')
    if os.path.exists(active_dir):
        for component in ('config', 'chunks', 'files'):
            shutil.copy(os.path.join(active_dir, component), self.path)
        os.rename(active_dir, tmp_dir)
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    self.txn_active = False
    self._do_open()
def sync(self):
    """Re-synchronize chunks cache with repository.

    Maintains a directory with known backup archive indexes, so it only
    needs to fetch infos from repo and build a chunk index once per backup
    archive.
    If out of sync, missing archive indexes get added, outdated indexes
    get removed and a new master chunks index is built by merging all
    archive indexes.
    """
    archive_path = os.path.join(self.path, 'chunks.archive.d')
    # An index of chunks whose size had to be fetched
    chunks_fetched_size_index = ChunkIndex()
    # Instrumentation counters, shared with the nested helpers via *nonlocal*.
    processed_item_metadata_bytes = 0
    processed_item_metadata_chunks = 0
    compact_chunks_archive_saved_space = 0
    fetched_chunks_for_csize = 0
    fetched_bytes_for_csize = 0

    def mkpath(id, suffix=''):
        # Path of the cached index file for archive *id* (optionally with a suffix).
        id_hex = bin_to_hex(id)
        path = os.path.join(archive_path, id_hex + suffix)
        return path

    def cached_archives():
        # IDs of archives that already have a cached (plain or compact) index on disk.
        if self.do_cache:
            fns = os.listdir(archive_path)
            # filenames with 64 hex digits == 256bit,
            # or compact indices which are 64 hex digits + ".compact"
            return set(unhexlify(fn) for fn in fns if len(fn) == 64) | \
                   set(unhexlify(fn[:64]) for fn in fns if len(fn) == 72 and fn.endswith('.compact'))
        else:
            return set()

    def repo_archives():
        # IDs of all archives currently listed in the repository manifest.
        return set(info.id for info in self.manifest.archives.list())

    def cleanup_outdated(ids):
        for id in ids:
            cleanup_cached_archive(id)

    def cleanup_cached_archive(id, cleanup_compact=True):
        # Remove the cached index files (and their .integrity sidecars) for *id*.
        try:
            os.unlink(mkpath(id))
            os.unlink(mkpath(id) + '.integrity')
        except FileNotFoundError:
            pass
        if not cleanup_compact:
            return
        try:
            os.unlink(mkpath(id, suffix='.compact'))
            os.unlink(mkpath(id, suffix='.compact') + '.integrity')
        except FileNotFoundError:
            pass

    def fetch_missing_csize(chunk_idx):
        """
        Archives created with AdHocCache will have csize = 0 in all chunk list entries whose
        chunks were already in the repository.

        Scan *chunk_idx* for entries where csize == 0 and fill in the correct information.
        """
        nonlocal fetched_chunks_for_csize
        nonlocal fetched_bytes_for_csize
        all_missing_ids = chunk_idx.zero_csize_ids()
        fetch_ids = []
        if len(chunks_fetched_size_index):
            # Reuse sizes fetched earlier in this sync run instead of re-fetching.
            for id_ in all_missing_ids:
                already_fetched_entry = chunks_fetched_size_index.get(id_)
                if already_fetched_entry:
                    entry = chunk_idx[id_]._replace(csize=already_fetched_entry.csize)
                    assert entry.size == already_fetched_entry.size, 'Chunk size mismatch'
                    chunk_idx[id_] = entry
                else:
                    fetch_ids.append(id_)
        else:
            fetch_ids = all_missing_ids

        # This is potentially a rather expensive operation, but it's hard to tell at this point
        # if it's a problem in practice (hence the experimental status of --no-cache-sync).
        for id_, data in zip(fetch_ids, decrypted_repository.repository.get_many(fetch_ids)):
            entry = chunk_idx[id_]._replace(csize=len(data))
            chunk_idx[id_] = entry
            chunks_fetched_size_index[id_] = entry
            fetched_chunks_for_csize += 1
            fetched_bytes_for_csize += len(data)

    def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx):
        # Fetch an archive's metadata from the repo and add all referenced
        # chunks (archive chunk, item metadata chunks, content chunks) to *chunk_idx*.
        nonlocal processed_item_metadata_bytes
        nonlocal processed_item_metadata_chunks
        csize, data = decrypted_repository.get(archive_id)
        chunk_idx.add(archive_id, 1, len(data), csize)
        archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
        if archive.version != 1:
            raise Exception('Unknown archive metadata version')
        sync = CacheSynchronizer(chunk_idx)
        for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
            chunk_idx.add(item_id, 1, len(data), csize)
            processed_item_metadata_bytes += len(data)
            processed_item_metadata_chunks += 1
            sync.feed(data)
        if self.do_cache:
            fetch_missing_csize(chunk_idx)
            write_archive_index(archive_id, chunk_idx)

    def write_archive_index(archive_id, chunk_idx):
        # Persist *chunk_idx* as a compact per-archive index, written to a
        # temp file first and renamed into place only on success.
        nonlocal compact_chunks_archive_saved_space
        compact_chunks_archive_saved_space += chunk_idx.compact()
        fn = mkpath(archive_id, suffix='.compact')
        fn_tmp = mkpath(archive_id, suffix='.tmp')
        try:
            with DetachedIntegrityCheckedFile(path=fn_tmp, write=True,
                                              filename=bin_to_hex(archive_id) + '.compact') as fd:
                chunk_idx.write(fd)
        except Exception:
            truncate_and_unlink(fn_tmp)
        else:
            os.rename(fn_tmp, fn)

    def read_archive_index(archive_id, archive_name):
        # Load a cached per-archive index; returns None (with EXIT_WARNING set)
        # if the cached index is corrupted.
        archive_chunk_idx_path = mkpath(archive_id)
        logger.info("Reading cached archive chunk index for %s ...", archive_name)
        try:
            try:
                # Attempt to load compact index first
                with DetachedIntegrityCheckedFile(path=archive_chunk_idx_path + '.compact', write=False) as fd:
                    archive_chunk_idx = ChunkIndex.read(fd, permit_compact=True)
                # In case a non-compact index exists, delete it.
                cleanup_cached_archive(archive_id, cleanup_compact=False)
                # Compact index read - return index, no conversion necessary (below).
                return archive_chunk_idx
            except FileNotFoundError:
                # No compact index found, load non-compact index, and convert below.
                with DetachedIntegrityCheckedFile(path=archive_chunk_idx_path, write=False) as fd:
                    archive_chunk_idx = ChunkIndex.read(fd)
        except FileIntegrityError as fie:
            logger.error('Cached archive chunk index of %s is corrupted: %s', archive_name, fie)
            # Delete corrupted index, set warning. A new index must be build.
            cleanup_cached_archive(archive_id)
            set_ec(EXIT_WARNING)
            return None
        # Convert to compact index. Delete the existing index first.
        logger.debug('Found non-compact index for %s, converting to compact.', archive_name)
        cleanup_cached_archive(archive_id)
        write_archive_index(archive_id, archive_chunk_idx)
        return archive_chunk_idx

    def get_archive_ids_to_names(archive_ids):
        # Pass once over all archives and build a mapping from ids to names.
        # The easier approach, doing a similar loop for each archive, has
        # square complexity and does about a dozen million functions calls
        # with 1100 archives (which takes 30s CPU seconds _alone_).
        archive_names = {}
        for info in self.manifest.archives.list():
            if info.id in archive_ids:
                archive_names[info.id] = info.name
        assert len(archive_names) == len(archive_ids)
        return archive_names

    def create_master_idx(chunk_idx):
        # Build the master chunks index, merging (or directly filling) the
        # per-archive indexes of all archives in the repository.
        logger.info('Synchronizing chunks cache...')
        cached_ids = cached_archives()
        archive_ids = repo_archives()
        logger.info('Archives: %d, w/ cached Idx: %d, w/ outdated Idx: %d, w/o cached Idx: %d.',
                    len(archive_ids), len(cached_ids),
                    len(cached_ids - archive_ids), len(archive_ids - cached_ids))
        # deallocates old hashindex, creates empty hashindex:
        chunk_idx.clear()
        cleanup_outdated(cached_ids - archive_ids)
        # Explicitly set the usable initial hash table capacity to avoid performance issues
        # due to hash table "resonance".
        master_index_capacity = len(self.repository)
        if archive_ids:
            chunk_idx = None if not self.do_cache else ChunkIndex(usable=master_index_capacity)
            pi = ProgressIndicatorPercent(total=len(archive_ids), step=0.1,
                                          msg='%3.0f%% Syncing chunks cache. Processing archive %s',
                                          msgid='cache.sync')
            archive_ids_to_names = get_archive_ids_to_names(archive_ids)
            for archive_id, archive_name in archive_ids_to_names.items():
                pi.show(info=[remove_surrogates(archive_name)])
                if self.do_cache:
                    if archive_id in cached_ids:
                        archive_chunk_idx = read_archive_index(archive_id, archive_name)
                        if archive_chunk_idx is None:
                            cached_ids.remove(archive_id)
                    if archive_id not in cached_ids:
                        # Do not make this an else branch; the FileIntegrityError exception handler
                        # above can remove *archive_id* from *cached_ids*.
                        logger.info('Fetching and building archive index for %s ...', archive_name)
                        archive_chunk_idx = ChunkIndex()
                        fetch_and_build_idx(archive_id, decrypted_repository, archive_chunk_idx)
                    logger.info("Merging into master chunks index ...")
                    chunk_idx.merge(archive_chunk_idx)
                else:
                    # Without per-archive caching, accumulate directly into the master index.
                    chunk_idx = chunk_idx or ChunkIndex(usable=master_index_capacity)
                    logger.info('Fetching archive index for %s ...', archive_name)
                    fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx)
            if not self.do_cache:
                fetch_missing_csize(chunk_idx)
            pi.finish()
            logger.debug('Cache sync: had to fetch %s (%d chunks) because no archive had a csize set for them '
                         '(due to --no-cache-sync)',
                         format_file_size(fetched_bytes_for_csize), fetched_chunks_for_csize)
            logger.debug('Cache sync: processed %s (%d chunks) of metadata',
                         format_file_size(processed_item_metadata_bytes), processed_item_metadata_chunks)
            logger.debug('Cache sync: compact chunks.archive.d storage saved %s bytes',
                         format_file_size(compact_chunks_archive_saved_space))
        logger.info('Done.')
        return chunk_idx

    def legacy_cleanup():
        """bring old cache dirs into the desired state (cleanup and adapt)"""
        # NOTE: best-effort, intentionally swallows all errors.
        try:
            os.unlink(os.path.join(self.path, 'chunks.archive'))
        except:
            pass
        try:
            os.unlink(os.path.join(self.path, 'chunks.archive.tmp'))
        except:
            pass
        try:
            os.mkdir(archive_path)
        except:
            pass

    # The cache can be used by a command that e.g. only checks against Manifest.Operation.WRITE,
    # which does not have to include all flags from Manifest.Operation.READ.
    # Since the sync will attempt to read archives, check compatibility with Manifest.Operation.READ.
    self.manifest.check_repository_compatibility((Manifest.Operation.READ, ))

    self.begin_txn()
    with cache_if_remote(self.repository, decrypted_cache=self.key) as decrypted_repository:
        legacy_cleanup()
        # TEMPORARY HACK: to avoid archive index caching, create a FILE named ~/.cache/borg/REPOID/chunks.archive.d -
        # this is only recommended if you have a fast, low latency connection to your repo (e.g. if repo is local disk)
        self.do_cache = os.path.isdir(archive_path)
        self.chunks = create_master_idx(self.chunks)
def check_cache_compatibility(self):
    """Return True if the on-disk cache can be safely used by this client."""
    supported = Manifest.SUPPORTED_REPO_FEATURES
    cfg = self.cache_config
    # The cache might not contain references of chunks that need a feature that is mandatory for some
    # operation and which this version supports. To avoid corruption while executing that operation,
    # a rebuild is forced.
    ignored_now_supported = bool(cfg.ignored_features & supported)
    # The cache was built with consideration to at least one feature that this version does not
    # understand; this client might misinterpret the cache, so a rebuild is forced.
    unknown_mandatory = not (cfg.mandatory_features <= supported)
    return not (ignored_now_supported or unknown_mandatory)
def wipe_cache(self):
    """Discard all cached state (archive indexes, chunks index, files cache, config)."""
    logger.warning("Discarding incompatible cache and forcing a cache rebuild")
    archive_dir = os.path.join(self.path, 'chunks.archive.d')
    if os.path.isdir(archive_dir):
        # Recreate the per-archive index directory empty.
        shutil.rmtree(archive_dir)
        os.makedirs(archive_dir)
    self.chunks = ChunkIndex()
    # Truncate the files cache to an empty file.
    with SaveFile(os.path.join(self.path, 'files'), binary=True):
        pass
    cfg = self.cache_config
    cfg.manifest_id = ''
    cfg._config.set('cache', 'manifest', '')
    cfg.ignored_features = set()
    cfg.mandatory_features = set()
def update_compatibility(self):
    """Record which of the repository's mandatory features we ignore vs. honour."""
    feature_map = self.manifest.get_all_mandatory_features()
    supported = Manifest.SUPPORTED_REPO_FEATURES
    # Union of the mandatory features over all operations.
    repo_features = set().union(*feature_map.values())
    self.cache_config.ignored_features.update(repo_features - supported)
    self.cache_config.mandatory_features.update(repo_features & supported)
def add_chunk(self, id, chunk, stats, overwrite=False, wait=True):
    """Store *chunk* under *id* in the repository (or just bump its refcount
    if already present and *overwrite* is False) and account it in *stats*.

    Returns a ChunkListEntry(id, size, csize).
    """
    if not self.txn_active:
        self.begin_txn()
    plaintext_size = len(chunk)
    refcount = self.seen_chunk(id, plaintext_size)
    if refcount and not overwrite:
        # Chunk is already stored - just take another reference.
        return self.chunk_incref(id, stats)
    encrypted = self.key.encrypt(chunk)
    stored_size = len(encrypted)
    self.repository.put(id, encrypted, wait=wait)
    self.chunks.add(id, 1, plaintext_size, stored_size)
    stats.update(plaintext_size, stored_size, not refcount)
    return ChunkListEntry(id, plaintext_size, stored_size)
def seen_chunk(self, id, size=None):
    """Return the reference count of chunk *id*, 0 if it is unknown.

    If *size* is given and a stored size is known, both must agree;
    a mismatch is either a hash collision (unlikely), corruption or a bug.
    """
    entry = self.chunks.get(id, ChunkIndexEntry(0, None, None))
    refcount, stored_size = entry[0], entry[1]
    if None not in (size, stored_size) and size != stored_size:
        raise Exception("chunk has same id [%r], but different size (stored: %d new: %d)!" % (
            id, stored_size, size))
    return refcount
def chunk_incref(self, id, stats, size=None):
    """Take another reference on chunk *id* and account it in *stats*.

    *size* is accepted for interface compatibility; this implementation
    relies on the size stored in the chunks index instead.
    """
    if not self.txn_active:
        self.begin_txn()
    refcount, stored_size, stored_csize = self.chunks.incref(id)
    stats.update(stored_size, stored_csize, False)
    return ChunkListEntry(id, stored_size, stored_csize)
def chunk_decref(self, id, stats, wait=True):
    """Drop one reference to chunk *id*; delete it from the repository when
    the reference count reaches zero. Accounting goes into *stats*.
    """
    if not self.txn_active:
        self.begin_txn()
    remaining, size, csize = self.chunks.decref(id)
    freed = remaining == 0
    if freed:
        # Last reference gone - remove the index entry and the repo object.
        del self.chunks[id]
        self.repository.delete(id, wait=wait)
    stats.update(-size, -csize, freed)
def file_known_and_unchanged(self, path_hash, st):
    """
    Check if we know the file that has this path_hash (know == it is in our files cache) and
    whether it is unchanged (the size/inode number/cmtime is same for stuff we check in this cache_mode).

    :param path_hash: hash(file_path), to save some memory in the files cache
    :param st: the file's stat() result
    :return: known, ids (known is True if we have infos about this file in the cache,
                         ids is the list of chunk ids IF the file has not changed, otherwise None).
    """
    cache_mode = self.cache_mode
    if 'd' in cache_mode or not stat.S_ISREG(st.st_mode):  # d(isabled)
        # Cache disabled, or not a regular file: never use the files cache.
        return False, None
    # note: r(echunk) does not need the files cache in this method, but the files cache will
    # be updated and saved to disk to memorize the files. To preserve previous generations in
    # the cache, this means that it also needs to get loaded from disk first.
    if 'r' in cache_mode:  # r(echunk)
        return False, None
    entry = self.files.get(path_hash)
    if not entry:
        return False, None
    # we know the file!
    entry = FileCacheEntry(*msgpack.unpackb(entry))
    if 's' in cache_mode and entry.size != st.st_size:  # s(ize)
        return True, None
    if 'i' in cache_mode and entry.inode != st.st_ino:  # i(node)
        return True, None
    # entry.cmtime holds either ctime or mtime (ns), depending on which of the
    # mutually exclusive 'c' / 'm' modes was active when the entry was memorized.
    if 'c' in cache_mode and bigint_to_int(entry.cmtime) != st.st_ctime_ns:  # c(time)
        return True, None
    elif 'm' in cache_mode and bigint_to_int(entry.cmtime) != st.st_mtime_ns:  # m(time)
        return True, None
    # we ignored the inode number in the comparison above or it is still same.
    # if it is still the same, replacing it in the tuple doesn't change it.
    # if we ignored it, a reason for doing that is that files were moved to a new
    # disk / new fs (so a one-time change of inode number is expected) and we wanted
    # to avoid everything getting chunked again. to be able to re-enable the inode
    # number comparison in a future backup run (and avoid chunking everything
    # again at that time), we need to update the inode number in the cache with what
    # we see in the filesystem.
    self.files[path_hash] = msgpack.packb(entry._replace(inode=st.st_ino, age=0))
    return True, entry.chunk_ids
def memorize_file ( self , path_hash , st , ids ) :
cache_mode = self . cache_mode
implement files cache mode control, fixes #911
You can now control the files cache mode using this option:
--files-cache={ctime,mtime,size,inode,rechunk,disabled}*
(only some combinations are supported)
Previously, only these modes were supported:
- mtime,size,inode (default of borg < 1.1.0rc4)
- mtime,size (by using --ignore-inode)
- disabled (by using --no-files-cache)
Now, you additionally get:
- ctime alternatively to mtime (more safe), e.g.:
ctime,size,inode (this is the new default of borg >= 1.1.0rc4)
- rechunk (consider all files as changed, rechunk them)
Deprecated:
- --ignore-inodes (use modes without "inode")
- --no-files-cache (use "disabled" mode)
The tests needed some changes:
- previously, we use os.utime() to set a files mtime (atime) to specific
values, but that does not work for ctime.
- now use time.sleep() to create the "latest file" that usually does
not end up in the files cache (see FAQ)
2017-09-10 20:54:52 -04:00
# note: r(echunk) modes will update the files cache, d(isabled) mode won't
2018-03-07 21:39:38 -05:00
if ' d ' in cache_mode or not stat . S_ISREG ( st . st_mode ) :
2015-03-08 10:01:24 -04:00
return
implement files cache mode control, fixes #911
You can now control the files cache mode using this option:
--files-cache={ctime,mtime,size,inode,rechunk,disabled}*
(only some combinations are supported)
Previously, only these modes were supported:
- mtime,size,inode (default of borg < 1.1.0rc4)
- mtime,size (by using --ignore-inode)
- disabled (by using --no-files-cache)
Now, you additionally get:
- ctime alternatively to mtime (more safe), e.g.:
ctime,size,inode (this is the new default of borg >= 1.1.0rc4)
- rechunk (consider all files as changed, rechunk them)
Deprecated:
- --ignore-inodes (use modes without "inode")
- --no-files-cache (use "disabled" mode)
The tests needed some changes:
- previously, we use os.utime() to set a files mtime (atime) to specific
values, but that does not work for ctime.
- now use time.sleep() to create the "latest file" that usually does
not end up in the files cache (see FAQ)
2017-09-10 20:54:52 -04:00
if ' c ' in cache_mode :
cmtime_ns = safe_ns ( st . st_ctime_ns )
elif ' m ' in cache_mode :
cmtime_ns = safe_ns ( st . st_mtime_ns )
entry = FileCacheEntry ( age = 0 , inode = st . st_ino , size = st . st_size , cmtime = int_to_bigint ( cmtime_ns ) , chunk_ids = ids )
2016-04-16 11:48:47 -04:00
self . files [ path_hash ] = msgpack . packb ( entry )
implement files cache mode control, fixes #911
You can now control the files cache mode using this option:
--files-cache={ctime,mtime,size,inode,rechunk,disabled}*
(only some combinations are supported)
Previously, only these modes were supported:
- mtime,size,inode (default of borg < 1.1.0rc4)
- mtime,size (by using --ignore-inode)
- disabled (by using --no-files-cache)
Now, you additionally get:
- ctime alternatively to mtime (more safe), e.g.:
ctime,size,inode (this is the new default of borg >= 1.1.0rc4)
- rechunk (consider all files as changed, rechunk them)
Deprecated:
- --ignore-inodes (use modes without "inode")
- --no-files-cache (use "disabled" mode)
The tests needed some changes:
- previously, we use os.utime() to set a files mtime (atime) to specific
values, but that does not work for ctime.
- now use time.sleep() to create the "latest file" that usually does
not end up in the files cache (see FAQ)
2017-09-10 20:54:52 -04:00
self . _newest_cmtime = max ( self . _newest_cmtime or 0 , cmtime_ns )
2017-06-10 11:59:41 -04:00
class AdHocCache(CacheStatsMixin):
    """
    Ad-hoc, non-persistent cache.
    Compared to the standard LocalCache the AdHocCache does not maintain accurate reference count,
    nor does it provide a files cache (which would require persistence). Chunks that were not added
    during the current AdHocCache lifetime won't have correct size/csize set (0 bytes) and will
    have an infinite reference count (MAX_VALUE).
    """

    # Stats template for CacheStatsMixin: most figures are "unknown" because
    # no persistent chunk index with accurate refcounts/sizes is maintained.
    str_format = """\
All archives:                unknown              unknown              unknown

                       Unique chunks         Total chunks
Chunk index:    {0.total_unique_chunks:20d}             unknown"""
2018-07-15 04:46:14 -04:00
def __init__ ( self , repository , key , manifest , warn_if_unencrypted = True , lock_wait = None ) :
2017-06-10 11:59:41 -04:00
self . repository = repository
self . key = key
self . manifest = manifest
self . _txn_active = False
self . security_manager = SecurityManager ( repository )
2018-07-15 04:46:14 -04:00
self . security_manager . assert_secure ( manifest , key , lock_wait = lock_wait )
2017-06-10 11:59:41 -04:00
logger . warning ( ' Note: --no-cache-sync is an experimental feature. ' )
    # Public API

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release: this cache holds no persistent resources.
        pass

    # No files cache is kept (it would require persistence).
    files = None
    # Files cache mode is fixed to 'd'(isabled); see file_known_and_unchanged /
    # memorize_file below, which are no-ops accordingly.
    cache_mode = 'd'
2017-06-10 11:59:41 -04:00
2018-03-07 22:10:43 -05:00
def file_known_and_unchanged ( self , path_hash , st ) :
2018-02-23 08:48:24 -05:00
return False , None
2017-06-10 11:59:41 -04:00
2018-03-07 21:20:56 -05:00
def memorize_file ( self , path_hash , st , ids ) :
2017-06-10 11:59:41 -04:00
pass
def add_chunk ( self , id , chunk , stats , overwrite = False , wait = True ) :
assert not overwrite , ' AdHocCache does not permit overwrites — trying to use it for recreate? '
if not self . _txn_active :
2017-07-30 15:50:57 -04:00
self . begin_txn ( )
2017-06-10 11:59:41 -04:00
size = len ( chunk )
refcount = self . seen_chunk ( id , size )
if refcount :
2017-07-23 07:48:45 -04:00
return self . chunk_incref ( id , stats , size = size )
2017-06-10 11:59:41 -04:00
data = self . key . encrypt ( chunk )
csize = len ( data )
self . repository . put ( id , data , wait = wait )
self . chunks . add ( id , 1 , size , csize )
stats . update ( size , csize , not refcount )
return ChunkListEntry ( id , size , csize )
def seen_chunk ( self , id , size = None ) :
2017-06-11 05:29:41 -04:00
if not self . _txn_active :
2017-07-30 15:50:57 -04:00
self . begin_txn ( )
2017-06-11 05:29:41 -04:00
entry = self . chunks . get ( id , ChunkIndexEntry ( 0 , None , None ) )
if entry . refcount and size and not entry . size :
# The LocalCache has existing size information and uses *size* to make an effort at detecting collisions.
# This is of course not possible for the AdHocCache.
# Here *size* is used to update the chunk's size information, which will be zero for existing chunks.
self . chunks [ id ] = entry . _replace ( size = size )
return entry . refcount
2017-06-10 11:59:41 -04:00
2017-07-23 07:48:45 -04:00
def chunk_incref ( self , id , stats , size = None ) :
2017-06-10 11:59:41 -04:00
if not self . _txn_active :
2017-07-30 15:50:57 -04:00
self . begin_txn ( )
2017-07-23 07:48:45 -04:00
count , _size , csize = self . chunks . incref ( id )
# When _size is 0 and size is not given, then this chunk has not been locally visited yet (seen_chunk with
2017-06-11 14:11:34 -04:00
# size or add_chunk); we can't add references to those (size=0 is invalid) and generally don't try to.
2017-07-23 07:51:35 -04:00
size = _size or size
assert size
stats . update ( size , csize , False )
return ChunkListEntry ( id , size , csize )
2017-06-10 11:59:41 -04:00
def chunk_decref ( self , id , stats , wait = True ) :
if not self . _txn_active :
2017-07-30 15:50:57 -04:00
self . begin_txn ( )
2017-06-10 11:59:41 -04:00
count , size , csize = self . chunks . decref ( id )
if count == 0 :
del self . chunks [ id ]
self . repository . delete ( id , wait = wait )
stats . update ( - size , - csize , True )
else :
stats . update ( - size , - csize , False )
def commit ( self ) :
if not self . _txn_active :
return
self . security_manager . save ( self . manifest , self . key )
self . _txn_active = False
    def rollback(self):
        """Abort the current transaction, discarding the in-memory chunk index."""
        self._txn_active = False
        # Drop the chunk index built by begin_txn(); it will be rebuilt lazily
        # on the next chunk operation.  Raises AttributeError if no transaction
        # was ever begun — presumably callers only roll back active work
        # (NOTE(review): confirm against callers).
        del self.chunks
2017-07-30 15:50:57 -04:00
    def begin_txn(self):
        """Start a transaction: build an in-memory chunk index by downloading
        the full chunk id list from the repository.
        """
        self._txn_active = True
        # Explicitly set the initial usable hash table capacity to avoid performance issues
        # due to hash table "resonance".
        # Since we're creating an archive, add 10 % from the start.
        num_chunks = len(self.repository)
        self.chunks = ChunkIndex(usable=num_chunks * 1.1)
        pi = ProgressIndicatorPercent(total=num_chunks, msg='Downloading chunk list... %3.0f%%',
                                      msgid='cache.download_chunks')
        t0 = perf_counter()
        num_requests = 0
        marker = None
        # Page through the repository's chunk id list; *marker* is the last id
        # of the previous page and resumes listing after it.
        while True:
            result = self.repository.list(limit=LIST_SCAN_LIMIT, marker=marker)
            num_requests += 1
            if not result:
                break
            pi.show(increase=len(result))
            marker = result[-1]
            # All chunks from the repository have a refcount of MAX_VALUE, which is sticky,
            # therefore we can't/won't delete them. Chunks we added ourselves in this transaction
            # (e.g. checkpoint archives) are tracked correctly.
            init_entry = ChunkIndexEntry(refcount=ChunkIndex.MAX_VALUE, size=0, csize=0)
            for id_ in result:
                self.chunks[id_] = init_entry
        assert len(self.chunks) == num_chunks
        # LocalCache does not contain the manifest, either.
        del self.chunks[self.manifest.MANIFEST_ID]
        # Guard against a zero duration on very fast/empty listings.
        duration = perf_counter() - t0 or 0.01
        pi.finish()
        logger.debug('AdHocCache: downloaded %d chunk IDs in %.2f s (%d requests), ~%s/s',
                     num_chunks, duration, num_requests, format_file_size(num_chunks * 34 / duration))
        # Chunk IDs in a list are encoded in 34 bytes: 1 byte msgpack header, 1 byte length, 32 ID bytes.
        # Protocol overhead is neglected in this calculation.