Merge pull request #2342 from ThomasWaldmann/generic-hardlinks

Generic hardlinks
This commit is contained in:
enkore 2017-04-05 14:34:29 +02:00 committed by GitHub
commit 736a815972
5 changed files with 175 additions and 144 deletions

View file

@ -124,7 +124,6 @@ Features & platforms
Besides regular file and directory structures, |project_name| can preserve
* Hardlinks (considering all files in the same archive)
* Symlinks (stored as symlink, the symlink is not followed)
* Special files:
@ -132,6 +131,7 @@ Besides regular file and directory structures, |project_name| can preserve
* FIFOs ("named pipes")
* Special file *contents* can be backed up in ``--read-special`` mode.
By default the metadata to create them with mknod(2), mkfifo(2) etc. is stored.
* Hardlinked regular files, devices, FIFOs (considering all items in the same archive)
* Timestamps in nanosecond precision: mtime, atime, ctime
* Permissions:

View file

@ -25,6 +25,7 @@ from .compress import Compressor, CompressionSpec
from .constants import * # NOQA
from .hashindex import ChunkIndex, ChunkIndexEntry
from .helpers import Manifest
from .helpers import hardlinkable
from .helpers import ChunkIteratorFileWrapper, open_item
from .helpers import Error, IntegrityError, set_ec
from .helpers import uid2user, user2uid, gid2group, group2gid
@ -502,6 +503,26 @@ Utilization of max. archive size: {csize_max:.0%}
cache.rollback()
return stats
@contextmanager
def extract_helper(self, dest, item, path, stripped_components, original_path, hardlink_masters):
hardlink_set = False
# Hard link?
if 'source' in item:
source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:])
chunks, link_target = hardlink_masters.get(item.source, (None, source))
if link_target:
# Hard link was extracted previously, just link
with backup_io('link'):
os.link(link_target, path)
hardlink_set = True
elif chunks is not None:
# assign chunks to this item, since the item which had the chunks was not extracted
item.chunks = chunks
yield hardlink_set
if not hardlink_set and hardlink_masters:
# Update master entry with extracted item path, so that following hardlinks don't extract twice.
hardlink_masters[item.get('source') or original_path] = (None, path)
def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False,
hardlink_masters=None, stripped_components=0, original_path=None, pi=None):
"""
@ -566,51 +587,38 @@ Utilization of max. archive size: {csize_max:.0%}
if stat.S_ISREG(mode):
with backup_io('makedirs'):
make_parent(path)
# Hard link?
if 'source' in item:
source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:])
with backup_io('link'):
if item.source not in hardlink_masters:
os.link(source, path)
return
item.chunks, link_target = hardlink_masters[item.source]
if link_target:
# Hard link was extracted previously, just link
with backup_io:
os.link(link_target, path)
with self.extract_helper(dest, item, path, stripped_components, original_path,
hardlink_masters) as hardlink_set:
if hardlink_set:
return
# Extract chunks, since the item which had the chunks was not extracted
with backup_io('open'):
fd = open(path, 'wb')
if sparse and self.zeros is None:
self.zeros = b'\0' * (1 << self.chunker_params[1])
with fd:
ids = [c.id for c in item.chunks]
for data in self.pipeline.fetch_many(ids, is_preloaded=True):
if pi:
pi.show(increase=len(data), info=[remove_surrogates(item.path)])
with backup_io('write'):
if sparse and self.zeros.startswith(data):
# all-zero chunk: create a hole in a sparse file
fd.seek(len(data), 1)
else:
fd.write(data)
with backup_io('truncate'):
pos = item_chunks_size = fd.tell()
fd.truncate(pos)
fd.flush()
self.restore_attrs(path, item, fd=fd.fileno())
if 'size' in item:
item_size = item.size
if item_size != item_chunks_size:
logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format(
item.path, item_size, item_chunks_size))
if has_damaged_chunks:
logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' %
remove_surrogates(item.path))
if hardlink_masters:
# Update master entry with extracted file path, so that following hardlinks don't extract twice.
hardlink_masters[item.get('source') or original_path] = (None, path)
if sparse and self.zeros is None:
self.zeros = b'\0' * (1 << self.chunker_params[1])
with backup_io('open'):
fd = open(path, 'wb')
with fd:
ids = [c.id for c in item.chunks]
for data in self.pipeline.fetch_many(ids, is_preloaded=True):
if pi:
pi.show(increase=len(data), info=[remove_surrogates(item.path)])
with backup_io('write'):
if sparse and self.zeros.startswith(data):
# all-zero chunk: create a hole in a sparse file
fd.seek(len(data), 1)
else:
fd.write(data)
with backup_io('truncate_and_attrs'):
pos = item_chunks_size = fd.tell()
fd.truncate(pos)
fd.flush()
self.restore_attrs(path, item, fd=fd.fileno())
if 'size' in item:
item_size = item.size
if item_size != item_chunks_size:
logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format(
item.path, item_size, item_chunks_size))
if has_damaged_chunks:
logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' %
remove_surrogates(item.path))
return
with backup_io:
# No repository access beyond this point.
@ -630,12 +638,20 @@ Utilization of max. archive size: {csize_max:.0%}
self.restore_attrs(path, item, symlink=True)
elif stat.S_ISFIFO(mode):
make_parent(path)
os.mkfifo(path)
self.restore_attrs(path, item)
with self.extract_helper(dest, item, path, stripped_components, original_path,
hardlink_masters) as hardlink_set:
if hardlink_set:
return
os.mkfifo(path)
self.restore_attrs(path, item)
elif stat.S_ISCHR(mode) or stat.S_ISBLK(mode):
make_parent(path)
os.mknod(path, item.mode, item.rdev)
self.restore_attrs(path, item)
with self.extract_helper(dest, item, path, stripped_components, original_path,
hardlink_masters) as hardlink_set:
if hardlink_set:
return
os.mknod(path, item.mode, item.rdev)
self.restore_attrs(path, item)
else:
raise Exception('Unknown archive item type %r' % item.mode)
@ -834,34 +850,53 @@ Utilization of max. archive size: {csize_max:.0%}
attrs.update(self.stat_ext_attrs(st, path))
return attrs
def process_dir(self, path, st):
item = Item(path=make_path_safe(path))
item.update(self.stat_attrs(st, path))
@contextmanager
def create_helper(self, path, st, status=None, hardlinkable=True):
safe_path = make_path_safe(path)
item = Item(path=safe_path)
hardlink_master = False
hardlinked = hardlinkable and st.st_nlink > 1
if hardlinked:
source = self.hard_links.get((st.st_ino, st.st_dev))
if source is not None:
item.source = source
status = 'h' # hardlink (to already seen inodes)
else:
hardlink_master = True
yield item, status, hardlinked, hardlink_master
# if we get here, "with"-block worked ok without error/exception, the item was processed ok...
self.add_item(item)
return 'd' # directory
# ... and added to the archive, so we can remember it to refer to it later in the archive:
if hardlink_master:
self.hard_links[(st.st_ino, st.st_dev)] = safe_path
def process_dir(self, path, st):
with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked, hardlink_master):
item.update(self.stat_attrs(st, path))
return status
def process_fifo(self, path, st):
item = Item(path=make_path_safe(path))
item.update(self.stat_attrs(st, path))
self.add_item(item)
return 'f' # fifo
with self.create_helper(path, st, 'f') as (item, status, hardlinked, hardlink_master): # fifo
item.update(self.stat_attrs(st, path))
return status
def process_dev(self, path, st):
item = Item(path=make_path_safe(path), rdev=st.st_rdev)
item.update(self.stat_attrs(st, path))
self.add_item(item)
if stat.S_ISCHR(st.st_mode):
return 'c' # char device
elif stat.S_ISBLK(st.st_mode):
return 'b' # block device
def process_dev(self, path, st, dev_type):
with self.create_helper(path, st, dev_type) as (item, status, hardlinked, hardlink_master): # char/block device
item.rdev = st.st_rdev
item.update(self.stat_attrs(st, path))
return status
def process_symlink(self, path, st):
with backup_io('readlink'):
source = os.readlink(path)
item = Item(path=make_path_safe(path), source=source)
item.update(self.stat_attrs(st, path))
self.add_item(item)
return 's' # symlink
# note: using hardlinkable=False because we can not support hardlinked symlinks,
# due to the dual-use of item.source, see issue #2343:
with self.create_helper(path, st, 's', hardlinkable=False) as (item, status, hardlinked, hardlink_master):
with backup_io('readlink'):
source = os.readlink(path)
item.source = source
if st.st_nlink > 1:
logger.warning('hardlinked symlinks will be archived as non-hardlinked symlinks!')
item.update(self.stat_attrs(st, path))
return status
def write_part_file(self, item, from_chunk, number):
item = Item(internal_dict=item.as_dict())
@ -925,70 +960,54 @@ Utilization of max. archive size: {csize_max:.0%}
return 'i' # stdin
def process_file(self, path, st, cache, ignore_inode=False):
status = None
safe_path = make_path_safe(path)
# Is it a hard link?
if st.st_nlink > 1:
source = self.hard_links.get((st.st_ino, st.st_dev))
if source is not None:
item = Item(path=safe_path, source=source)
item.update(self.stat_attrs(st, path))
self.add_item(item)
status = 'h' # regular file, hardlink (to already seen inodes)
return status
is_special_file = is_special(st.st_mode)
if not is_special_file:
path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path)))
ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode)
else:
# in --read-special mode, we may be called for special files.
# there should be no information in the cache about special files processed in
# read-special mode, but we better play safe as this was wrong in the past:
path_hash = ids = None
first_run = not cache.files and cache.do_files
if first_run:
logger.debug('Processing files ...')
chunks = None
if ids is not None:
# Make sure all ids are available
for id_ in ids:
if not cache.seen_chunk(id_):
break
else:
chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids]
status = 'U' # regular file, unchanged
else:
status = 'A' # regular file, added
item = Item(
path=safe_path,
hardlink_master=st.st_nlink > 1, # item is a hard link and has the chunks
)
item.update(self.stat_simple_attrs(st))
# Only chunkify the file if needed
if chunks is not None:
item.chunks = chunks
else:
with backup_io('open'):
fh = Archive._open_rb(path)
with os.fdopen(fh, 'rb') as fd:
self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh)))
if not is_special_file:
# we must not memorize special files, because the contents of e.g. a
# block or char device will change without its mtime/size/inode changing.
cache.memorize_file(path_hash, st, [c.id for c in item.chunks])
status = status or 'M' # regular file, modified (if not 'A' already)
item.update(self.stat_attrs(st, path))
item.get_size(memorize=True)
if is_special_file:
# we processed a special file like a regular file. reflect that in mode,
# so it can be extracted / accessed in FUSE mount like a regular file:
item.mode = stat.S_IFREG | stat.S_IMODE(item.mode)
self.stats.nfiles += 1
self.add_item(item)
if st.st_nlink > 1 and source is None:
# Add the hard link reference *after* the file has been added to the archive.
self.hard_links[st.st_ino, st.st_dev] = safe_path
return status
with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet
is_special_file = is_special(st.st_mode)
if not hardlinked or hardlink_master:
if not is_special_file:
path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path)))
ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode)
else:
# in --read-special mode, we may be called for special files.
# there should be no information in the cache about special files processed in
# read-special mode, but we better play safe as this was wrong in the past:
path_hash = ids = None
first_run = not cache.files and cache.do_files
if first_run:
logger.debug('Processing files ...')
chunks = None
if ids is not None:
# Make sure all ids are available
for id_ in ids:
if not cache.seen_chunk(id_):
break
else:
chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids]
status = 'U' # regular file, unchanged
else:
status = 'A' # regular file, added
item.hardlink_master = hardlinked
item.update(self.stat_simple_attrs(st))
# Only chunkify the file if needed
if chunks is not None:
item.chunks = chunks
else:
with backup_io('open'):
fh = Archive._open_rb(path)
with os.fdopen(fh, 'rb') as fd:
self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh)))
if not is_special_file:
# we must not memorize special files, because the contents of e.g. a
# block or char device will change without its mtime/size/inode changing.
cache.memorize_file(path_hash, st, [c.id for c in item.chunks])
status = status or 'M' # regular file, modified (if not 'A' already)
self.stats.nfiles += 1
item.update(self.stat_attrs(st, path))
item.get_size(memorize=True)
if is_special_file:
# we processed a special file like a regular file. reflect that in mode,
# so it can be extracted / accessed in FUSE mount like a regular file:
item.mode = stat.S_IFREG | stat.S_IMODE(item.mode)
return status
@staticmethod
def list_archives(repository, key, manifest, cache=None):
@ -1605,7 +1624,7 @@ class ArchiveRecreater:
def item_is_hardlink_master(item):
return (target_is_subset and
stat.S_ISREG(item.mode) and
hardlinkable(item.mode) and
item.get('hardlink_master', True) and
'source' not in item)
@ -1615,7 +1634,7 @@ class ArchiveRecreater:
if item_is_hardlink_master(item):
hardlink_masters[item.path] = (item.get('chunks'), None)
continue
if target_is_subset and stat.S_ISREG(item.mode) and item.get('source') in hardlink_masters:
if target_is_subset and hardlinkable(item.mode) and item.get('source') in hardlink_masters:
# master of this hard link is outside the target subset
chunks, new_source = hardlink_masters[item.source]
if new_source is None:

View file

@ -48,6 +48,7 @@ from .helpers import prune_within, prune_split
from .helpers import to_localtime, timestamp
from .helpers import get_cache_dir
from .helpers import Manifest
from .helpers import hardlinkable
from .helpers import StableDict
from .helpers import check_extension_modules
from .helpers import ArgparsePatternAction, ArgparseExcludeFileAction, ArgparsePatternFileAction, parse_exclude_pattern
@ -555,10 +556,16 @@ class Archiver:
status = archive.process_fifo(path, st)
else:
status = archive.process_file(path, st, cache)
elif stat.S_ISCHR(st.st_mode) or stat.S_ISBLK(st.st_mode):
elif stat.S_ISCHR(st.st_mode):
if not dry_run:
if not read_special:
status = archive.process_dev(path, st)
status = archive.process_dev(path, st, 'c')
else:
status = archive.process_file(path, st, cache)
elif stat.S_ISBLK(st.st_mode):
if not dry_run:
if not read_special:
status = archive.process_dev(path, st, 'b')
else:
status = archive.process_file(path, st, cache)
elif stat.S_ISSOCK(st.st_mode):
@ -621,7 +628,7 @@ class Archiver:
hardlink_masters = {} if partial_extract else None
def peek_and_store_hardlink_masters(item, matched):
if (partial_extract and not matched and stat.S_ISREG(item.mode) and
if (partial_extract and not matched and hardlinkable(item.mode) and
item.get('hardlink_master', True) and 'source' not in item):
hardlink_masters[item.get('path')] = (item.get('chunks'), None)
@ -713,7 +720,7 @@ class Archiver:
return [None]
def has_hardlink_master(item, hardlink_masters):
return stat.S_ISREG(item.mode) and item.get('source') in hardlink_masters
return hardlinkable(item.mode) and item.get('source') in hardlink_masters
def compare_link(item1, item2):
# These are the simple link cases. For special cases, e.g. if a
@ -809,7 +816,7 @@ class Archiver:
def compare_archives(archive1, archive2, matcher):
def hardlink_master_seen(item):
return 'source' not in item or not stat.S_ISREG(item.mode) or item.source in hardlink_masters
return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters
def is_hardlink_master(item):
return item.get('hardlink_master', True) and 'source' not in item

View file

@ -16,7 +16,7 @@ from .logger import create_logger
logger = create_logger()
from .archive import Archive
from .helpers import daemonize
from .helpers import daemonize, hardlinkable
from .item import Item
from .lrucache import LRUCache
@ -193,7 +193,7 @@ class FuseOperations(llfuse.Operations):
path = item.path
del item.path # safe some space
if 'source' in item and stat.S_ISREG(item.mode):
if 'source' in item and hardlinkable(item.mode):
# a hardlink, no contents, <source> is the hardlink master
source = os.fsencode(os.path.normpath(item.source))
if self.versions:

View file

@ -1987,6 +1987,11 @@ def file_status(mode):
return '?'
def hardlinkable(mode):
"""return True if we support hardlinked items of this type"""
return stat.S_ISREG(mode) or stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode)
def chunkit(it, size):
"""
Chunk an iterator <it> into pieces of <size>.