From 66f4cd1a29868f96881bd1b08d77e954b7ba2c58 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 26 Mar 2017 06:01:13 +0200 Subject: [PATCH 01/12] minor refactor for regular file hardlink processing --- src/borg/archive.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 995e0e4fd..996031b0a 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -927,15 +927,19 @@ Utilization of max. archive size: {csize_max:.0%} def process_file(self, path, st, cache, ignore_inode=False): status = None safe_path = make_path_safe(path) - # Is it a hard link? - if st.st_nlink > 1: + item = Item(path=safe_path) + hardlink_master = False + hardlinked = st.st_nlink > 1 + if hardlinked: source = self.hard_links.get((st.st_ino, st.st_dev)) if source is not None: - item = Item(path=safe_path, source=source) + item.source = source item.update(self.stat_attrs(st, path)) self.add_item(item) status = 'h' # regular file, hardlink (to already seen inodes) return status + else: + hardlink_master = True is_special_file = is_special(st.st_mode) if not is_special_file: path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path))) @@ -959,10 +963,7 @@ Utilization of max. archive size: {csize_max:.0%} status = 'U' # regular file, unchanged else: status = 'A' # regular file, added - item = Item( - path=safe_path, - hardlink_master=st.st_nlink > 1, # item is a hard link and has the chunks - ) + item.hardlink_master = hardlinked item.update(self.stat_simple_attrs(st)) # Only chunkify the file if needed if chunks is not None: @@ -985,7 +986,7 @@ Utilization of max. archive size: {csize_max:.0%} item.mode = stat.S_IFREG | stat.S_IMODE(item.mode) self.stats.nfiles += 1 self.add_item(item) - if st.st_nlink > 1 and source is None: + if hardlinked and hardlink_master: # Add the hard link reference *after* the file has been added to the archive. 
self.hard_links[st.st_ino, st.st_dev] = safe_path return status From a206a85890bf8b0af414926257087b29ec3869bb Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 26 Mar 2017 06:07:02 +0200 Subject: [PATCH 02/12] indent block, no semantics change --- src/borg/archive.py | 73 +++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 996031b0a..c5124b6d5 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -941,43 +941,44 @@ Utilization of max. archive size: {csize_max:.0%} else: hardlink_master = True is_special_file = is_special(st.st_mode) - if not is_special_file: - path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path))) - ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode) - else: - # in --read-special mode, we may be called for special files. - # there should be no information in the cache about special files processed in - # read-special mode, but we better play safe as this was wrong in the past: - path_hash = ids = None - first_run = not cache.files and cache.do_files - if first_run: - logger.debug('Processing files ...') - chunks = None - if ids is not None: - # Make sure all ids are available - for id_ in ids: - if not cache.seen_chunk(id_): - break - else: - chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids] - status = 'U' # regular file, unchanged - else: - status = 'A' # regular file, added - item.hardlink_master = hardlinked - item.update(self.stat_simple_attrs(st)) - # Only chunkify the file if needed - if chunks is not None: - item.chunks = chunks - else: - with backup_io('open'): - fh = Archive._open_rb(path) - with os.fdopen(fh, 'rb') as fd: - self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh))) + if True: if not is_special_file: - # we must not memorize special files, because the contents of e.g. 
a - # block or char device will change without its mtime/size/inode changing. - cache.memorize_file(path_hash, st, [c.id for c in item.chunks]) - status = status or 'M' # regular file, modified (if not 'A' already) + path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path))) + ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode) + else: + # in --read-special mode, we may be called for special files. + # there should be no information in the cache about special files processed in + # read-special mode, but we better play safe as this was wrong in the past: + path_hash = ids = None + first_run = not cache.files and cache.do_files + if first_run: + logger.debug('Processing files ...') + chunks = None + if ids is not None: + # Make sure all ids are available + for id_ in ids: + if not cache.seen_chunk(id_): + break + else: + chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids] + status = 'U' # regular file, unchanged + else: + status = 'A' # regular file, added + item.hardlink_master = hardlinked + item.update(self.stat_simple_attrs(st)) + # Only chunkify the file if needed + if chunks is not None: + item.chunks = chunks + else: + with backup_io('open'): + fh = Archive._open_rb(path) + with os.fdopen(fh, 'rb') as fd: + self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh))) + if not is_special_file: + # we must not memorize special files, because the contents of e.g. a + # block or char device will change without its mtime/size/inode changing. 
+ cache.memorize_file(path_hash, st, [c.id for c in item.chunks]) + status = status or 'M' # regular file, modified (if not 'A' already) item.update(self.stat_attrs(st, path)) item.get_size(memorize=True) if is_special_file: From e5d094d0ceafcbc5ff781a2285c842c10453c48b Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 26 Mar 2017 06:15:36 +0200 Subject: [PATCH 03/12] use same finalizing code for hardlink masters and slaves hardlink slaves get a precomputed size attribute now. --- src/borg/archive.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index c5124b6d5..e38049745 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -934,14 +934,11 @@ Utilization of max. archive size: {csize_max:.0%} source = self.hard_links.get((st.st_ino, st.st_dev)) if source is not None: item.source = source - item.update(self.stat_attrs(st, path)) - self.add_item(item) status = 'h' # regular file, hardlink (to already seen inodes) - return status else: hardlink_master = True is_special_file = is_special(st.st_mode) - if True: + if not hardlinked or hardlink_master: if not is_special_file: path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path))) ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode) @@ -979,15 +976,15 @@ Utilization of max. archive size: {csize_max:.0%} # block or char device will change without its mtime/size/inode changing. cache.memorize_file(path_hash, st, [c.id for c in item.chunks]) status = status or 'M' # regular file, modified (if not 'A' already) + self.stats.nfiles += 1 item.update(self.stat_attrs(st, path)) item.get_size(memorize=True) if is_special_file: # we processed a special file like a regular file. 
reflect that in mode, # so it can be extracted / accessed in FUSE mount like a regular file: item.mode = stat.S_IFREG | stat.S_IMODE(item.mode) - self.stats.nfiles += 1 self.add_item(item) - if hardlinked and hardlink_master: + if hardlink_master: # Add the hard link reference *after* the file has been added to the archive. self.hard_links[st.st_ino, st.st_dev] = safe_path return status From 9478e8abd092a3871861992de6639aa77656e7e7 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 26 Mar 2017 13:51:04 +0200 Subject: [PATCH 04/12] support hardlinks via create_helper context manager also: reduce code duplication --- src/borg/archive.py | 171 +++++++++++++++++++++++--------------------- 1 file changed, 88 insertions(+), 83 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index e38049745..4e331a776 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -834,34 +834,54 @@ Utilization of max. archive size: {csize_max:.0%} attrs.update(self.stat_ext_attrs(st, path)) return attrs - def process_dir(self, path, st): - item = Item(path=make_path_safe(path)) - item.update(self.stat_attrs(st, path)) + @contextmanager + def create_helper(self, path, st, status=None): + safe_path = make_path_safe(path) + item = Item(path=safe_path) + hardlink_master = False + hardlinked = st.st_nlink > 1 + if hardlinked: + source = self.hard_links.get((st.st_ino, st.st_dev)) + if source is not None: + item.source = source + status = 'h' # hardlink (to already seen inodes) + else: + hardlink_master = True + yield item, status, hardlinked, hardlink_master + # if we get here, "with"-block worked ok without error/exception, the item was processed ok... self.add_item(item) - return 'd' # directory + # ... 
and added to the archive, so we can remember it to refer to it later in the archive: + if hardlink_master: + self.hard_links[(st.st_ino, st.st_dev)] = safe_path + + def process_dir(self, path, st): + with self.create_helper(path, st, 'd') as (item, status, hardlinked, hardlink_master): # directory + item.update(self.stat_attrs(st, path)) + return status def process_fifo(self, path, st): - item = Item(path=make_path_safe(path)) - item.update(self.stat_attrs(st, path)) - self.add_item(item) - return 'f' # fifo + with self.create_helper(path, st, 'f') as (item, status, hardlinked, hardlink_master): # fifo + item.update(self.stat_attrs(st, path)) + return status def process_dev(self, path, st): - item = Item(path=make_path_safe(path), rdev=st.st_rdev) - item.update(self.stat_attrs(st, path)) - self.add_item(item) - if stat.S_ISCHR(st.st_mode): - return 'c' # char device - elif stat.S_ISBLK(st.st_mode): - return 'b' # block device + with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet + item.rdev = st.st_rdev + item.update(self.stat_attrs(st, path)) + if stat.S_ISCHR(st.st_mode): + return 'c' # char device + elif stat.S_ISBLK(st.st_mode): + return 'b' # block device def process_symlink(self, path, st): - with backup_io('readlink'): - source = os.readlink(path) - item = Item(path=make_path_safe(path), source=source) - item.update(self.stat_attrs(st, path)) - self.add_item(item) - return 's' # symlink + with self.create_helper(path, st, 's') as (item, status, hardlinked, hardlink_master): # symlink + with backup_io('readlink'): + source = os.readlink(path) + item.source = source # XXX this overwrites hardlink slave's usage of item.source + if hardlinked: + logger.warning('hardlinked symlinks will be extracted as non-hardlinked symlinks!') + item.update(self.stat_attrs(st, path)) + return status def write_part_file(self, item, from_chunk, number): item = Item(internal_dict=item.as_dict()) @@ -925,69 +945,54 @@ Utilization 
of max. archive size: {csize_max:.0%} return 'i' # stdin def process_file(self, path, st, cache, ignore_inode=False): - status = None - safe_path = make_path_safe(path) - item = Item(path=safe_path) - hardlink_master = False - hardlinked = st.st_nlink > 1 - if hardlinked: - source = self.hard_links.get((st.st_ino, st.st_dev)) - if source is not None: - item.source = source - status = 'h' # regular file, hardlink (to already seen inodes) - else: - hardlink_master = True - is_special_file = is_special(st.st_mode) - if not hardlinked or hardlink_master: - if not is_special_file: - path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path))) - ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode) - else: - # in --read-special mode, we may be called for special files. - # there should be no information in the cache about special files processed in - # read-special mode, but we better play safe as this was wrong in the past: - path_hash = ids = None - first_run = not cache.files and cache.do_files - if first_run: - logger.debug('Processing files ...') - chunks = None - if ids is not None: - # Make sure all ids are available - for id_ in ids: - if not cache.seen_chunk(id_): - break - else: - chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids] - status = 'U' # regular file, unchanged - else: - status = 'A' # regular file, added - item.hardlink_master = hardlinked - item.update(self.stat_simple_attrs(st)) - # Only chunkify the file if needed - if chunks is not None: - item.chunks = chunks - else: - with backup_io('open'): - fh = Archive._open_rb(path) - with os.fdopen(fh, 'rb') as fd: - self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh))) + with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet + is_special_file = is_special(st.st_mode) + if not hardlinked or hardlink_master: if not is_special_file: - # we must not memorize special files, because the contents 
of e.g. a - # block or char device will change without its mtime/size/inode changing. - cache.memorize_file(path_hash, st, [c.id for c in item.chunks]) - status = status or 'M' # regular file, modified (if not 'A' already) - self.stats.nfiles += 1 - item.update(self.stat_attrs(st, path)) - item.get_size(memorize=True) - if is_special_file: - # we processed a special file like a regular file. reflect that in mode, - # so it can be extracted / accessed in FUSE mount like a regular file: - item.mode = stat.S_IFREG | stat.S_IMODE(item.mode) - self.add_item(item) - if hardlink_master: - # Add the hard link reference *after* the file has been added to the archive. - self.hard_links[st.st_ino, st.st_dev] = safe_path - return status + path_hash = self.key.id_hash(safe_encode(os.path.join(self.cwd, path))) + ids = cache.file_known_and_unchanged(path_hash, st, ignore_inode) + else: + # in --read-special mode, we may be called for special files. + # there should be no information in the cache about special files processed in + # read-special mode, but we better play safe as this was wrong in the past: + path_hash = ids = None + first_run = not cache.files and cache.do_files + if first_run: + logger.debug('Processing files ...') + chunks = None + if ids is not None: + # Make sure all ids are available + for id_ in ids: + if not cache.seen_chunk(id_): + break + else: + chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids] + status = 'U' # regular file, unchanged + else: + status = 'A' # regular file, added + item.hardlink_master = hardlinked + item.update(self.stat_simple_attrs(st)) + # Only chunkify the file if needed + if chunks is not None: + item.chunks = chunks + else: + with backup_io('open'): + fh = Archive._open_rb(path) + with os.fdopen(fh, 'rb') as fd: + self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh))) + if not is_special_file: + # we must not memorize special files, because the contents of e.g. 
a + # block or char device will change without its mtime/size/inode changing. + cache.memorize_file(path_hash, st, [c.id for c in item.chunks]) + status = status or 'M' # regular file, modified (if not 'A' already) + self.stats.nfiles += 1 + item.update(self.stat_attrs(st, path)) + item.get_size(memorize=True) + if is_special_file: + # we processed a special file like a regular file. reflect that in mode, + # so it can be extracted / accessed in FUSE mount like a regular file: + item.mode = stat.S_IFREG | stat.S_IMODE(item.mode) + return status @staticmethod def list_archives(repository, key, manifest, cache=None): From 1f6dc55eab6f2859746c9fd2fcffe3d09c517d40 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 26 Mar 2017 14:03:39 +0200 Subject: [PATCH 05/12] simplify char/block device file dispatching --- src/borg/archive.py | 9 +++------ src/borg/archiver.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 4e331a776..d602faf45 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -864,14 +864,11 @@ Utilization of max. 
archive size: {csize_max:.0%} item.update(self.stat_attrs(st, path)) return status - def process_dev(self, path, st): - with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet + def process_dev(self, path, st, dev_type): + with self.create_helper(path, st, dev_type) as (item, status, hardlinked, hardlink_master): # char/block device item.rdev = st.st_rdev item.update(self.stat_attrs(st, path)) - if stat.S_ISCHR(st.st_mode): - return 'c' # char device - elif stat.S_ISBLK(st.st_mode): - return 'b' # block device + return status def process_symlink(self, path, st): with self.create_helper(path, st, 's') as (item, status, hardlinked, hardlink_master): # symlink diff --git a/src/borg/archiver.py b/src/borg/archiver.py index e73aaf2dd..26cacba34 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -562,10 +562,16 @@ class Archiver: status = archive.process_fifo(path, st) else: status = archive.process_file(path, st, cache) - elif stat.S_ISCHR(st.st_mode) or stat.S_ISBLK(st.st_mode): + elif stat.S_ISCHR(st.st_mode): if not dry_run: if not read_special: - status = archive.process_dev(path, st) + status = archive.process_dev(path, st, 'c') + else: + status = archive.process_file(path, st, cache) + elif stat.S_ISBLK(st.st_mode): + if not dry_run: + if not read_special: + status = archive.process_dev(path, st, 'b') else: status = archive.process_file(path, st, cache) elif stat.S_ISSOCK(st.st_mode): From 23cc6796177568f1ba2b3d56d823e4b0d7ed8043 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 26 Mar 2017 14:25:33 +0200 Subject: [PATCH 06/12] no hardlinking for directories and symlinks - nlink > 1 for dirs does not mean hardlinking (at least not everywhere, wondering how apple does it) - we can not archive hardlinked symlinks due to item.source dual-use, see issue #2343. likely nobody uses this anyway. 
--- src/borg/archive.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index d602faf45..8d1c8b954 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -835,11 +835,11 @@ Utilization of max. archive size: {csize_max:.0%} return attrs @contextmanager - def create_helper(self, path, st, status=None): + def create_helper(self, path, st, status=None, hardlinkable=True): safe_path = make_path_safe(path) item = Item(path=safe_path) hardlink_master = False - hardlinked = st.st_nlink > 1 + hardlinked = hardlinkable and st.st_nlink > 1 if hardlinked: source = self.hard_links.get((st.st_ino, st.st_dev)) if source is not None: @@ -855,7 +855,7 @@ Utilization of max. archive size: {csize_max:.0%} self.hard_links[(st.st_ino, st.st_dev)] = safe_path def process_dir(self, path, st): - with self.create_helper(path, st, 'd') as (item, status, hardlinked, hardlink_master): # directory + with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked, hardlink_master): item.update(self.stat_attrs(st, path)) return status @@ -871,12 +871,14 @@ Utilization of max. 
archive size: {csize_max:.0%} return status def process_symlink(self, path, st): - with self.create_helper(path, st, 's') as (item, status, hardlinked, hardlink_master): # symlink + # note: using hardlinkable=False because we can not support hardlinked symlinks, + # due to the dual-use of item.source, see issue #2343: + with self.create_helper(path, st, 's', hardlinkable=False) as (item, status, hardlinked, hardlink_master): with backup_io('readlink'): source = os.readlink(path) - item.source = source # XXX this overwrites hardlink slave's usage of item.source - if hardlinked: - logger.warning('hardlinked symlinks will be extracted as non-hardlinked symlinks!') + item.source = source + if st.st_nlink > 1: + logger.warning('hardlinked symlinks will be archived as non-hardlinked symlinks!') item.update(self.stat_attrs(st, path)) return status From 32c6e3ad95b7607774591a79c5e0f9fe5b38ad85 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 28 Mar 2017 20:49:38 +0200 Subject: [PATCH 07/12] docs: tell what kind of hardlinks we support --- docs/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/installation.rst b/docs/installation.rst index 1d2c316a3..26a200afc 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -124,7 +124,6 @@ Features & platforms Besides regular file and directory structures, |project_name| can preserve - * Hardlinks (considering all files in the same archive) * Symlinks (stored as symlink, the symlink is not followed) * Special files: @@ -132,6 +131,7 @@ Besides regular file and directory structures, |project_name| can preserve * FIFOs ("named pipes") * Special file *contents* can be backed up in ``--read-special`` mode. By default the metadata to create them with mknod(2), mkfifo(2) etc. is stored. 
+ * Hardlinked regular files, devices, FIFOs (considering all items in the same archive) * Timestamps in nanosecond precision: mtime, atime, ctime * Permissions: From 3cc1cdd2eddd6e78750bc8df8e98f6eff1fe4f7c Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 2 Apr 2017 01:19:46 +0200 Subject: [PATCH 08/12] extract: refactor hardlinks related code prepare for an extract_helper context manager (some changes may seem superfluous, but see the following changesets) --- src/borg/archive.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 8d1c8b954..9b3a7b3f5 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -566,24 +566,25 @@ Utilization of max. archive size: {csize_max:.0%} if stat.S_ISREG(mode): with backup_io('makedirs'): make_parent(path) + hardlink_set = False # Hard link? if 'source' in item: source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:]) - with backup_io('link'): - if item.source not in hardlink_masters: - os.link(source, path) - return - item.chunks, link_target = hardlink_masters[item.source] + chunks, link_target = hardlink_masters.get(item.source, (None, source)) if link_target: # Hard link was extracted previously, just link - with backup_io: + with backup_io('link'): os.link(link_target, path) - return - # Extract chunks, since the item which had the chunks was not extracted - with backup_io('open'): - fd = open(path, 'wb') + hardlink_set = True + elif chunks is not None: + # assign chunks to this item, since the item which had the chunks was not extracted + item.chunks = chunks + if hardlink_set: + return if sparse and self.zeros is None: self.zeros = b'\0' * (1 << self.chunker_params[1]) + with backup_io('open'): + fd = open(path, 'wb') with fd: ids = [c.id for c in item.chunks] for data in self.pipeline.fetch_many(ids, is_preloaded=True): @@ -595,7 +596,7 @@ Utilization of max. 
archive size: {csize_max:.0%} fd.seek(len(data), 1) else: fd.write(data) - with backup_io('truncate'): + with backup_io('truncate_and_attrs'): pos = item_chunks_size = fd.tell() fd.truncate(pos) fd.flush() @@ -608,7 +609,7 @@ Utilization of max. archive size: {csize_max:.0%} if has_damaged_chunks: logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' % remove_surrogates(item.path)) - if hardlink_masters: + if not hardlink_set and hardlink_masters: # 2nd term, is it correct/needed? # Update master entry with extracted file path, so that following hardlinks don't extract twice. hardlink_masters[item.get('source') or original_path] = (None, path) return From cda74650385ed33af63090392febf23189cfa2e8 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 2 Apr 2017 01:22:25 +0200 Subject: [PATCH 09/12] extract: indent code, no semantics change prepare for an extract_helper context manager (some changes may seem superfluous, but see the following changesets) --- src/borg/archive.py | 61 +++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 9b3a7b3f5..7db9826ef 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -579,36 +579,37 @@ Utilization of max. 
archive size: {csize_max:.0%} elif chunks is not None: # assign chunks to this item, since the item which had the chunks was not extracted item.chunks = chunks - if hardlink_set: - return - if sparse and self.zeros is None: - self.zeros = b'\0' * (1 << self.chunker_params[1]) - with backup_io('open'): - fd = open(path, 'wb') - with fd: - ids = [c.id for c in item.chunks] - for data in self.pipeline.fetch_many(ids, is_preloaded=True): - if pi: - pi.show(increase=len(data), info=[remove_surrogates(item.path)]) - with backup_io('write'): - if sparse and self.zeros.startswith(data): - # all-zero chunk: create a hole in a sparse file - fd.seek(len(data), 1) - else: - fd.write(data) - with backup_io('truncate_and_attrs'): - pos = item_chunks_size = fd.tell() - fd.truncate(pos) - fd.flush() - self.restore_attrs(path, item, fd=fd.fileno()) - if 'size' in item: - item_size = item.size - if item_size != item_chunks_size: - logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format( - item.path, item_size, item_chunks_size)) - if has_damaged_chunks: - logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' 
% - remove_surrogates(item.path)) + if True: + if hardlink_set: + return + if sparse and self.zeros is None: + self.zeros = b'\0' * (1 << self.chunker_params[1]) + with backup_io('open'): + fd = open(path, 'wb') + with fd: + ids = [c.id for c in item.chunks] + for data in self.pipeline.fetch_many(ids, is_preloaded=True): + if pi: + pi.show(increase=len(data), info=[remove_surrogates(item.path)]) + with backup_io('write'): + if sparse and self.zeros.startswith(data): + # all-zero chunk: create a hole in a sparse file + fd.seek(len(data), 1) + else: + fd.write(data) + with backup_io('truncate_and_attrs'): + pos = item_chunks_size = fd.tell() + fd.truncate(pos) + fd.flush() + self.restore_attrs(path, item, fd=fd.fileno()) + if 'size' in item: + item_size = item.size + if item_size != item_chunks_size: + logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format( + item.path, item_size, item_chunks_size)) + if has_damaged_chunks: + logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' % + remove_surrogates(item.path)) if not hardlink_set and hardlink_masters: # 2nd term, is it correct/needed? # Update master entry with extracted file path, so that following hardlinks don't extract twice. hardlink_masters[item.get('source') or original_path] = (None, path) From cb86bda4131e8fc0afb58378ac61b3705e8cfa25 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 2 Apr 2017 01:38:58 +0200 Subject: [PATCH 10/12] extract: implement extract_helper context manager Most code of the CM is just moved 1:1 from the regular file block. 
Use the CM for regular files, FIFOs and devices, but not for: - directories (can not have hardlinks) - symlinks (we can not support hardlinked symlinks) --- src/borg/archive.py | 55 ++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 7db9826ef..417bf0d67 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -502,6 +502,26 @@ Utilization of max. archive size: {csize_max:.0%} cache.rollback() return stats + @contextmanager + def extract_helper(self, dest, item, path, stripped_components, original_path, hardlink_masters): + hardlink_set = False + # Hard link? + if 'source' in item: + source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:]) + chunks, link_target = hardlink_masters.get(item.source, (None, source)) + if link_target: + # Hard link was extracted previously, just link + with backup_io('link'): + os.link(link_target, path) + hardlink_set = True + elif chunks is not None: + # assign chunks to this item, since the item which had the chunks was not extracted + item.chunks = chunks + yield hardlink_set + if not hardlink_set and hardlink_masters: # 2nd term, is it correct/needed? + # Update master entry with extracted item path, so that following hardlinks don't extract twice. + hardlink_masters[item.get('source') or original_path] = (None, path) + def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False, hardlink_masters=None, stripped_components=0, original_path=None, pi=None): """ @@ -566,20 +586,8 @@ Utilization of max. archive size: {csize_max:.0%} if stat.S_ISREG(mode): with backup_io('makedirs'): make_parent(path) - hardlink_set = False - # Hard link? 
- if 'source' in item: - source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:]) - chunks, link_target = hardlink_masters.get(item.source, (None, source)) - if link_target: - # Hard link was extracted previously, just link - with backup_io('link'): - os.link(link_target, path) - hardlink_set = True - elif chunks is not None: - # assign chunks to this item, since the item which had the chunks was not extracted - item.chunks = chunks - if True: + with self.extract_helper(dest, item, path, stripped_components, original_path, + hardlink_masters) as hardlink_set: if hardlink_set: return if sparse and self.zeros is None: @@ -610,9 +618,6 @@ Utilization of max. archive size: {csize_max:.0%} if has_damaged_chunks: logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' % remove_surrogates(item.path)) - if not hardlink_set and hardlink_masters: # 2nd term, is it correct/needed? - # Update master entry with extracted file path, so that following hardlinks don't extract twice. - hardlink_masters[item.get('source') or original_path] = (None, path) return with backup_io: # No repository access beyond this point. @@ -632,12 +637,20 @@ Utilization of max. 
archive size: {csize_max:.0%} self.restore_attrs(path, item, symlink=True) elif stat.S_ISFIFO(mode): make_parent(path) - os.mkfifo(path) - self.restore_attrs(path, item) + with self.extract_helper(dest, item, path, stripped_components, original_path, + hardlink_masters) as hardlink_set: + if hardlink_set: + return + os.mkfifo(path) + self.restore_attrs(path, item) elif stat.S_ISCHR(mode) or stat.S_ISBLK(mode): make_parent(path) - os.mknod(path, item.mode, item.rdev) - self.restore_attrs(path, item) + with self.extract_helper(dest, item, path, stripped_components, original_path, + hardlink_masters) as hardlink_set: + if hardlink_set: + return + os.mknod(path, item.mode, item.rdev) + self.restore_attrs(path, item) else: raise Exception('Unknown archive item type %r' % item.mode) From 8f769a9b24ed55367d44ce362084e81e2539576c Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 2 Apr 2017 02:46:44 +0200 Subject: [PATCH 11/12] implement and use hardlinkable() helper --- src/borg/archive.py | 5 +++-- src/borg/archiver.py | 7 ++++--- src/borg/fuse.py | 4 ++-- src/borg/helpers.py | 5 +++++ 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 417bf0d67..27917ed96 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -25,6 +25,7 @@ from .compress import Compressor, CompressionSpec from .constants import * # NOQA from .hashindex import ChunkIndex, ChunkIndexEntry from .helpers import Manifest +from .helpers import hardlinkable from .helpers import ChunkIteratorFileWrapper, open_item from .helpers import Error, IntegrityError, set_ec from .helpers import uid2user, user2uid, gid2group, group2gid @@ -1623,7 +1624,7 @@ class ArchiveRecreater: def item_is_hardlink_master(item): return (target_is_subset and - stat.S_ISREG(item.mode) and + hardlinkable(item.mode) and item.get('hardlink_master', True) and 'source' not in item) @@ -1633,7 +1634,7 @@ class ArchiveRecreater: if item_is_hardlink_master(item): 
hardlink_masters[item.path] = (item.get('chunks'), None) continue - if target_is_subset and stat.S_ISREG(item.mode) and item.get('source') in hardlink_masters: + if target_is_subset and hardlinkable(item.mode) and item.get('source') in hardlink_masters: # master of this hard link is outside the target subset chunks, new_source = hardlink_masters[item.source] if new_source is None: diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 26cacba34..002e664a7 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -48,6 +48,7 @@ from .helpers import prune_within, prune_split from .helpers import to_localtime, timestamp from .helpers import get_cache_dir from .helpers import Manifest +from .helpers import hardlinkable from .helpers import StableDict from .helpers import check_extension_modules from .helpers import ArgparsePatternAction, ArgparseExcludeFileAction, ArgparsePatternFileAction, parse_exclude_pattern @@ -634,7 +635,7 @@ class Archiver: hardlink_masters = {} if partial_extract else None def peek_and_store_hardlink_masters(item, matched): - if (partial_extract and not matched and stat.S_ISREG(item.mode) and + if (partial_extract and not matched and hardlinkable(item.mode) and item.get('hardlink_master', True) and 'source' not in item): hardlink_masters[item.get('path')] = (item.get('chunks'), None) @@ -726,7 +727,7 @@ class Archiver: return [None] def has_hardlink_master(item, hardlink_masters): - return stat.S_ISREG(item.mode) and item.get('source') in hardlink_masters + return hardlinkable(item.mode) and item.get('source') in hardlink_masters def compare_link(item1, item2): # These are the simple link cases. For special cases, e.g. 
if a @@ -822,7 +823,7 @@ class Archiver: def compare_archives(archive1, archive2, matcher): def hardlink_master_seen(item): - return 'source' not in item or not stat.S_ISREG(item.mode) or item.source in hardlink_masters + return 'source' not in item or not hardlinkable(item.mode) or item.source in hardlink_masters def is_hardlink_master(item): return item.get('hardlink_master', True) and 'source' not in item diff --git a/src/borg/fuse.py b/src/borg/fuse.py index fc19e6e04..20782544d 100644 --- a/src/borg/fuse.py +++ b/src/borg/fuse.py @@ -16,7 +16,7 @@ from .logger import create_logger logger = create_logger() from .archive import Archive -from .helpers import daemonize +from .helpers import daemonize, hardlinkable from .item import Item from .lrucache import LRUCache @@ -193,7 +193,7 @@ class FuseOperations(llfuse.Operations): path = item.path del item.path # safe some space - if 'source' in item and stat.S_ISREG(item.mode): + if 'source' in item and hardlinkable(item.mode): # a hardlink, no contents, is the hardlink master source = os.fsencode(os.path.normpath(item.source)) if self.versions: diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 902132432..c1da47291 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -1974,6 +1974,11 @@ def file_status(mode): return '?' +def hardlinkable(mode): + """return True if we support hardlinked items of this type""" + return stat.S_ISREG(mode) or stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode) + + def chunkit(it, size): """ Chunk an iterator <it> into pieces of <size>. 
From 155f38c2333b3892527796d96e999149d253616e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 5 Apr 2017 13:54:58 +0200 Subject: [PATCH 12/12] remove comment about strange hardlink_masters term (maybe revisit this later, this is not in scope of the generic hardlinks refactor) --- src/borg/archive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 27917ed96..cdbd483e3 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -519,7 +519,7 @@ Utilization of max. archive size: {csize_max:.0%} # assign chunks to this item, since the item which had the chunks was not extracted item.chunks = chunks yield hardlink_set - if not hardlink_set and hardlink_masters: # 2nd term, is it correct/needed? + if not hardlink_set and hardlink_masters: # Update master entry with extracted item path, so that following hardlinks don't extract twice. hardlink_masters[item.get('source') or original_path] = (None, path)