From 972392e290e3e38beef32ce455154312452e7869 Mon Sep 17 00:00:00 2001 From: Martin Hostettler Date: Mon, 22 Aug 2016 22:58:54 +0200 Subject: [PATCH 1/5] extract: When doing a partial restore don't leak prefetched chunks. The filter function passed to iter_items (with preload=True) may never return True for items that are not really extracted later because that would leak prefetched items. For restoring hard linked files the item containing the actual chunks might not be matched or implicitly removed from the restore by strip_components. For this reason the chunk list or all items that can potentially be used as hardlink target needs to be stored. To achive both requirements at the same time the filter function needs to store the needed information for the hardlinks while not returning True just because it could be a hardlink target. Known problems: When using progress indication the calculated extracted_size now can be smaller than the actual extracted size in presence of hard links (master is not restored) instead of bigger (potential master not used in restore). --- src/borg/archive.py | 4 ++-- src/borg/archiver.py | 23 ++++++++++------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index b17685429..f4acf8749 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -161,11 +161,11 @@ class DownloadPipeline: for _, data in self.fetch_many(ids): unpacker.feed(data) items = [Item(internal_dict=item) for item in unpacker] - if filter: - items = [item for item in items if filter(item)] for item in items: if 'chunks' in item: item.chunks = [ChunkListEntry(*e) for e in item.chunks] + if filter: + items = [item for item in items if filter(item)] if preload: for item in items: if 'chunks' in item: diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 619dbd7e9..b712d2a8e 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -417,15 +417,15 @@ class Archiver: self.print_file_status(status, path) @staticmethod - def build_filter(matcher, is_hardlink_master, strip_components=0): + def build_filter(matcher, peek_and_store_hardlink_masters, strip_components=0): if strip_components: def item_filter(item): - return (is_hardlink_master(item) or - matcher.match(item.path) and os.sep.join(item.path.split(os.sep)[strip_components:])) + peek_and_store_hardlink_masters(item) + return matcher.match(item.path) and os.sep.join(item.path.split(os.sep)[strip_components:]) else: def item_filter(item): - return (is_hardlink_master(item) or - matcher.match(item.path)) + peek_and_store_hardlink_masters(item) + return matcher.match(item.path) return item_filter @with_repository() @@ -450,11 +450,12 @@ class Archiver: partial_extract = not matcher.empty() or strip_components hardlink_masters = {} if partial_extract else None - def item_is_hardlink_master(item): - return (partial_extract and stat.S_ISREG(item.mode) and - item.get('hardlink_master', True) and 'source' not in item) + def peek_and_store_hardlink_masters(item): + if (partial_extract and stat.S_ISREG(item.mode) and + item.get('hardlink_master', True) and 'source' not in item): + hardlink_masters[item.get('path')] = (item.get('chunks'), None) - filter = self.build_filter(matcher, item_is_hardlink_master, strip_components) + filter = self.build_filter(matcher, peek_and_store_hardlink_masters, strip_components) if progress: progress_logger = logging.getLogger(ProgressIndicatorPercent.LOGGER) progress_logger.info('Calculating size') @@ -465,10 +466,6 @@ class Archiver: for item in archive.iter_items(filter, preload=True): orig_path = item.path - if item_is_hardlink_master(item): - hardlink_masters[orig_path] = (item.get('chunks'), None) - if not matcher.match(item.path): - continue if strip_components: item.path = os.sep.join(orig_path.split(os.sep)[strip_components:]) if not args.dry_run: From a026febdb03e8beaa8c267de9cfbea46aec28a45 Mon Sep 17 00:00:00 2001 From: Martin Hostettler Date: Mon, 22 Aug 2016 23:07:38 +0200 Subject: [PATCH 2/5] Archiver.build_filter: strip_components is no longer a optional parameter. --- src/borg/archiver.py | 2 +- src/borg/testsuite/archiver.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index b712d2a8e..d3a8f76d7 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -417,7 +417,7 @@ class Archiver: self.print_file_status(status, path) @staticmethod - def build_filter(matcher, peek_and_store_hardlink_masters, strip_components=0): + def build_filter(matcher, peek_and_store_hardlink_masters, strip_components): if strip_components: def item_filter(item): peek_and_store_hardlink_masters(item) diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 9d68e3eea..34a8acb35 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -2210,14 +2210,14 @@ class TestBuildFilter: def test_basic(self): matcher = PatternMatcher() matcher.add([parse_pattern('included')], True) - filter = Archiver.build_filter(matcher, self.item_is_hardlink_master) + filter = Archiver.build_filter(matcher, self.item_is_hardlink_master, 0) assert filter(Item(path='included')) assert filter(Item(path='included/file')) assert not filter(Item(path='something else')) def test_empty(self): matcher = PatternMatcher(fallback=True) - filter = Archiver.build_filter(matcher, self.item_is_hardlink_master) + filter = Archiver.build_filter(matcher, self.item_is_hardlink_master, 0) assert filter(Item(path='anything')) def test_strip_components(self): From b845a074cb8135b9c4bb782a81ec5616350d834d Mon Sep 17 00:00:00 2001 From: Martin Hostettler Date: Mon, 22 Aug 2016 23:36:43 +0200 Subject: [PATCH 3/5] tests: TestBuildFilter: Adjust from item_is_hardlink_master to peek_and_store_hardlink_masters. --- src/borg/testsuite/archiver.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 34a8acb35..aa6614316 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -2204,25 +2204,25 @@ def test_compare_chunk_contents(): class TestBuildFilter: @staticmethod - def item_is_hardlink_master(item): + def peek_and_store_hardlink_masters(item): return False def test_basic(self): matcher = PatternMatcher() matcher.add([parse_pattern('included')], True) - filter = Archiver.build_filter(matcher, self.item_is_hardlink_master, 0) + filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, 0) assert filter(Item(path='included')) assert filter(Item(path='included/file')) assert not filter(Item(path='something else')) def test_empty(self): matcher = PatternMatcher(fallback=True) - filter = Archiver.build_filter(matcher, self.item_is_hardlink_master, 0) + filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, 0) assert filter(Item(path='anything')) def test_strip_components(self): matcher = PatternMatcher(fallback=True) - filter = Archiver.build_filter(matcher, self.item_is_hardlink_master, strip_components=1) + filter = Archiver.build_filter(matcher, self.peek_and_store_hardlink_masters, strip_components=1) assert not filter(Item(path='shallow')) assert not filter(Item(path='shallow/')) # can this even happen? paths are normalized... assert filter(Item(path='deep enough/file')) From b5d7f1df26391b71ac789bf13d2a81af4b00d3a2 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Thu, 25 Aug 2016 01:12:30 +0200 Subject: [PATCH 4/5] extract: fix incorrect progress output for hard links this produces correct output if any (non proper) subset of hardlinks are extracted. --- src/borg/archiver.py | 2 +- src/borg/item.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index d3a8f76d7..cf2f233f1 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -459,7 +459,7 @@ class Archiver: if progress: progress_logger = logging.getLogger(ProgressIndicatorPercent.LOGGER) progress_logger.info('Calculating size') - extracted_size = sum(item.file_size() for item in archive.iter_items(filter)) + extracted_size = sum(item.file_size(hardlink_masters) for item in archive.iter_items(filter)) pi = ProgressIndicatorPercent(total=extracted_size, msg='Extracting files %5.1f%%', step=0.1) else: pi = None diff --git a/src/borg/item.py b/src/borg/item.py index 0bc336239..052478705 100644 --- a/src/borg/item.py +++ b/src/borg/item.py @@ -157,10 +157,13 @@ class Item(PropDict): part = PropDict._make_property('part', int) - def file_size(self): - if 'chunks' not in self: + def file_size(self, hardlink_masters=None): + hardlink_masters = hardlink_masters or {} + chunks, _ = hardlink_masters.get(self.get('source'), (None, None)) + chunks = self.get('chunks', chunks) + if chunks is None: return 0 - return sum(chunk.size for chunk in self.chunks) + return sum(chunk.size for chunk in chunks) class EncryptedKey(PropDict): From cd39ccb82106089129ced9ccccabf7a4703f0207 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Thu, 25 Aug 2016 21:16:20 +0200 Subject: [PATCH 5/5] fix overeager storing of hardlink masters n.b. we only need to store them for items that we wouldn't extract. this also fixes an intersting edge case in extracting hard links with --strip-components --- src/borg/archive.py | 8 +++++--- src/borg/archiver.py | 16 +++++++++------- src/borg/testsuite/archiver.py | 4 ++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index f4acf8749..c27faf67c 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -422,7 +422,7 @@ Number of files: {0.stats.nfiles}'''.format( return stats def extract_item(self, item, restore_attrs=True, dry_run=False, stdout=False, sparse=False, - hardlink_masters=None, original_path=None, pi=None): + hardlink_masters=None, stripped_components=0, original_path=None, pi=None): """ Extract archive item. @@ -432,9 +432,11 @@ Number of files: {0.stats.nfiles}'''.format( :param stdout: write extracted data to stdout :param sparse: write sparse files (chunk-granularity, independent of the original being sparse) :param hardlink_masters: maps paths to (chunks, link_target) for extracting subtrees with hardlinks correctly + :param stripped_components: stripped leading path components to correct hard link extraction :param original_path: 'path' key as stored in archive :param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes) """ + hardlink_masters = hardlink_masters or {} has_damaged_chunks = 'chunks_healthy' in item if dry_run or stdout: if 'chunks' in item: @@ -473,11 +475,11 @@ Number of files: {0.stats.nfiles}'''.format( os.makedirs(os.path.dirname(path)) # Hard link? if 'source' in item: - source = os.path.join(dest, item.source) + source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:]) with backup_io(): if os.path.exists(path): os.unlink(path) - if not hardlink_masters: + if item.source not in hardlink_masters: os.link(source, path) return item.chunks, link_target = hardlink_masters[item.source] diff --git a/src/borg/archiver.py b/src/borg/archiver.py index cf2f233f1..f5ebd7308 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -420,12 +420,14 @@ class Archiver: def build_filter(matcher, peek_and_store_hardlink_masters, strip_components): if strip_components: def item_filter(item): - peek_and_store_hardlink_masters(item) - return matcher.match(item.path) and os.sep.join(item.path.split(os.sep)[strip_components:]) + matched = matcher.match(item.path) and os.sep.join(item.path.split(os.sep)[strip_components:]) + peek_and_store_hardlink_masters(item, matched) + return matched else: def item_filter(item): - peek_and_store_hardlink_masters(item) - return matcher.match(item.path) + matched = matcher.match(item.path) + peek_and_store_hardlink_masters(item, matched) + return matched return item_filter @with_repository() @@ -450,8 +452,8 @@ class Archiver: partial_extract = not matcher.empty() or strip_components hardlink_masters = {} if partial_extract else None - def peek_and_store_hardlink_masters(item): - if (partial_extract and stat.S_ISREG(item.mode) and + def peek_and_store_hardlink_masters(item, matched): + if (partial_extract and not matched and stat.S_ISREG(item.mode) and item.get('hardlink_master', True) and 'source' not in item): hardlink_masters[item.get('path')] = (item.get('chunks'), None) @@ -486,7 +488,7 @@ class Archiver: archive.extract_item(item, restore_attrs=False) else: archive.extract_item(item, stdout=stdout, sparse=sparse, hardlink_masters=hardlink_masters, - original_path=orig_path, pi=pi) + stripped_components=strip_components, original_path=orig_path, pi=pi) except BackupOSError as e: self.print_warning('%s: %s', remove_surrogates(orig_path), e) diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index aa6614316..fd7eb5fc8 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -2204,8 +2204,8 @@ def test_compare_chunk_contents(): class TestBuildFilter: @staticmethod - def peek_and_store_hardlink_masters(item): - return False + def peek_and_store_hardlink_masters(item, matched): + pass def test_basic(self): matcher = PatternMatcher()