From a52b54dc3c864d0fbb0d21707ac93f8fe421fdb8 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 14 Feb 2017 06:35:54 +0100 Subject: [PATCH 1/8] archived file items: add size metadata if an item has a chunk list, pre-compute the total size and store it into "size" metadata entry. this speeds up access to item size (e.g. for regular files) and could also be used to verify the validity of the chunks list. note about hardlinks: size is only stored for hardlink masters (only they have an own chunk list) --- src/borg/archive.py | 3 +++ src/borg/archiver.py | 11 ++++++++--- src/borg/constants.py | 2 +- src/borg/fuse.py | 1 + src/borg/helpers.py | 5 ++++- src/borg/item.pyx | 9 ++++++++- 6 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 07d62e168..8727b418e 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -777,6 +777,7 @@ Utilization of max. archive size: {csize_max:.0%} length = len(item.chunks) # the item should only have the *additional* chunks we processed after the last partial item: item.chunks = item.chunks[from_chunk:] + item.size = sum(chunk.size for chunk in item.chunks) item.path += '.borg_part_%d' % number item.part = number number += 1 @@ -825,6 +826,7 @@ Utilization of max. archive size: {csize_max:.0%} ) fd = sys.stdin.buffer # binary self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd))) + item.size = sum(chunk.size for chunk in item.chunks) self.stats.nfiles += 1 self.add_item(item) return 'i' # stdin @@ -885,6 +887,7 @@ Utilization of max. archive size: {csize_max:.0%} cache.memorize_file(path_hash, st, [c.id for c in item.chunks]) status = status or 'M' # regular file, modified (if not 'A' already) item.update(self.stat_attrs(st, path)) + item.size = sum(chunk.size for chunk in item.chunks) if is_special_file: # we processed a special file like a regular file. reflect that in mode, # so it can be extracted / accessed in FUSE mount like a regular file: diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 041749a19..d526006f5 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -600,10 +600,15 @@ class Archiver: def sum_chunk_size(item, consider_ids=None): if item.get('deleted'): - return None + size = None else: - return sum(c.size for c in item.chunks - if consider_ids is None or c.id in consider_ids) + if consider_ids is not None: # consider only specific chunks + size = sum(chunk.size for chunk in item.chunks if chunk.id in consider_ids) + else: # consider all chunks + size = item.get('size') + if size is None: + size = sum(chunk.size for chunk in item.chunks) + return size def get_owner(item): if args.numeric_owner: diff --git a/src/borg/constants.py b/src/borg/constants.py index 610486d08..f7cb11c92 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -1,6 +1,6 @@ # this set must be kept complete, otherwise the RobustUnpacker might malfunction: ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master', - 'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', + 'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', 'size', 'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended', 'part']) diff --git a/src/borg/fuse.py b/src/borg/fuse.py index dbf34e1a6..53f60462f 100644 --- a/src/borg/fuse.py +++ b/src/borg/fuse.py @@ -260,6 +260,7 @@ class FuseOperations(llfuse.Operations): size = 0 dsize = 0 if 'chunks' in item: + # if we would not need to compute dsize, we could get size quickly from item.size, if present. for key, chunksize, _ in item.chunks: size += chunksize if self.accounted_chunks.get(key, inode) == inode: diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 89b557e57..21a451886 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -104,7 +104,7 @@ def check_extension_modules(): raise ExtensionModuleError if platform.API_VERSION != platform.OS_API_VERSION != '1.1_01': raise ExtensionModuleError - if item.API_VERSION != '1.1_01': + if item.API_VERSION != '1.1_02': raise ExtensionModuleError @@ -1701,6 +1701,9 @@ class ItemFormatter(BaseFormatter): return len(item.get('chunks', [])) def calculate_size(self, item): + size = item.get('size') + if size is not None: + return size return sum(c.size for c in item.get('chunks', [])) def calculate_csize(self, item): diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 4ac960a63..a3e78c211 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -2,7 +2,7 @@ from .constants import ITEM_KEYS from .helpers import safe_encode, safe_decode from .helpers import StableDict -API_VERSION = '1.1_01' +API_VERSION = '1.1_02' class PropDict: @@ -156,6 +156,10 @@ class Item(PropDict): ctime = PropDict._make_property('ctime', int) mtime = PropDict._make_property('mtime', int) + # size is only present for items with a chunk list and then it is sum(chunk_sizes) + # compatibility note: this is a new feature, in old archives size will be missing. + size = PropDict._make_property('size', int) + hardlink_master = PropDict._make_property('hardlink_master', bool) chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None') @@ -169,6 +173,9 @@ class Item(PropDict): part = PropDict._make_property('part', int) def file_size(self, hardlink_masters=None): + size = self.get('size') + if size is not None: + return size hardlink_masters = hardlink_masters or {} chunks, _ = hardlink_masters.get(self.get('source'), (None, None)) chunks = self.get('chunks', chunks) From fe8e14cb2ce030a22f3a982db156a38770706b0c Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 14 Feb 2017 20:54:25 +0100 Subject: [PATCH 2/8] fuse: get rid of chunk accounting the chunk accounting code tried to reflect repo space usage via the st_blocks of the files. so, a specific chunk that was shared between multiple files [inodes] was only accounted for one specific file. thus, the overall "du" of everything in the fuse mounted repo was maybe correctly reflecting the repo space usage, but the decision which file has the chunk (the space) was kind of arbitrary and not really useful. otoh, a simple fuse getattr() was rather expensive due to this as it needed to iterate over the chunks list to compute the st_blocks value. also it needed quite some memory for the accounting. thus, st_blocks is now just ceil(size / blocksize). also: fixed bug that st_blocks was a floating point value previously. also: preparing for further optimization of size computation (see next cs) --- src/borg/fuse.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/borg/fuse.py b/src/borg/fuse.py index 53f60462f..db84fcdeb 100644 --- a/src/borg/fuse.py +++ b/src/borg/fuse.py @@ -72,7 +72,6 @@ class FuseOperations(llfuse.Operations): self.contents = defaultdict(dict) self.default_dir = Item(mode=0o40755, mtime=int(time.time() * 1e9), uid=os.getuid(), gid=os.getgid()) self.pending_archives = {} - self.accounted_chunks = {} self.cache = ItemCache() data_cache_capacity = int(os.environ.get('BORG_MOUNT_DATA_CACHE_ENTRIES', os.cpu_count() or 1)) logger.debug('mount data cache capacity: %d chunks', data_cache_capacity) @@ -258,14 +257,9 @@ class FuseOperations(llfuse.Operations): def getattr(self, inode, ctx=None): item = self.get_item(inode) size = 0 - dsize = 0 if 'chunks' in item: - # if we would not need to compute dsize, we could get size quickly from item.size, if present. for key, chunksize, _ in item.chunks: size += chunksize - if self.accounted_chunks.get(key, inode) == inode: - self.accounted_chunks[key] = inode - dsize += chunksize entry = llfuse.EntryAttributes() entry.st_ino = inode entry.generation = 0 @@ -278,7 +272,7 @@ class FuseOperations(llfuse.Operations): entry.st_rdev = item.get('rdev', 0) entry.st_size = size entry.st_blksize = 512 - entry.st_blocks = dsize / 512 + entry.st_blocks = (size + entry.st_blksize - 1) // entry.st_blksize # note: older archives only have mtime (not atime nor ctime) mtime_ns = item.mtime if have_fuse_xtime_ns: From ae6742fb34740499a97b4edce36f984f4e9cd1ba Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 14 Feb 2017 21:08:38 +0100 Subject: [PATCH 3/8] fuse: use precomputed size from Item --- src/borg/fuse.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/borg/fuse.py b/src/borg/fuse.py index db84fcdeb..3b3b37717 100644 --- a/src/borg/fuse.py +++ b/src/borg/fuse.py @@ -256,10 +256,6 @@ class FuseOperations(llfuse.Operations): def getattr(self, inode, ctx=None): item = self.get_item(inode) - size = 0 - if 'chunks' in item: - for key, chunksize, _ in item.chunks: - size += chunksize entry = llfuse.EntryAttributes() entry.st_ino = inode entry.generation = 0 @@ -270,9 +266,9 @@ class FuseOperations(llfuse.Operations): entry.st_uid = item.uid entry.st_gid = item.gid entry.st_rdev = item.get('rdev', 0) - entry.st_size = size + entry.st_size = item.file_size() entry.st_blksize = 512 - entry.st_blocks = (size + entry.st_blksize - 1) // entry.st_blksize + entry.st_blocks = (entry.st_size + entry.st_blksize - 1) // entry.st_blksize # note: older archives only have mtime (not atime nor ctime) mtime_ns = item.mtime if have_fuse_xtime_ns: From 0021052dbdd7147c027745c594e98424031855c8 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 15 Feb 2017 01:24:20 +0100 Subject: [PATCH 4/8] reduce code duplication --- src/borg/archive.py | 6 +++--- src/borg/archiver.py | 4 +--- src/borg/helpers.py | 5 +---- src/borg/item.pyx | 22 +++++++++++++++------- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 8727b418e..3c5dbe8c2 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -777,7 +777,7 @@ Utilization of max. archive size: {csize_max:.0%} length = len(item.chunks) # the item should only have the *additional* chunks we processed after the last partial item: item.chunks = item.chunks[from_chunk:] - item.size = sum(chunk.size for chunk in item.chunks) + item.file_size(memorize=True) item.path += '.borg_part_%d' % number item.part = number number += 1 @@ -826,7 +826,7 @@ Utilization of max. archive size: {csize_max:.0%} ) fd = sys.stdin.buffer # binary self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd))) - item.size = sum(chunk.size for chunk in item.chunks) + item.file_size(memorize=True) self.stats.nfiles += 1 self.add_item(item) return 'i' # stdin @@ -887,7 +887,7 @@ Utilization of max. archive size: {csize_max:.0%} cache.memorize_file(path_hash, st, [c.id for c in item.chunks]) status = status or 'M' # regular file, modified (if not 'A' already) item.update(self.stat_attrs(st, path)) - item.size = sum(chunk.size for chunk in item.chunks) + item.file_size(memorize=True) if is_special_file: # we processed a special file like a regular file. reflect that in mode, # so it can be extracted / accessed in FUSE mount like a regular file: diff --git a/src/borg/archiver.py b/src/borg/archiver.py index d526006f5..a1bc65b68 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -605,9 +605,7 @@ class Archiver: if consider_ids is not None: # consider only specific chunks size = sum(chunk.size for chunk in item.chunks if chunk.id in consider_ids) else: # consider all chunks - size = item.get('size') - if size is None: - size = sum(chunk.size for chunk in item.chunks) + size = item.file_size() return size def get_owner(item): diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 21a451886..f6247cd38 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -1701,10 +1701,7 @@ class ItemFormatter(BaseFormatter): return len(item.get('chunks', [])) def calculate_size(self, item): - size = item.get('size') - if size is not None: - return size - return sum(c.size for c in item.get('chunks', [])) + return item.file_size() def calculate_csize(self, item): return sum(c.csize for c in item.get('chunks', [])) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index a3e78c211..a0b9e3efd 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -172,16 +172,24 @@ class Item(PropDict): part = PropDict._make_property('part', int) - def file_size(self, hardlink_masters=None): + def file_size(self, hardlink_masters=None, memorize=False): + """determine the size of this item""" size = self.get('size') if size is not None: return size - hardlink_masters = hardlink_masters or {} - chunks, _ = hardlink_masters.get(self.get('source'), (None, None)) - chunks = self.get('chunks', chunks) - if chunks is None: - return 0 - return sum(chunk.size for chunk in chunks) + chunks = self.get('chunks') + having_chunks = chunks is not None + if not having_chunks: + # this item has no (own) chunks, but if this is a hardlink slave + # and we know the master, we can still compute the size. + hardlink_masters = hardlink_masters or {} + chunks, _ = hardlink_masters.get(self.get('source'), (None, None)) + if chunks is None: + return 0 + size = sum(chunk.size for chunk in chunks) + if memorize and having_chunks: + self.size = size + return size class EncryptedKey(PropDict): From 97bb1b7d9afe7b3f6117b7a27fccb2fd23c41103 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 18 Feb 2017 06:47:39 +0100 Subject: [PATCH 5/8] deduplicate / refactor item (c)size code --- src/borg/cache.py | 3 +-- src/borg/helpers.py | 6 +++-- src/borg/item.pyx | 53 ++++++++++++++++++++++++++++++--------------- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/src/borg/cache.py b/src/borg/cache.py index 21efcbc9e..cc57e7bc9 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -20,13 +20,12 @@ from .helpers import format_file_size from .helpers import yes from .helpers import remove_surrogates from .helpers import ProgressIndicatorPercent, ProgressIndicatorMessage -from .item import Item, ArchiveItem +from .item import Item, ArchiveItem, ChunkListEntry from .key import PlaintextKey from .locking import Lock from .platform import SaveFile from .remote import cache_if_remote -ChunkListEntry = namedtuple('ChunkListEntry', 'id size csize') FileCacheEntry = namedtuple('FileCacheEntry', 'age inode size mtime chunk_ids') diff --git a/src/borg/helpers.py b/src/borg/helpers.py index f6247cd38..2bd5f4071 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -1701,10 +1701,12 @@ class ItemFormatter(BaseFormatter): return len(item.get('chunks', [])) def calculate_size(self, item): - return item.file_size() + # note: does not support hardlink slaves, they will be size 0 + return item.file_size(compressed=False) def calculate_csize(self, item): - return sum(c.csize for c in item.get('chunks', [])) + # note: does not support hardlink slaves, they will be csize 0 + return item.file_size(compressed=True) def hash_item(self, hash_function, item): if 'chunks' not in item: diff --git a/src/borg/item.pyx b/src/borg/item.pyx index a0b9e3efd..c3125c577 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -1,3 +1,5 @@ +from collections import namedtuple + from .constants import ITEM_KEYS from .helpers import safe_encode, safe_decode from .helpers import StableDict @@ -113,6 +115,8 @@ class PropDict: return property(_get, _set, _del, doc=doc) +ChunkListEntry = namedtuple('ChunkListEntry', 'id size csize') + class Item(PropDict): """ Item abstraction that deals with validation and the low-level details internally: @@ -172,23 +176,38 @@ class Item(PropDict): part = PropDict._make_property('part', int) - def file_size(self, hardlink_masters=None, memorize=False): - """determine the size of this item""" - size = self.get('size') - if size is not None: - return size - chunks = self.get('chunks') - having_chunks = chunks is not None - if not having_chunks: - # this item has no (own) chunks, but if this is a hardlink slave - # and we know the master, we can still compute the size. - hardlink_masters = hardlink_masters or {} - chunks, _ = hardlink_masters.get(self.get('source'), (None, None)) - if chunks is None: - return 0 - size = sum(chunk.size for chunk in chunks) - if memorize and having_chunks: - self.size = size + def file_size(self, hardlink_masters=None, memorize=False, compressed=False): + """determine the (uncompressed or compressed) size of this item""" + attr = 'csize' if compressed else 'size' + try: + size = getattr(self, attr) + except AttributeError: + # no precomputed (c)size value available, compute it: + try: + chunks = getattr(self, 'chunks') + having_chunks = True + except AttributeError: + having_chunks = False + # this item has no (own) chunks list, but if this is a hardlink slave + # and we know the master, we can still compute the size. + if hardlink_masters is None: + chunks = None + else: + try: + master = getattr(self, 'source') + except AttributeError: + # not a hardlink slave, likely a directory or special file w/o chunks + chunks = None + else: + # hardlink slave, try to fetch hardlink master's chunks list + # todo: put precomputed size into hardlink_masters' values and use it, if present + chunks, _ = hardlink_masters.get(master, (None, None)) + if chunks is None: + return 0 + size = sum(getattr(ChunkListEntry(*chunk), attr) for chunk in chunks) + # if requested, memorize the precomputed (c)size for items that have an own chunks list: + if memorize and having_chunks: + setattr(self, attr, size) return size From 50068c596dc4f843ef90def50ef530cb5926f20f Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 18 Feb 2017 07:02:11 +0100 Subject: [PATCH 6/8] rename Item.file_size -> get_size file_size is misleading here because one thinks of on-disk file size, but for compressed=True, there is no such on-disk file. --- src/borg/archive.py | 6 +++--- src/borg/archiver.py | 4 ++-- src/borg/fuse.py | 2 +- src/borg/helpers.py | 4 ++-- src/borg/item.pyx | 11 +++++++++-- src/borg/testsuite/item.py | 4 ++-- 6 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 3c5dbe8c2..7f89a2159 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -777,7 +777,7 @@ Utilization of max. archive size: {csize_max:.0%} length = len(item.chunks) # the item should only have the *additional* chunks we processed after the last partial item: item.chunks = item.chunks[from_chunk:] - item.file_size(memorize=True) + item.get_size(memorize=True) item.path += '.borg_part_%d' % number item.part = number number += 1 @@ -826,7 +826,7 @@ Utilization of max. archive size: {csize_max:.0%} ) fd = sys.stdin.buffer # binary self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd))) - item.file_size(memorize=True) + item.get_size(memorize=True) self.stats.nfiles += 1 self.add_item(item) return 'i' # stdin @@ -887,7 +887,7 @@ Utilization of max. archive size: {csize_max:.0%} cache.memorize_file(path_hash, st, [c.id for c in item.chunks]) status = status or 'M' # regular file, modified (if not 'A' already) item.update(self.stat_attrs(st, path)) - item.file_size(memorize=True) + item.get_size(memorize=True) if is_special_file: # we processed a special file like a regular file. reflect that in mode, # so it can be extracted / accessed in FUSE mount like a regular file: diff --git a/src/borg/archiver.py b/src/borg/archiver.py index a1bc65b68..ccf3c6474 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -541,7 +541,7 @@ class Archiver: if progress: pi = ProgressIndicatorPercent(msg='%5.1f%% Extracting: %s', step=0.1) pi.output('Calculating size') - extracted_size = sum(item.file_size(hardlink_masters) for item in archive.iter_items(filter)) + extracted_size = sum(item.get_size(hardlink_masters) for item in archive.iter_items(filter)) pi.total = extracted_size else: pi = None @@ -605,7 +605,7 @@ class Archiver: if consider_ids is not None: # consider only specific chunks size = sum(chunk.size for chunk in item.chunks if chunk.id in consider_ids) else: # consider all chunks - size = item.file_size() + size = item.get_size() return size def get_owner(item): diff --git a/src/borg/fuse.py b/src/borg/fuse.py index 3b3b37717..33c6b3897 100644 --- a/src/borg/fuse.py +++ b/src/borg/fuse.py @@ -266,7 +266,7 @@ class FuseOperations(llfuse.Operations): entry.st_uid = item.uid entry.st_gid = item.gid entry.st_rdev = item.get('rdev', 0) - entry.st_size = item.file_size() + entry.st_size = item.get_size() entry.st_blksize = 512 entry.st_blocks = (entry.st_size + entry.st_blksize - 1) // entry.st_blksize # note: older archives only have mtime (not atime nor ctime) diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 2bd5f4071..ad03dca43 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -1702,11 +1702,11 @@ class ItemFormatter(BaseFormatter): def calculate_size(self, item): # note: does not support hardlink slaves, they will be size 0 - return item.file_size(compressed=False) + return item.get_size(compressed=False) def calculate_csize(self, item): # note: does not support hardlink slaves, they will be csize 0 - return item.file_size(compressed=True) + return item.get_size(compressed=True) def hash_item(self, hash_function, item): if 'chunks' not in item: diff --git a/src/borg/item.pyx b/src/borg/item.pyx index c3125c577..a71da55e7 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -176,8 +176,15 @@ class Item(PropDict): part = PropDict._make_property('part', int) - def file_size(self, hardlink_masters=None, memorize=False, compressed=False): - """determine the (uncompressed or compressed) size of this item""" + def get_size(self, hardlink_masters=None, memorize=False, compressed=False): + """ + Determine the (uncompressed or compressed) size of this item. + + For hardlink slaves, the size is computed via the hardlink master's + chunk list, if available (otherwise size will be returned as 0). + + If memorize is True, the computed size value will be stored into the item. + """ attr = 'csize' if compressed else 'size' try: size = getattr(self, attr) diff --git a/src/borg/testsuite/item.py b/src/borg/testsuite/item.py index 35934f3ba..9c66b6a67 100644 --- a/src/borg/testsuite/item.py +++ b/src/borg/testsuite/item.py @@ -142,9 +142,9 @@ def test_item_file_size(): ChunkListEntry(csize=1, size=1000, id=None), ChunkListEntry(csize=1, size=2000, id=None), ]) - assert item.file_size() == 3000 + assert item.get_size() == 3000 def test_item_file_size_no_chunks(): item = Item() - assert item.file_size() == 0 + assert item.get_size() == 0 From adc4da280de41379cd5e9cebee32a40c78148006 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 18 Feb 2017 23:09:40 +0100 Subject: [PATCH 7/8] borg check: check file size consistency --- src/borg/archive.py | 7 +++++++ src/borg/item.pyx | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 7f89a2159..852da6e02 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1306,6 +1306,13 @@ class ArchiveChecker: logger.info('{}: Completely healed previously damaged file!'.format(item.path)) del item.chunks_healthy item.chunks = chunk_list + if 'size' in item: + item_size = item.size + item_chunks_size = item.get_size(compressed=False, from_chunks=True) + if item_size != item_chunks_size: + # just warn, but keep the inconsistency, so that borg extract can warn about it. + logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format( + item.path, item_size, item_chunks_size)) def robust_iterator(archive): """Iterates through all archive items diff --git a/src/borg/item.pyx b/src/borg/item.pyx index a71da55e7..627ffd1fb 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -176,7 +176,7 @@ class Item(PropDict): part = PropDict._make_property('part', int) - def get_size(self, hardlink_masters=None, memorize=False, compressed=False): + def get_size(self, hardlink_masters=None, memorize=False, compressed=False, from_chunks=False): """ Determine the (uncompressed or compressed) size of this item. @@ -187,6 +187,8 @@ class Item(PropDict): """ attr = 'csize' if compressed else 'size' try: + if from_chunks: + raise AttributeError size = getattr(self, attr) except AttributeError: # no precomputed (c)size value available, compute it: From 7da0a9c98232850068e9dbc8114a4e7e48e8e7bc Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 20 Feb 2017 22:24:19 +0100 Subject: [PATCH 8/8] borg extract: check file size consistency --- src/borg/archive.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 852da6e02..91e94fa5d 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -467,13 +467,20 @@ Utilization of max. archive size: {csize_max:.0%} has_damaged_chunks = 'chunks_healthy' in item if dry_run or stdout: if 'chunks' in item: + item_chunks_size = 0 for _, data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True): if pi: pi.show(increase=len(data), info=[remove_surrogates(item.path)]) if stdout: sys.stdout.buffer.write(data) + item_chunks_size += len(data) if stdout: sys.stdout.buffer.flush() + if 'size' in item: + item_size = item.size + if item_size != item_chunks_size: + logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format( + item.path, item_size, item_chunks_size)) if has_damaged_chunks: logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' % remove_surrogates(item.path)) @@ -530,10 +537,15 @@ Utilization of max. archive size: {csize_max:.0%} else: fd.write(data) with backup_io('truncate'): - pos = fd.tell() + pos = item_chunks_size = fd.tell() fd.truncate(pos) fd.flush() self.restore_attrs(path, item, fd=fd.fileno()) + if 'size' in item: + item_size = item.size + if item_size != item_chunks_size: + logger.warning('{}: size inconsistency detected: size {}, chunks size {}'.format( + item.path, item_size, item_chunks_size)) if has_damaged_chunks: logger.warning('File %s has damaged (all-zero) chunks. Try running borg check --repair.' % remove_surrogates(item.path))