diff --git a/borg/archive.py b/borg/archive.py index 1a4e8dd20..9964f5f27 100644 --- a/borg/archive.py +++ b/borg/archive.py @@ -920,31 +920,56 @@ class ArchiveChecker: self.repository.put(id_, cdata) def verify_file_chunks(item): - """Verifies that all file chunks are present + """Verifies that all file chunks are present. - Missing file chunks will be replaced with new chunks of the same - length containing all zeros. + Missing file chunks will be replaced with new chunks of the same length containing all zeros. + If a previously missing file chunk re-appears, the replacement chunk is replaced by the correct one. """ offset = 0 chunk_list = [] chunks_replaced = False - for chunk_id, size, csize in item[b'chunks']: + has_chunks_healthy = b'chunks_healthy' in item + chunks_current = item[b'chunks'] + chunks_healthy = item[b'chunks_healthy'] if has_chunks_healthy else chunks_current + assert len(chunks_current) == len(chunks_healthy) + for chunk_current, chunk_healthy in zip(chunks_current, chunks_healthy): + chunk_id, size, csize = chunk_healthy if chunk_id not in self.chunks: - # If a file chunk is missing, create an all empty replacement chunk - logger.error('{}: Missing file chunk detected (Byte {}-{})'.format(item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size)) - self.error_found = chunks_replaced = True - data = bytes(size) - chunk_id = self.key.id_hash(data) - cdata = self.key.encrypt(data) - csize = len(cdata) - add_reference(chunk_id, size, csize, cdata) + # a chunk of the healthy list is missing + if chunk_current == chunk_healthy: + logger.error('{}: New missing file chunk detected (Byte {}-{}). ' + 'Replacing with all-zero chunk.'.format( + item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size)) + self.error_found = chunks_replaced = True + data = bytes(size) + chunk_id = self.key.id_hash(data) + cdata = self.key.encrypt(data) + csize = len(cdata) + add_reference(chunk_id, size, csize, cdata) + else: + logger.info('{}: Previously missing file chunk is still missing (Byte {}-{}). ' + 'It has a all-zero replacement chunk already.'.format( + item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size)) + chunk_id, size, csize = chunk_current + add_reference(chunk_id, size, csize) else: - add_reference(chunk_id, size, csize) - chunk_list.append((chunk_id, size, csize)) + if chunk_current == chunk_healthy: + # normal case, all fine. + add_reference(chunk_id, size, csize) + else: + logger.info('{}: Healed previously missing file chunk! (Byte {}-{}).'.format( + item[b'path'].decode('utf-8', 'surrogateescape'), offset, offset + size)) + add_reference(chunk_id, size, csize) + mark_as_possibly_superseded(chunk_current[0]) # maybe orphaned the all-zero replacement chunk + chunk_list.append([chunk_id, size, csize]) # list-typed element as chunks_healthy is list-of-lists offset += size - if chunks_replaced and b'chunks_healthy' not in item: + if chunks_replaced and not has_chunks_healthy: # if this is first repair, remember the correct chunk IDs, so we can maybe heal the file later item[b'chunks_healthy'] = item[b'chunks'] + if has_chunks_healthy and chunk_list == chunks_healthy: + logger.info('{}: Completely healed previously damaged file!'.format( + item[b'path'].decode('utf-8', 'surrogateescape'))) + del item[b'chunks_healthy'] item[b'chunks'] = chunk_list def robust_iterator(archive): diff --git a/borg/archiver.py b/borg/archiver.py index ff1463708..309950d0b 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -973,9 +973,12 @@ class Archiver: - Check if archive metadata chunk is present. if not, remove archive from manifest. - For all files (items) in the archive, for all chunks referenced by these - files, check if chunk is present (if not and we are in repair mode, replace - it with a same-size chunk of zeros). This requires reading of archive and - file metadata, but not data. + files, check if chunk is present. + If a chunk is not present and we are in repair mode, replace it with a same-size + replacement chunk of zeros. + If a previously lost chunk reappears (e.g. via a later backup) and we are in + repair mode, the all-zero replacement chunk will be replaced by the correct chunk. + This requires reading of archive and file metadata, but not data. - If we are in repair mode and we checked all the archives: delete orphaned chunks from the repo. - if you use a remote repo server via ssh:, the archive check is executed on diff --git a/borg/testsuite/archiver.py b/borg/testsuite/archiver.py index d52391239..0179a882e 100644 --- a/borg/testsuite/archiver.py +++ b/borg/testsuite/archiver.py @@ -1142,12 +1142,45 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase): with repository: for item in archive.iter_items(): if item[b'path'].endswith('testsuite/archiver.py'): - repository.delete(item[b'chunks'][-1][0]) + valid_chunks = item[b'chunks'] + killed_chunk = valid_chunks[-1] + repository.delete(killed_chunk[0]) break + else: + self.assert_true(False) # should not happen repository.commit() self.cmd('check', self.repository_location, exit_code=1) - self.cmd('check', '--repair', self.repository_location, exit_code=0) + output = self.cmd('check', '--repair', self.repository_location, exit_code=0) + self.assert_in('New missing file chunk detected', output) self.cmd('check', self.repository_location, exit_code=0) + # check that the file in the old archives has now a different chunk list without the killed chunk + for archive_name in ('archive1', 'archive2'): + archive, repository = self.open_archive(archive_name) + with repository: + for item in archive.iter_items(): + if item[b'path'].endswith('testsuite/archiver.py'): + self.assert_not_equal(valid_chunks, item[b'chunks']) + self.assert_not_in(killed_chunk, item[b'chunks']) + break + else: + self.assert_true(False) # should not happen + # do a fresh backup (that will include the killed chunk) + with patch.object(ChunkBuffer, 'BUFFER_SIZE', 10): + self.create_src_archive('archive3') + # check should be able to heal the file now: + output = self.cmd('check', '-v', '--repair', self.repository_location, exit_code=0) + self.assert_in('Healed previously missing file chunk', output) + self.assert_in('testsuite/archiver.py: Completely healed previously damaged file!', output) + # check that the file in the old archives has the correct chunks again + for archive_name in ('archive1', 'archive2'): + archive, repository = self.open_archive(archive_name) + with repository: + for item in archive.iter_items(): + if item[b'path'].endswith('testsuite/archiver.py'): + self.assert_equal(valid_chunks, item[b'chunks']) + break + else: + self.assert_true(False) # should not happen def test_missing_archive_item_chunk(self): archive, repository = self.open_archive('archive1')