diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx index 97fc213f1..511a395b1 100644 --- a/src/borg/hashindex.pyx +++ b/src/borg/hashindex.pyx @@ -201,6 +201,10 @@ class NSIndex1(HTProxyMixin, MutableMapping): used = len(self.ht) header_bytes = struct.pack(self.HEADER_FMT, self.MAGIC, used, used, self.KEY_SIZE, self.VALUE_SIZE) fd.write(header_bytes) + # record the header as a separate integrity-hash part if supported + hash_part = getattr(fd, "hash_part", None) + if hash_part: + hash_part("HashHeader") count = 0 for key, _ in self.ht.items(): value = self.ht._get_raw(key) @@ -214,6 +218,10 @@ class NSIndex1(HTProxyMixin, MutableMapping): header_bytes = fd.read(header_size) if len(header_bytes) < header_size: raise ValueError(f"Invalid file, file is too short (header).") + # verify the header as a separate integrity-hash part if supported + hash_part = getattr(fd, "hash_part", None) + if hash_part: + hash_part("HashHeader") magic, entries, buckets, ksize, vsize = struct.unpack(self.HEADER_FMT, header_bytes) if magic != self.MAGIC: raise ValueError(f"Invalid file, magic {self.MAGIC.decode()} not found.") @@ -228,6 +236,10 @@ class NSIndex1(HTProxyMixin, MutableMapping): for i in range(buckets): key = fd.read(ksize) value = fd.read(vsize) + if value.startswith(b'\xFF\xFF\xFF\xFF'): # LE for 0xffffffff (empty/unused bucket) + continue + if value.startswith(b'\xFE\xFF\xFF\xFF'): # LE for 0xfffffffe (deleted/tombstone bucket) + continue self.ht._set_raw(key, value) pos = fd.tell() assert pos == end_of_file diff --git a/src/borg/legacyrepository.py b/src/borg/legacyrepository.py index b8719480f..18a3389e9 100644 --- a/src/borg/legacyrepository.py +++ b/src/borg/legacyrepository.py @@ -515,23 +515,13 @@ class LegacyRepository: return return integrity[key] - def open_index(self, transaction_id, auto_recover=True): + def open_index(self, transaction_id): if transaction_id is None: return NSIndex1() index_path = os.path.join(self.path, "index.%d" % transaction_id) integrity_data = self._read_integrity(transaction_id, "index") - try: - with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd: - return NSIndex1.read(fd) - except (ValueError, OSError, FileIntegrityError) as exc: - logger.warning("Repository index missing or corrupted, trying to recover from: %s", exc) - os.unlink(index_path) - if not auto_recover: - raise - self.prepare_txn(self.get_transaction_id()) - # don't leave an open transaction around - self.commit(compact=False) - return self.open_index(self.get_transaction_id()) + with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd: + return NSIndex1.read(fd) def _unpack_hints(self, transaction_id): hints_path = os.path.join(self.path, "hints.%d" % transaction_id) @@ -560,11 +550,11 @@ class LegacyRepository: raise if not self.index or transaction_id is None: try: - self.index = self.open_index(transaction_id, auto_recover=False) + self.index = self.open_index(transaction_id) except (ValueError, OSError, FileIntegrityError) as exc: logger.warning("Checking repository transaction due to previous error: %s", exc) self.check_transaction() - self.index = self.open_index(transaction_id, auto_recover=False) + self.index = self.open_index(transaction_id) if transaction_id is None: self.segments = {} # XXX bad name: usage_count_of_segment_x = self.segments[x] self.compact = FreeSpace() # XXX bad name: freeable_space_of_segment_x = self.compact[x] diff --git a/src/borg/testsuite/archiver/transfer_cmd_test.py b/src/borg/testsuite/archiver/transfer_cmd_test.py index 39d570d76..034651c36 100644 --- a/src/borg/testsuite/archiver/transfer_cmd_test.py +++ b/src/borg/testsuite/archiver/transfer_cmd_test.py @@ -1,3 +1,4 @@ +import glob import hashlib import json import os @@ -469,3 +470,58 @@ def test_transfer_rechunk(archivers, request, monkeypatch): # Verify that the file hash is identical to the source assert item.path in source_file_hashes, f"File {item.path} not found in source archive" assert dest_hash == source_file_hashes[item.path], f"Content hash mismatch for {item.path}" + + +def test_issue_9022(archivers, request, monkeypatch): + """ + Regression test for borgbackup/borg#9022: After "borg transfer --from-borg1", + the source Borg 1.x repository index must not be changed. + """ + archiver = request.getfixturevalue(archivers) + if archiver.get_kind() in ["remote", "binary"]: + pytest.skip("only works locally") + + # Prepare source (borg 1.2) repo from tarball next to this test file + repo12_tar = os.path.join(os.path.dirname(__file__), "repo12.tar.gz") + + original_location = archiver.repository_location + extract_dir = f"{original_location}1" + os.makedirs(extract_dir) + with tarfile.open(repo12_tar) as tf: + tf.extractall(extract_dir) + + def index_meta(repo_path): + index_files = sorted(glob.glob(os.path.join(repo_path, "index.*"))) + assert len(index_files) == 1, f"Expected exactly 1 index file before transfer, found {len(index_files)}" + st = os.stat(index_files[0]) + # Return (mtime_ns, size, inode). Use fallbacks where attributes may not exist on some platforms. + mtime_ns = getattr(st, "st_mtime_ns", int(st.st_mtime * 1e9)) + inode = getattr(st, "st_ino", None) + return (mtime_ns, st.st_size, inode) + + # Record pre-transfer index file metadata + pre_meta = index_meta(extract_dir) + + other_repo1 = f"--other-repo={original_location}1" + + # Destination repo where we transfer to (borg 2 repo) + archiver.repository_location = f"{original_location}2" + + # Set passphrases: repo12 testdata uses "waytooeasyonlyfortests" + monkeypatch.setenv("BORG_PASSPHRASE", "pw2") + monkeypatch.setenv("BORG_OTHER_PASSPHRASE", "waytooeasyonlyfortests") + # For this test, we must not weaken KDF, otherwise borg2 couldn't decrypt the borg1 key + os.environ["BORG_TESTONLY_WEAKEN_KDF"] = "0" + + # Create destination repo and run transfer from borg1 source + cmd(archiver, "repo-create", RK_ENCRYPTION, other_repo1, "--from-borg1") + cmd(archiver, "transfer", other_repo1, "--from-borg1") + + # After transfer, ensure the source borg1 index file looks valid and unchanged. + post_meta = index_meta(extract_dir) + + assert post_meta == pre_meta, ( + f"Index file metadata changed after transfer!\n" + f"Before: mtime_ns={pre_meta[0]}, size={pre_meta[1]}, inode={pre_meta[2]}\n" + f"After: mtime_ns={post_meta[0]}, size={post_meta[1]}, inode={post_meta[2]}" + ) diff --git a/src/borg/testsuite/legacyrepository_test.py b/src/borg/testsuite/legacyrepository_test.py index ca431b2b1..643ecdca6 100644 --- a/src/borg/testsuite/legacyrepository_test.py +++ b/src/borg/testsuite/legacyrepository_test.py @@ -571,21 +571,6 @@ def test_unreadable_hints(repository): do_commit(repository) -def test_index(repository): - make_auxiliary(repository) - with open(os.path.join(repository.path, "index.1"), "wb") as fd: - fd.write(b"123456789") - do_commit(repository) - - -def test_index_outside_transaction(repository): - make_auxiliary(repository) - with open(os.path.join(repository.path, "index.1"), "wb") as fd: - fd.write(b"123456789") - with repository: - assert len(repository) == 1 - - def _corrupt_index(repository): # HashIndex is able to detect incorrect headers and file lengths, # but on its own it can't tell if the data is correct. @@ -601,15 +586,6 @@ def _corrupt_index(repository): fd.write(corrupted_index_data) -def test_index_corrupted(repository): - make_auxiliary(repository) - _corrupt_index(repository) - with repository: - # data corruption is detected due to mismatching checksums, and fixed by rebuilding the index. - assert len(repository) == 1 - assert pdchunk(repository.get(H(0))) == b"foo" - - def test_index_corrupted_without_integrity(repository): make_auxiliary(repository) _corrupt_index(repository)