Merge pull request #9024 from ThomasWaldmann/transfer-corrupts-src-repo

fix borg transfer corrupting the src repo index
This commit is contained in:
TW 2025-09-22 21:08:04 +02:00 committed by GitHub
commit 3bff0c31cf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 73 additions and 39 deletions

View file

@ -201,6 +201,10 @@ class NSIndex1(HTProxyMixin, MutableMapping):
used = len(self.ht)
header_bytes = struct.pack(self.HEADER_FMT, self.MAGIC, used, used, self.KEY_SIZE, self.VALUE_SIZE)
fd.write(header_bytes)
# record the header as a separate integrity-hash part if supported
hash_part = getattr(fd, "hash_part", None)
if hash_part:
hash_part("HashHeader")
count = 0
for key, _ in self.ht.items():
value = self.ht._get_raw(key)
@ -214,6 +218,10 @@ class NSIndex1(HTProxyMixin, MutableMapping):
header_bytes = fd.read(header_size)
if len(header_bytes) < header_size:
raise ValueError(f"Invalid file, file is too short (header).")
# verify the header as a separate integrity-hash part if supported
hash_part = getattr(fd, "hash_part", None)
if hash_part:
hash_part("HashHeader")
magic, entries, buckets, ksize, vsize = struct.unpack(self.HEADER_FMT, header_bytes)
if magic != self.MAGIC:
raise ValueError(f"Invalid file, magic {self.MAGIC.decode()} not found.")
@ -228,6 +236,10 @@ class NSIndex1(HTProxyMixin, MutableMapping):
for i in range(buckets):
key = fd.read(ksize)
value = fd.read(vsize)
if value.startswith(b'\xFF\xFF\xFF\xFF'): # LE for 0xffffffff (empty/unused bucket)
continue
if value.startswith(b'\xFE\xFF\xFF\xFF'): # LE for 0xfffffffe (deleted/tombstone bucket)
continue
self.ht._set_raw(key, value)
pos = fd.tell()
assert pos == end_of_file

View file

@ -515,23 +515,13 @@ class LegacyRepository:
return
return integrity[key]
# NOTE(review): this hunk is rendered without +/- markers, so two versions of
# open_index appear interleaved below. Going by the hunk header (23 lines before,
# 13 lines after), the first signature together with the try/except body looks
# like the removed version, and the second signature together with the final
# two lines looks like the added version -- confirm against the real diff.
def open_index(self, transaction_id, auto_recover=True):
def open_index(self, transaction_id):
# No transaction id: return a freshly constructed, empty NSIndex1.
if transaction_id is None:
return NSIndex1()
# Index file for this transaction, plus the integrity data read for its "index" part.
index_path = os.path.join(self.path, "index.%d" % transaction_id)
integrity_data = self._read_integrity(transaction_id, "index")
# (apparently removed) On ValueError / OSError / FileIntegrityError this variant
# logged a warning and unlinked the index file; it then either re-raised
# (auto_recover false) or called prepare_txn and commit(compact=False) and
# re-opened the index of the then-current transaction id.
try:
with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd:
return NSIndex1.read(fd)
except (ValueError, OSError, FileIntegrityError) as exc:
logger.warning("Repository index missing or corrupted, trying to recover from: %s", exc)
os.unlink(index_path)
if not auto_recover:
raise
self.prepare_txn(self.get_transaction_id())
# don't leave an open transaction around
self.commit(compact=False)
return self.open_index(self.get_transaction_id())
# (apparently added) Read the index through IntegrityCheckedFile with no local
# exception handling: errors propagate to the caller and nothing is unlinked here.
with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd:
return NSIndex1.read(fd)
def _unpack_hints(self, transaction_id):
hints_path = os.path.join(self.path, "hints.%d" % transaction_id)
@ -560,11 +550,11 @@ class LegacyRepository:
raise
if not self.index or transaction_id is None:
try:
self.index = self.open_index(transaction_id, auto_recover=False)
self.index = self.open_index(transaction_id)
except (ValueError, OSError, FileIntegrityError) as exc:
logger.warning("Checking repository transaction due to previous error: %s", exc)
self.check_transaction()
self.index = self.open_index(transaction_id, auto_recover=False)
self.index = self.open_index(transaction_id)
if transaction_id is None:
self.segments = {} # XXX bad name: usage_count_of_segment_x = self.segments[x]
self.compact = FreeSpace() # XXX bad name: freeable_space_of_segment_x = self.compact[x]

View file

@ -1,3 +1,4 @@
import glob
import hashlib
import json
import os
@ -469,3 +470,58 @@ def test_transfer_rechunk(archivers, request, monkeypatch):
# Verify that the file hash is identical to the source
assert item.path in source_file_hashes, f"File {item.path} not found in source archive"
assert dest_hash == source_file_hashes[item.path], f"Content hash mismatch for {item.path}"
def test_issue_9022(archivers, request, monkeypatch):
    """
    Regression test for borgbackup/borg#9022: After "borg transfer --from-borg1",
    the source Borg 1.x repository index must not be changed.

    The test unpacks the bundled ``repo12.tar.gz`` next to the destination repo,
    records (mtime_ns, size, inode) of the single ``index.*`` file, runs
    ``repo-create`` and ``transfer`` with ``--from-borg1``, and then requires the
    recorded metadata to be unchanged.
    """
    archiver = request.getfixturevalue(archivers)
    if archiver.get_kind() in ["remote", "binary"]:
        pytest.skip("only works locally")

    # Prepare source (borg 1.2) repo from tarball next to this test file
    repo12_tar = os.path.join(os.path.dirname(__file__), "repo12.tar.gz")
    original_location = archiver.repository_location
    extract_dir = f"{original_location}1"
    os.makedirs(extract_dir)
    with tarfile.open(repo12_tar) as tf:
        tf.extractall(extract_dir)

    def index_meta(repo_path):
        """Return (mtime_ns, size, inode) of the only ``index.*`` file in *repo_path*."""
        index_files = sorted(glob.glob(os.path.join(repo_path, "index.*")))
        # This helper runs both before and after the transfer, so the message
        # must not claim a specific phase.
        assert len(index_files) == 1, f"Expected exactly 1 index file in {repo_path}, found {len(index_files)}"
        st = os.stat(index_files[0])
        # Return (mtime_ns, size, inode). Use fallbacks where attributes may not exist on some platforms.
        mtime_ns = getattr(st, "st_mtime_ns", int(st.st_mtime * 1e9))
        inode = getattr(st, "st_ino", None)
        return (mtime_ns, st.st_size, inode)

    # Record pre-transfer index file metadata
    pre_meta = index_meta(extract_dir)

    other_repo1 = f"--other-repo={original_location}1"
    # Destination repo where we transfer to (borg 2 repo)
    archiver.repository_location = f"{original_location}2"

    # Set passphrases: repo12 testdata uses "waytooeasyonlyfortests"
    monkeypatch.setenv("BORG_PASSPHRASE", "pw2")
    monkeypatch.setenv("BORG_OTHER_PASSPHRASE", "waytooeasyonlyfortests")
    # For this test, we must not weaken KDF, otherwise borg2 couldn't decrypt the borg1 key.
    # Set it through monkeypatch so the previous value is restored after the test
    # instead of leaking into whatever runs next.
    monkeypatch.setenv("BORG_TESTONLY_WEAKEN_KDF", "0")

    # Create destination repo and run transfer from borg1 source
    cmd(archiver, "repo-create", RK_ENCRYPTION, other_repo1, "--from-borg1")
    cmd(archiver, "transfer", other_repo1, "--from-borg1")

    # After transfer, ensure the source borg1 index file looks valid and unchanged.
    post_meta = index_meta(extract_dir)
    assert post_meta == pre_meta, (
        "Index file metadata changed after transfer!\n"
        f"Before: mtime_ns={pre_meta[0]}, size={pre_meta[1]}, inode={pre_meta[2]}\n"
        f"After: mtime_ns={post_meta[0]}, size={post_meta[1]}, inode={post_meta[2]}"
    )

View file

@ -571,21 +571,6 @@ def test_unreadable_hints(repository):
do_commit(repository)
def test_index(repository):
    """Overwrite ``index.1`` with junk bytes, then require ``do_commit`` to run without raising."""
    make_auxiliary(repository)
    index_file = os.path.join(repository.path, "index.1")
    with open(index_file, "wb") as out:
        out.write(b"123456789")
    do_commit(repository)
def test_index_outside_transaction(repository):
    """Overwrite ``index.1`` with junk bytes, then require the repository to report one entry when entered."""
    make_auxiliary(repository)
    index_file = os.path.join(repository.path, "index.1")
    with open(index_file, "wb") as out:
        out.write(b"123456789")
    with repository:
        entry_count = len(repository)
        assert entry_count == 1
def _corrupt_index(repository):
# HashIndex is able to detect incorrect headers and file lengths,
# but on its own it can't tell if the data is correct.
@ -601,15 +586,6 @@ def _corrupt_index(repository):
fd.write(corrupted_index_data)
def test_index_corrupted(repository):
    """After ``_corrupt_index``, the repository must still report one entry and return b"foo" for H(0)."""
    make_auxiliary(repository)
    _corrupt_index(repository)
    with repository:
        # data corruption is detected due to mismatching checksums, and fixed by rebuilding the index.
        entry_count = len(repository)
        assert entry_count == 1
        stored = repository.get(H(0))
        assert pdchunk(stored) == b"foo"
def test_index_corrupted_without_integrity(repository):
make_auxiliary(repository)
_corrupt_index(repository)