Merge pull request #9677 from mr-raj12/pack-files-step1-remove-xxh64

repoobj: remove xxh64 checksums from blob header
This commit is contained in:
TW 2026-05-30 17:58:39 +02:00 committed by GitHub
commit 910f223e59
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 39 additions and 70 deletions

View file

@ -1,8 +1,6 @@
from collections import namedtuple
from struct import Struct
from xxhash import xxh64
from .constants import * # NOQA
from .helpers import msgpack, workarounds
from .helpers.errors import IntegrityError
@ -13,10 +11,9 @@ AUTHENTICATED_NO_KEY = "authenticated_no_key" in workarounds
class RepoObj:
# Object header format includes size information for parsing the object into meta and data,
# as well as hashes to enable checking consistency without having the borg key.
obj_header = Struct("<II8s8s") # meta size (32b), data size (32b), meta hash (64b), data hash (64b)
ObjHeader = namedtuple("ObjHeader", "meta_size data_size meta_hash data_hash")
# Object header: sizes of the encrypted meta and data sections.
obj_header = Struct("<II") # meta size (32b), data size (32b)
ObjHeader = namedtuple("ObjHeader", "meta_size data_size")
@classmethod
def extract_crypted_data(cls, data: bytes) -> bytes:
@ -67,9 +64,7 @@ class RepoObj:
data_encrypted = self.key.encrypt(id, data_compressed)
meta_packed = msgpack.packb(meta)
meta_encrypted = self.key.encrypt(id, meta_packed)
hdr = self.ObjHeader(
len(meta_encrypted), len(data_encrypted), xxh64(meta_encrypted).digest(), xxh64(data_encrypted).digest()
)
hdr = self.ObjHeader(len(meta_encrypted), len(data_encrypted))
hdr_packed = self.obj_header.pack(*hdr)
return hdr_packed + meta_encrypted + data_encrypted

View file

@ -2,8 +2,6 @@ import os
import time
from pathlib import Path
from xxhash import xxh64
from borgstore.store import Store
from borgstore.store import ObjectNotFound as StoreObjectNotFound
from borgstore.backends.errors import BackendError as StoreBackendError
@ -307,13 +305,9 @@ class Repository:
meta = obj[hdr_size : hdr_size + hdr.meta_size]
if hdr.meta_size != len(meta):
log_error("metadata size incorrect.")
elif hdr.meta_hash != xxh64(meta).digest():
log_error("metadata does not match checksum.")
data = obj[hdr_size + hdr.meta_size : hdr_size + hdr.meta_size + hdr.data_size]
if hdr.data_size != len(data):
log_error("data size incorrect.")
elif hdr.data_hash != xxh64(data).digest():
log_error("data does not match checksum.")
else:
log_error("too small.")

View file

@ -362,50 +362,34 @@ def test_verify_data(archivers, request, init_args):
if archiver.get_kind() != "local":
pytest.skip("only works locally, patches objects")
# it's tricky to test the cryptographic data verification, because usually already the
# repository-level xxh64 hash fails to verify. So we use a fake one that doesn't.
# note: it only works like tested here for a highly engineered data corruption attack,
# because with accidental corruption, usually already the xxh64 low-level check fails.
def fake_xxh64(data, seed=0):
# xxhash.xxh64.digest() returns -> bytes
class FakeDigest:
def digest(self):
return b"fakefake"
check_cmd_setup(archiver)
shutil.rmtree(archiver.repository_path)
cmd(archiver, "repo-create", *init_args)
create_src_archive(archiver, "archive1")
archive, repository = open_archive(archiver.repository_path, "archive1")
with repository:
for item in archive.iter_items():
if item.path.endswith(src_file):
chunk = item.chunks[-1]
data = repository.get(chunk.id)
data = data[0:123] + b"x" + data[123:]
repository.put(chunk.id, data)
break
return FakeDigest()
# the normal archives check does not read file content data.
cmd(archiver, "check", "--archives-only", exit_code=0)
# but with --verify-data, it does and notices the issue.
output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
assert f"{bin_to_hex(chunk.id)}, integrity error" in output
import borg.repoobj
import borg.repository
# repair will find the defect chunk and remove it
output = cmd(archiver, "check", "--repair", "--verify-data", exit_code=0)
assert f"{bin_to_hex(chunk.id)}, integrity error" in output
assert f"{src_file}: Missing file chunk detected" in output
with patch.object(borg.repoobj, "xxh64", fake_xxh64), patch.object(borg.repository, "xxh64", fake_xxh64):
check_cmd_setup(archiver)
shutil.rmtree(archiver.repository_path)
cmd(archiver, "repo-create", *init_args)
create_src_archive(archiver, "archive1")
archive, repository = open_archive(archiver.repository_path, "archive1")
with repository:
for item in archive.iter_items():
if item.path.endswith(src_file):
chunk = item.chunks[-1]
data = repository.get(chunk.id)
data = data[0:123] + b"x" + data[123:]
repository.put(chunk.id, data)
break
# the normal archives check does not read file content data.
cmd(archiver, "check", "--archives-only", exit_code=0)
# but with --verify-data, it does and notices the issue.
output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
assert f"{bin_to_hex(chunk.id)}, integrity error" in output
# repair will find the defect chunk and remove it
output = cmd(archiver, "check", "--repair", "--verify-data", exit_code=0)
assert f"{bin_to_hex(chunk.id)}, integrity error" in output
assert f"{src_file}: Missing file chunk detected" in output
# run with --verify-data again, it will notice the missing chunk.
output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
assert f"{src_file}: Missing file chunk detected" in output
# run with --verify-data again, it will notice the missing chunk.
output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
assert f"{src_file}: Missing file chunk detected" in output
@pytest.mark.parametrize("init_args", [["--encryption=repokey-aes-ocb"], ["--encryption", "none"]])
@ -427,13 +411,13 @@ def test_corrupted_file_chunk(archivers, request, init_args):
repository.put(chunk.id, data)
break
# the normal check checks all repository objects and the xxh64 checksum fails.
output = cmd(archiver, "check", "--repository-only", exit_code=1)
assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output
# --verify-data decrypts and catches the corruption.
output = cmd(archiver, "check", "--archives-only", "--verify-data", exit_code=1)
assert f"{bin_to_hex(chunk.id)}, integrity error" in output
# repair: the defect chunk will be removed by repair.
output = cmd(archiver, "check", "--repair", exit_code=0)
assert f"{bin_to_hex(chunk.id)} is corrupted: data does not match checksum." in output
# repair: the defect chunk will be removed.
output = cmd(archiver, "check", "--repair", "--verify-data", exit_code=0)
assert f"{bin_to_hex(chunk.id)}, integrity error" in output
assert f"{src_file}: Missing file chunk detected" in output
# run normal check again

View file

@ -5,8 +5,6 @@ import sys
from unittest.mock import patch
import pytest
from xxhash import xxh64
from ..legacy.hashindex import NSIndex1
from ..helpers import Location
from ..helpers import IntegrityError
@ -75,7 +73,7 @@ def get_path(repository):
def fchunk(data, meta=b""):
# Create a raw chunk that has a valid RepoObj layout but does not use encryption or compression.
hdr = RepoObj.obj_header.pack(len(meta), len(data), xxh64(meta).digest(), xxh64(data).digest())
hdr = RepoObj.obj_header.pack(len(meta), len(data))
assert isinstance(data, bytes)
chunk = hdr + meta + data
return chunk
@ -150,7 +148,7 @@ def test_multiple_transactions(repo_fixtures, request):
def test_read_data(repo_fixtures, request):
with get_repository_from_fixture(repo_fixtures, request) as repository:
meta, data = b"meta", b"data"
hdr = RepoObj.obj_header.pack(len(meta), len(data), xxh64(meta).digest(), xxh64(data).digest())
hdr = RepoObj.obj_header.pack(len(meta), len(data))
chunk_complete = hdr + meta + data
repository.put(H(0), chunk_complete)
repository.commit(compact=False)

View file

@ -3,8 +3,6 @@ import os
import sys
import pytest
from xxhash import xxh64
from ..helpers import Location
from ..helpers import IntegrityError
from ..platformflags import is_win32
@ -57,7 +55,7 @@ def reopen(repository, exclusive: bool | None = True, create=False):
def fchunk(data, meta=b""):
# Format chunk: create a raw chunk that has a valid RepoObj layout, but does not use encryption or compression.
hdr = RepoObj.obj_header.pack(len(meta), len(data), xxh64(meta).digest(), xxh64(data).digest())
hdr = RepoObj.obj_header.pack(len(meta), len(data))
assert isinstance(data, bytes)
chunk = hdr + meta + data
return chunk
@ -99,7 +97,7 @@ def test_basic_operations(repo_fixtures, request):
def test_read_data(repo_fixtures, request):
with get_repository_from_fixture(repo_fixtures, request) as repository:
meta, data = b"meta", b"data"
hdr = RepoObj.obj_header.pack(len(meta), len(data), xxh64(meta).digest(), xxh64(data).digest())
hdr = RepoObj.obj_header.pack(len(meta), len(data))
chunk_complete = hdr + meta + data
chunk_short = hdr + meta
repository.put(H(0), chunk_complete)