From 05ce0a1897889e267dd776a0df5701e1b37344b7 Mon Sep 17 00:00:00 2001 From: Mrityunjay Raj Date: Mon, 1 Jun 2026 00:47:14 +0530 Subject: [PATCH] repository: add BORGPACK pack header, bump repo version to 4, refs #8572 Wrap each pack file in a 13-byte header (magic + version + blob_len) so packs are self-identifying and the [len][blob] unit extends to N>1 without a format revision. Bump version 3->4: packs/ and 49-byte ObjHeader are incompatible with version-3 readers. Fix test_extra_chunks chunk_id mismatch. --- src/borg/repository.py | 82 +++++++++++++------ src/borg/testsuite/archiver/check_cmd_test.py | 5 +- 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index 158aa88d3..e7fb330f4 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -17,9 +17,16 @@ from .helpers import bin_to_hex, hex_to_bin from .storelocking import Lock from .logger import create_logger from .manifest import NoManifestError +from struct import Struct + from .repoobj import RepoObj, OBJ_MAGIC, OBJ_VERSION from .crypto.key import is_keyfile +PACK_MAGIC = b"BORGPACK" +PACK_VERSION = 0x01 +_pack_header = Struct("<8sBI") # magic(8) + version(1) + blob_len(4) +PACK_HEADER_SIZE = _pack_header.size # 13 bytes + logger = create_logger(__name__) @@ -174,7 +181,7 @@ class Repository: self._send_log = send_log_cb or (lambda: None) self.do_create = create self.created = False - self.acceptable_repo_versions = (3,) + self.acceptable_repo_versions = (4,) self.opened = False self.lock = None self.do_lock = lock @@ -212,10 +219,10 @@ class Repository: self.store.open() try: self.store.store("config/readme", REPOSITORY_README.encode()) - self.version = 3 + self.version = 4 self.store.store("config/version", str(self.version).encode()) self.store.store("config/id", bin_to_hex(os.urandom(32)).encode()) - # we know repo/data/ still does not have any chunks stored in it, + # we know repo/packs/ still does not have any chunks stored in it, # but for some stores, there might be a lot of empty directories and # listing them all might be rather slow, so we better cache an empty # ChunkIndex from here so that the first repo operation does not have @@ -329,25 +336,38 @@ class Repository: def check_object(obj): """Check if obj looks valid.""" - hdr_size = RepoObj.obj_header.size - obj_size = len(obj) - if obj_size >= hdr_size: - hdr = RepoObj.ObjHeader(*RepoObj.obj_header.unpack(obj[:hdr_size])) - if hdr.magic != OBJ_MAGIC: - log_error("invalid object magic.") - elif hdr.version != OBJ_VERSION: - log_error(f"unsupported object version: {hdr.version}.") - elif hdr.chunk_id != hex_to_bin(info.name): - log_error("chunk_id mismatch in header.") - else: - meta = obj[hdr_size : hdr_size + hdr.meta_size] - if hdr.meta_size != len(meta): - log_error("metadata size mismatch.") - data = obj[hdr_size + hdr.meta_size : hdr_size + hdr.meta_size + hdr.data_size] - if hdr.data_size != len(data): - log_error("data size mismatch.") - else: + if len(obj) < PACK_HEADER_SIZE: log_error("too small.") + return + magic, version, blob_len = _pack_header.unpack(obj[:PACK_HEADER_SIZE]) + if magic != PACK_MAGIC: + log_error("invalid pack magic.") + return + if version != PACK_VERSION: + log_error(f"unsupported pack version: {version}.") + return + blob = obj[PACK_HEADER_SIZE:] + if len(blob) != blob_len: + log_error(f"pack blob_len mismatch: header says {blob_len}, actual {len(blob)}.") + return + hdr_size = RepoObj.obj_header.size + if len(blob) < hdr_size: + log_error("too small.") + return + hdr = RepoObj.ObjHeader(*RepoObj.obj_header.unpack(blob[:hdr_size])) + if hdr.magic != OBJ_MAGIC: + log_error("invalid object magic.") + elif hdr.version != OBJ_VERSION: + log_error(f"unsupported object version: {hdr.version}.") + elif hdr.chunk_id != hex_to_bin(info.name): + log_error("chunk_id mismatch in header.") + else: + meta = blob[hdr_size : hdr_size + hdr.meta_size] + if hdr.meta_size != len(meta): + log_error("metadata size mismatch.") + data = blob[hdr_size + hdr.meta_size : hdr_size + hdr.meta_size + hdr.data_size] + if hdr.data_size != len(data): + log_error("data size mismatch.") # TODO: progress indicator, ... partial = bool(max_duration) @@ -488,14 +508,15 @@ class Repository: key = "packs/" + bin_to_hex(pack_id) try: if read_data: - # read everything - return self.store.load(key) + raw = self.store.load(key) + return raw[PACK_HEADER_SIZE:] else: # RepoObj layout supports separately encrypted metadata and data. # We return enough bytes so the client can decrypt the metadata. hdr_size = RepoObj.obj_header.size extra_size = 1024 - hdr_size # load a bit more, 1024b, reduces round trips - obj = self.store.load(key, size=hdr_size + extra_size) + raw = self.store.load(key, size=PACK_HEADER_SIZE + hdr_size + extra_size) + obj = raw[PACK_HEADER_SIZE:] hdr = obj[0:hdr_size] if len(hdr) != hdr_size: raise IntegrityError(f"Object too small [id {id_hex}]: expected {hdr_size}, got {len(hdr)} bytes") @@ -503,7 +524,8 @@ class Repository: if meta_size > extra_size: # we did not get enough, need to load more, but not all. # this should be rare, as chunk metadata is rather small usually. - obj = self.store.load(key, size=hdr_size + meta_size) + raw = self.store.load(key, size=PACK_HEADER_SIZE + hdr_size + meta_size) + obj = raw[PACK_HEADER_SIZE:] meta = obj[hdr_size : hdr_size + meta_size] if len(meta) != meta_size: raise IntegrityError(f"Object too small [id {id_hex}]: expected {meta_size}, got {len(meta)} bytes") @@ -531,13 +553,21 @@ class Repository: pack_id = id # N=1: pack_id == chunk_id key = "packs/" + bin_to_hex(pack_id) - self.store.store(key, data) + pack_hdr = _pack_header.pack(PACK_MAGIC, PACK_VERSION, data_size) + self.store.store(key, pack_hdr + data) def delete(self, id, wait=True): """delete a repo object Note: when doing calls with wait=False this gets async and caller must deal with async results / exceptions later. + + N=1: pack_id == chunk_id, so deleting the pack file is equivalent to + deleting the chunk. Hard delete is safe here. + N>1: a pack contains multiple chunks. Individual chunks cannot be deleted + from a pack without rewriting it. This method must become a soft-delete + (no-op) before N>1 is implemented; compact() will then be the sole + mechanism for reclaiming space based on live-ratio thresholds. """ self._lock_refresh() pack_id = id # N=1: pack_id == chunk_id diff --git a/src/borg/testsuite/archiver/check_cmd_test.py b/src/borg/testsuite/archiver/check_cmd_test.py index aeccfd91d..7d6290c90 100644 --- a/src/borg/testsuite/archiver/check_cmd_test.py +++ b/src/borg/testsuite/archiver/check_cmd_test.py @@ -351,8 +351,9 @@ def test_extra_chunks(archivers, request): check_cmd_setup(archiver) cmd(archiver, "check", exit_code=0) with Repository(archiver.repository_location, exclusive=True) as repository: - chunk = fchunk(b"xxxx") - repository.put(b"01234567890123456789012345678901", chunk) + key = b"01234567890123456789012345678901" + chunk = fchunk(b"xxxx", chunk_id=key) + repository.put(key, chunk) cmd(archiver, "check", "-v", exit_code=0) # check does not deal with orphans anymore