repository: add BORGPACK pack header, bump repo version to 4, refs #8572

Wrap each pack file in a 13-byte header (magic + version + blob_len) so
packs are self-identifying and the [len][blob] unit extends to N>1 without
a format revision. Bump version 3->4: packs/ and 49-byte ObjHeader are
incompatible with version-3 readers. Fix test_extra_chunks chunk_id mismatch.
This commit is contained in:
Mrityunjay Raj 2026-06-01 00:47:14 +05:30
parent 1cb8d99425
commit 05ce0a1897
2 changed files with 59 additions and 28 deletions

View file

@ -17,9 +17,16 @@ from .helpers import bin_to_hex, hex_to_bin
from .storelocking import Lock
from .logger import create_logger
from .manifest import NoManifestError
from struct import Struct
from .repoobj import RepoObj, OBJ_MAGIC, OBJ_VERSION
from .crypto.key import is_keyfile
PACK_MAGIC = b"BORGPACK"
PACK_VERSION = 0x01
_pack_header = Struct("<8sBI") # magic(8) + version(1) + blob_len(4)
PACK_HEADER_SIZE = _pack_header.size # 13 bytes
logger = create_logger(__name__)
@ -174,7 +181,7 @@ class Repository:
self._send_log = send_log_cb or (lambda: None)
self.do_create = create
self.created = False
self.acceptable_repo_versions = (3,)
self.acceptable_repo_versions = (4,)
self.opened = False
self.lock = None
self.do_lock = lock
@ -212,10 +219,10 @@ class Repository:
self.store.open()
try:
self.store.store("config/readme", REPOSITORY_README.encode())
self.version = 3
self.version = 4
self.store.store("config/version", str(self.version).encode())
self.store.store("config/id", bin_to_hex(os.urandom(32)).encode())
# we know repo/data/ still does not have any chunks stored in it,
# we know repo/packs/ still does not have any chunks stored in it,
# but for some stores, there might be a lot of empty directories and
# listing them all might be rather slow, so we better cache an empty
# ChunkIndex from here so that the first repo operation does not have
@ -329,25 +336,38 @@ class Repository:
def check_object(obj):
"""Check if obj looks valid."""
hdr_size = RepoObj.obj_header.size
obj_size = len(obj)
if obj_size >= hdr_size:
hdr = RepoObj.ObjHeader(*RepoObj.obj_header.unpack(obj[:hdr_size]))
if hdr.magic != OBJ_MAGIC:
log_error("invalid object magic.")
elif hdr.version != OBJ_VERSION:
log_error(f"unsupported object version: {hdr.version}.")
elif hdr.chunk_id != hex_to_bin(info.name):
log_error("chunk_id mismatch in header.")
else:
meta = obj[hdr_size : hdr_size + hdr.meta_size]
if hdr.meta_size != len(meta):
log_error("metadata size mismatch.")
data = obj[hdr_size + hdr.meta_size : hdr_size + hdr.meta_size + hdr.data_size]
if hdr.data_size != len(data):
log_error("data size mismatch.")
else:
if len(obj) < PACK_HEADER_SIZE:
log_error("too small.")
return
magic, version, blob_len = _pack_header.unpack(obj[:PACK_HEADER_SIZE])
if magic != PACK_MAGIC:
log_error("invalid pack magic.")
return
if version != PACK_VERSION:
log_error(f"unsupported pack version: {version}.")
return
blob = obj[PACK_HEADER_SIZE:]
if len(blob) != blob_len:
log_error(f"pack blob_len mismatch: header says {blob_len}, actual {len(blob)}.")
return
hdr_size = RepoObj.obj_header.size
if len(blob) < hdr_size:
log_error("too small.")
return
hdr = RepoObj.ObjHeader(*RepoObj.obj_header.unpack(blob[:hdr_size]))
if hdr.magic != OBJ_MAGIC:
log_error("invalid object magic.")
elif hdr.version != OBJ_VERSION:
log_error(f"unsupported object version: {hdr.version}.")
elif hdr.chunk_id != hex_to_bin(info.name):
log_error("chunk_id mismatch in header.")
else:
meta = blob[hdr_size : hdr_size + hdr.meta_size]
if hdr.meta_size != len(meta):
log_error("metadata size mismatch.")
data = blob[hdr_size + hdr.meta_size : hdr_size + hdr.meta_size + hdr.data_size]
if hdr.data_size != len(data):
log_error("data size mismatch.")
# TODO: progress indicator, ...
partial = bool(max_duration)
@ -488,14 +508,15 @@ class Repository:
key = "packs/" + bin_to_hex(pack_id)
try:
if read_data:
# read everything
return self.store.load(key)
raw = self.store.load(key)
return raw[PACK_HEADER_SIZE:]
else:
# RepoObj layout supports separately encrypted metadata and data.
# We return enough bytes so the client can decrypt the metadata.
hdr_size = RepoObj.obj_header.size
extra_size = 1024 - hdr_size # load a bit more, 1024b, reduces round trips
obj = self.store.load(key, size=hdr_size + extra_size)
raw = self.store.load(key, size=PACK_HEADER_SIZE + hdr_size + extra_size)
obj = raw[PACK_HEADER_SIZE:]
hdr = obj[0:hdr_size]
if len(hdr) != hdr_size:
raise IntegrityError(f"Object too small [id {id_hex}]: expected {hdr_size}, got {len(hdr)} bytes")
@ -503,7 +524,8 @@ class Repository:
if meta_size > extra_size:
# we did not get enough, need to load more, but not all.
# this should be rare, as chunk metadata is usually rather small.
obj = self.store.load(key, size=hdr_size + meta_size)
raw = self.store.load(key, size=PACK_HEADER_SIZE + hdr_size + meta_size)
obj = raw[PACK_HEADER_SIZE:]
meta = obj[hdr_size : hdr_size + meta_size]
if len(meta) != meta_size:
raise IntegrityError(f"Object too small [id {id_hex}]: expected {meta_size}, got {len(meta)} bytes")
@ -531,13 +553,21 @@ class Repository:
pack_id = id # N=1: pack_id == chunk_id
key = "packs/" + bin_to_hex(pack_id)
self.store.store(key, data)
pack_hdr = _pack_header.pack(PACK_MAGIC, PACK_VERSION, data_size)
self.store.store(key, pack_hdr + data)
def delete(self, id, wait=True):
"""delete a repo object
Note: when doing calls with wait=False this gets async and caller must
deal with async results / exceptions later.
N=1: pack_id == chunk_id, so deleting the pack file is equivalent to
deleting the chunk. Hard delete is safe here.
N>1: a pack contains multiple chunks. Individual chunks cannot be deleted
from a pack without rewriting it. This method must become a soft-delete
(no-op) before N>1 is implemented; compact() will then be the sole
mechanism for reclaiming space based on live-ratio thresholds.
"""
self._lock_refresh()
pack_id = id # N=1: pack_id == chunk_id

View file

@ -351,8 +351,9 @@ def test_extra_chunks(archivers, request):
check_cmd_setup(archiver)
cmd(archiver, "check", exit_code=0)
with Repository(archiver.repository_location, exclusive=True) as repository:
chunk = fchunk(b"xxxx")
repository.put(b"01234567890123456789012345678901", chunk)
key = b"01234567890123456789012345678901"
chunk = fchunk(b"xxxx", chunk_id=key)
repository.put(key, chunk)
cmd(archiver, "check", "-v", exit_code=0) # check does not deal with orphans anymore