From 8f28844a1e111fa5271e8fc050fbbc6520ec504f Mon Sep 17 00:00:00 2001
From: Mrityunjay Raj
Date: Wed, 3 Jun 2026 18:21:13 +0530
Subject: [PATCH] hashindex: add pack_id, obj_offset, obj_size to ChunkIndexEntry, refs #8572

Add three new fields to ChunkIndexEntry and update all call sites:

- pack_id (32 bytes): identifies the pack file containing the chunk
- obj_offset (uint32): byte offset of the chunk within the pack
- obj_size (uint32): stored (compressed) size of the chunk on disk

At N=1 (one chunk per pack), chunk_id == pack_id, obj_offset == 0, and
obj_size == pack_size. All sites use chunk_id as the ChunkIndex key and
extract pack_id as a separate variable with an N=1 comment.

compact_cmd.py: use obj_size (stored size) in repository_size sum.
cache.py: preserve pack fields when serializing the chunk index cache.
repository.py: populate pack_id/obj_size from borgstore object info.
archive.py: extract pack_id on its own line, obj_size=0 for now.
hashindex.pyx: update add(), namedtuple, format string, and docstring.
hashindex.pyi: add new fields to ChunkIndexEntry and CIE type alias.
testsuite/hashindex_test.py: update all ChunkIndexEntry constructions.
--- src/borg/archive.py | 5 ++++- src/borg/archiver/compact_cmd.py | 12 +++++++----- src/borg/cache.py | 16 +++++++++------- src/borg/hashindex.pyi | 7 +++++-- src/borg/hashindex.pyx | 11 ++++++----- src/borg/repository.py | 7 +++++-- src/borg/testsuite/hashindex_test.py | 14 ++++++++------ 7 files changed, 44 insertions(+), 28 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 305b7894f..81a5fe196 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -2013,7 +2013,10 @@ class ArchiveChecker: # either we already have this chunk in repo and chunks index or we add it now if id_ not in self.chunks: assert cdata is not None - self.chunks[id_] = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=size) + pack_id = id_ # only correct for N=1 + self.chunks[id_] = ChunkIndexEntry( + flags=ChunkIndex.F_USED, size=size, pack_id=pack_id, obj_offset=0, obj_size=0 + ) if self.repair: self.repository.put(id_, cdata) diff --git a/src/borg/archiver/compact_cmd.py b/src/borg/archiver/compact_cmd.py index c25bb03e1..66ac4bfbc 100644 --- a/src/borg/archiver/compact_cmd.py +++ b/src/borg/archiver/compact_cmd.py @@ -35,7 +35,7 @@ class ArchiveGarbageCollector: def repository_size(self): if self.chunks is None or not self.stats: return None - return sum(entry.size for id, entry in self.chunks.iteritems()) # sum of stored sizes + return sum(entry.obj_size for id, entry in self.chunks.iteritems()) # sum of stored sizes def garbage_collect(self): """Removes unused chunks from a repository.""" @@ -53,12 +53,14 @@ class ArchiveGarbageCollector: if self.stats: # slow method: build a fresh chunks index, with stored chunk sizes. 
logger.info("Getting object IDs present in the repository...") chunks = ChunkIndex() - for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT): + for pack_id, pack_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT): # we add this id to the chunks index (as unused chunk), because # we do not know yet whether it is actually referenced from some archives. - # we "abuse" the size field here. usually there is the plaintext size, - # but we use it for the size of the stored object here. - chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size) + chunk_id = pack_id # N=1: chunk_id == pack_id + obj_size = pack_size # true for N=1 + chunks[chunk_id] = ChunkIndexEntry( + flags=ChunkIndex.F_NONE, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size + ) else: # faster: rely on existing chunks index (with flags F_NONE and size 0). logger.info("Getting object IDs from cached chunks index...") chunks = build_chunkindex_from_repo(self.repository, cache_immediately=True) diff --git a/src/borg/cache.py b/src/borg/cache.py index c9b5793db..d6138db86 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -544,8 +544,7 @@ CHUNKINDEX_HASH_SEED = b"0001" # increment seed to invalidate old chunk indexes def write_chunkindex_to_repo_cache( repository, chunks, *, incremental=True, clear=False, force_write=False, delete_other=False, delete_these=None ): - # for now, we don't want to serialize the flags or the size, just the keys (chunk IDs): - cleaned_value = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=0) + # for now, we don't want to serialize the flags or the size: chunks_to_write = ChunkIndex() # incremental==True: # the borghash code has no means to only serialize the F_NEW table entries, @@ -553,8 +552,8 @@ def write_chunkindex_to_repo_cache( # incremental==False: # maybe copying the stuff into a new ChunkIndex is not needed here, # but for simplicity, we do it anyway. 
- for key, _ in chunks.iteritems(only_new=incremental): - chunks_to_write[key] = cleaned_value + for key, existing in chunks.iteritems(only_new=incremental): + chunks_to_write[key] = existing._replace(flags=ChunkIndex.F_NONE, size=0) with io.BytesIO() as f: chunks_to_write.write(f) data = f.getvalue() @@ -644,10 +643,13 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi num_chunks = 0 # The repo says it has these chunks, so we assume they are referenced/used chunks. # We do not know the plaintext size (!= stored_size), thus we set size = 0. - init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0) - for id, stored_size in repo_lister(repository, limit=LIST_SCAN_LIMIT): + for pack_id, pack_size in repo_lister(repository, limit=LIST_SCAN_LIMIT): num_chunks += 1 - chunks[id] = init_entry + chunk_id = pack_id # N=1: chunk_id == pack_id + obj_size = pack_size # true for N=1 + chunks[chunk_id] = ChunkIndexEntry( + flags=ChunkIndex.F_USED, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size + ) # Cache does not contain the manifest. if not isinstance(repository, (Repository, RemoteRepository)): del chunks[Manifest.MANIFEST_ID] diff --git a/src/borg/hashindex.pyi b/src/borg/hashindex.pyi index 7241de8eb..a978c587a 100644 --- a/src/borg/hashindex.pyi +++ b/src/borg/hashindex.pyi @@ -6,9 +6,12 @@ class HTProxyMixin(MutableMapping): ... 
class ChunkIndexEntry(NamedTuple): flags: int - size: int + size: int # plaintext chunk size + pack_id: bytes + obj_offset: int + obj_size: int -CIE = Tuple[int, int] | Type[ChunkIndexEntry] +CIE = Tuple[int, int, bytes, int, int] | Type[ChunkIndexEntry] class ChunkIndex: F_NONE: int diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx index e5328b425..e4eb43274 100644 --- a/src/borg/hashindex.pyx +++ b/src/borg/hashindex.pyx @@ -34,14 +34,14 @@ class HTProxyMixin: self.ht.clear() -ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size') -ChunkIndexEntryFormatT = namedtuple('ChunkIndexEntryFormatT', 'flags size') -ChunkIndexEntryFormat = ChunkIndexEntryFormatT(flags="I", size="I") +ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size pack_id obj_offset obj_size') +ChunkIndexEntryFormatT = namedtuple('ChunkIndexEntryFormatT', 'flags size pack_id obj_offset obj_size') +ChunkIndexEntryFormat = ChunkIndexEntryFormatT(flags="I", size="I", pack_id="32s", obj_offset="I", obj_size="I") class ChunkIndex(HTProxyMixin, MutableMapping): """ - Mapping from key256 to (flags32, size32) to track chunks in the repository. + Mapping from key256 to (flags32, size32, pack_id256, obj_offset32, obj_size32) to track chunks in the repository. """ # .flags related values: F_NONE = 0 # all flags cleared @@ -79,7 +79,8 @@ class ChunkIndex(HTProxyMixin, MutableMapping): else: flags = v.flags | self.F_USED assert v.size == 0 or v.size == size - self[key] = ChunkIndexEntry(flags=flags, size=size) + pack_id = key # N=1: chunk_id == pack_id + self[key] = ChunkIndexEntry(flags=flags, size=size, pack_id=pack_id, obj_offset=0, obj_size=0) def __getitem__(self, key): """Specialized __getitem__ that hides system flags.""" diff --git a/src/borg/repository.py b/src/borg/repository.py index bba937e33..51e1af70d 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -372,7 +372,6 @@ class Repository: # is using this object, but we assume that this is the case. 
# As we don't do garbage collection here, this is not a problem. # We also don't know the plaintext size, so we set it to 0. - init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0) infos = self.store.list("packs") try: for info in infos: @@ -410,8 +409,12 @@ class Repository: # borg check: the index may have corrupted objects (we did not delete them) # borg check --repair: the index will only have non-corrupted objects. pack_id = hex_to_bin(info.name) + pack_size = info.size chunk_id = pack_id # N=1: chunk_id == pack_id - chunks[chunk_id] = init_entry + obj_size = pack_size # correct for N=1 + chunks[chunk_id] = ChunkIndexEntry( + flags=ChunkIndex.F_USED, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size + ) now = time.monotonic() if now > t_last_checkpoint + 300: # checkpoint every 5 mins t_last_checkpoint = now diff --git a/src/borg/testsuite/hashindex_test.py b/src/borg/testsuite/hashindex_test.py index 2425921ae..8c9ec5be9 100644 --- a/src/borg/testsuite/hashindex_test.py +++ b/src/borg/testsuite/hashindex_test.py @@ -20,11 +20,11 @@ def test_chunkindex_add(): chunks = ChunkIndex() x = H2(1) chunks.add(x, 0) - assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0) + assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0, pack_id=x, obj_offset=0, obj_size=0) chunks.add(x, 2) # updating size (we do not have a size yet) - assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2) + assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2, pack_id=x, obj_offset=0, obj_size=0) chunks.add(x, 2) - assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2) + assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2, pack_id=x, obj_offset=0, obj_size=0) with pytest.raises(AssertionError): chunks.add(x, 3) # inconsistent size (we already have a different size) @@ -35,7 +35,7 @@ def test_keyerror(): with pytest.raises(KeyError): chunks[x] with pytest.raises(struct.error): - chunks[x] = 
ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=2**33) + chunks[x] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=2**33, pack_id=x, obj_offset=0, obj_size=0) def test_new(): @@ -43,8 +43,10 @@ def test_new(): return list(chunks.iteritems(only_new=True)) chunks = ChunkIndex() - key1, value1a = H2(1), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=23) - key2, value2a = H2(2), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=42) + key1 = H2(1) + value1a = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=23, pack_id=key1, obj_offset=0, obj_size=0) + key2 = H2(2) + value2a = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=42, pack_id=key2, obj_offset=0, obj_size=0) # Tracking of new entries assert new_chunks() == [] chunks[key1] = value1a