hashindex: add pack_id, obj_offset, obj_size to ChunkIndexEntry, refs #8572

Add three new fields to ChunkIndexEntry and update all call sites:

- pack_id (32 bytes): identifies the pack file containing the chunk
- obj_offset (uint32): byte offset of the chunk within the pack
- obj_size (uint32): stored (compressed) size of the chunk on disk

At N=1 (one chunk per pack), chunk_id == pack_id, obj_offset == 0, and obj_size == pack_size. All sites use chunk_id as the ChunkIndex key and extract pack_id as a separate variable with an N=1 comment.

- compact_cmd.py: use obj_size (stored size) in repository_size sum.
- cache.py: preserve pack fields when serializing the chunk index cache.
- repository.py: populate pack_id/obj_size from borgstore object info.
- archive.py: extract pack_id on its own line, obj_size=0 for now.
- hashindex.pyx: update add(), namedtuple, format string, and docstring.
- hashindex.pyi: add new fields to ChunkIndexEntry and CIE type alias.
- testsuite/hashindex_test.py: update all ChunkIndexEntry constructions.
2026-06-09 08:51:54 -04:00 · 2026-06-03 18:21:13 +05:30 · 2026-06-03 18:21:13 +05:30 · 8f28844a1e
commit 8f28844a1e
parent ec331057a9
7 changed files with 44 additions and 28 deletions
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@ -2013,7 +2013,10 @@ class ArchiveChecker:
            # either we already have this chunk in repo and chunks index or we add it now
            if id_ not in self.chunks:
                assert cdata is not None
-                self.chunks[id_] = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=size)
+                pack_id = id_  # only correct for N=1
+                self.chunks[id_] = ChunkIndexEntry(
+                    flags=ChunkIndex.F_USED, size=size, pack_id=pack_id, obj_offset=0, obj_size=0
+                )
                if self.repair:
                    self.repository.put(id_, cdata)

--- a/src/borg/archiver/compact_cmd.py
+++ b/src/borg/archiver/compact_cmd.py
@ -35,7 +35,7 @@ class ArchiveGarbageCollector:
    def repository_size(self):
        if self.chunks is None or not self.stats:
            return None
-        return sum(entry.size for id, entry in self.chunks.iteritems())  # sum of stored sizes
+        return sum(entry.obj_size for id, entry in self.chunks.iteritems())  # sum of stored sizes

    def garbage_collect(self):
        """Removes unused chunks from a repository."""
@ -53,12 +53,14 @@ class ArchiveGarbageCollector:
        if self.stats:  # slow method: build a fresh chunks index, with stored chunk sizes.
            logger.info("Getting object IDs present in the repository...")
            chunks = ChunkIndex()
-            for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
+            for pack_id, pack_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
                # we add this id to the chunks index (as unused chunk), because
                # we do not know yet whether it is actually referenced from some archives.
-                # we "abuse" the size field here. usually there is the plaintext size,
-                # but we use it for the size of the stored object here.
-                chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
+                chunk_id = pack_id  # N=1: chunk_id == pack_id
+                obj_size = pack_size  # true for N=1
+                chunks[chunk_id] = ChunkIndexEntry(
+                    flags=ChunkIndex.F_NONE, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size
+                )
        else:  # faster: rely on existing chunks index (with flags F_NONE and size 0).
            logger.info("Getting object IDs from cached chunks index...")
            chunks = build_chunkindex_from_repo(self.repository, cache_immediately=True)
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@ -544,8 +544,7 @@ CHUNKINDEX_HASH_SEED = b"0001"  # increment seed to invalidate old chunk indexes
 def write_chunkindex_to_repo_cache(
    repository, chunks, *, incremental=True, clear=False, force_write=False, delete_other=False, delete_these=None
 ):
-    # for now, we don't want to serialize the flags or the size, just the keys (chunk IDs):
-    cleaned_value = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=0)
+    # for now, we don't want to serialize the flags or the size:
    chunks_to_write = ChunkIndex()
    # incremental==True:
    # the borghash code has no means to only serialize the F_NEW table entries,
@ -553,8 +552,8 @@ def write_chunkindex_to_repo_cache(
    # incremental==False:
    # maybe copying the stuff into a new ChunkIndex is not needed here,
    # but for simplicity, we do it anyway.
-    for key, _ in chunks.iteritems(only_new=incremental):
-        chunks_to_write[key] = cleaned_value
+    for key, existing in chunks.iteritems(only_new=incremental):
+        chunks_to_write[key] = existing._replace(flags=ChunkIndex.F_NONE, size=0)
    with io.BytesIO() as f:
        chunks_to_write.write(f)
        data = f.getvalue()
@ -644,10 +643,13 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
    num_chunks = 0
    # The repo says it has these chunks, so we assume they are referenced/used chunks.
    # We do not know the plaintext size (!= stored_size), thus we set size = 0.
-    init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
-    for id, stored_size in repo_lister(repository, limit=LIST_SCAN_LIMIT):
+    for pack_id, pack_size in repo_lister(repository, limit=LIST_SCAN_LIMIT):
        num_chunks += 1
-        chunks[id] = init_entry
+        chunk_id = pack_id  # N=1: chunk_id == pack_id
+        obj_size = pack_size  # true for N=1
+        chunks[chunk_id] = ChunkIndexEntry(
+            flags=ChunkIndex.F_USED, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size
+        )
    # Cache does not contain the manifest.
    if not isinstance(repository, (Repository, RemoteRepository)):
        del chunks[Manifest.MANIFEST_ID]
--- a/src/borg/hashindex.pyi
+++ b/src/borg/hashindex.pyi
@ -6,9 +6,12 @@ class HTProxyMixin(MutableMapping): ...

 class ChunkIndexEntry(NamedTuple):
    flags: int
-    size: int
+    size: int  # plaintext chunk size
+    pack_id: bytes
+    obj_offset: int
+    obj_size: int

-CIE = Tuple[int, int] | Type[ChunkIndexEntry]
+CIE = Tuple[int, int, bytes, int, int] | Type[ChunkIndexEntry]

 class ChunkIndex:
    F_NONE: int
--- a/src/borg/hashindex.pyx
+++ b/src/borg/hashindex.pyx
@ -34,14 +34,14 @@ class HTProxyMixin:
        self.ht.clear()


-ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size')
-ChunkIndexEntryFormatT = namedtuple('ChunkIndexEntryFormatT', 'flags size')
-ChunkIndexEntryFormat = ChunkIndexEntryFormatT(flags="I", size="I")
+ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size pack_id obj_offset obj_size')
+ChunkIndexEntryFormatT = namedtuple('ChunkIndexEntryFormatT', 'flags size pack_id obj_offset obj_size')
+ChunkIndexEntryFormat = ChunkIndexEntryFormatT(flags="I", size="I", pack_id="32s", obj_offset="I", obj_size="I")


 class ChunkIndex(HTProxyMixin, MutableMapping):
    """
-    Mapping from key256 to (flags32, size32) to track chunks in the repository.
+    Mapping from key256 to (flags32, size32, pack_id256, obj_offset32, obj_size32) to track chunks in the repository.
    """
    # .flags related values:
    F_NONE = 0  # all flags cleared
@ -79,7 +79,8 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
        else:
            flags = v.flags | self.F_USED
            assert v.size == 0 or v.size == size
-        self[key] = ChunkIndexEntry(flags=flags, size=size)
+        pack_id = key  # N=1: chunk_id == pack_id
+        self[key] = ChunkIndexEntry(flags=flags, size=size, pack_id=pack_id, obj_offset=0, obj_size=0)

    def __getitem__(self, key):
        """Specialized __getitem__ that hides system flags."""
--- a/src/borg/repository.py
+++ b/src/borg/repository.py
@ -372,7 +372,6 @@ class Repository:
        # is using this object, but we assume that this is the case.
        # As we don't do garbage collection here, this is not a problem.
        # We also don't know the plaintext size, so we set it to 0.
-        init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
        infos = self.store.list("packs")
        try:
            for info in infos:
@ -410,8 +409,12 @@ class Repository:
                    # borg check: the index may have corrupted objects (we did not delete them)
                    # borg check --repair: the index will only have non-corrupted objects.
                    pack_id = hex_to_bin(info.name)
+                    pack_size = info.size
                    chunk_id = pack_id  # N=1: chunk_id == pack_id
-                    chunks[chunk_id] = init_entry
+                    obj_size = pack_size  # correct for N=1
+                    chunks[chunk_id] = ChunkIndexEntry(
+                        flags=ChunkIndex.F_USED, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size
+                    )
                now = time.monotonic()
                if now > t_last_checkpoint + 300:  # checkpoint every 5 mins
                    t_last_checkpoint = now
--- a/src/borg/testsuite/hashindex_test.py
+++ b/src/borg/testsuite/hashindex_test.py
@ -20,11 +20,11 @@ def test_chunkindex_add():
    chunks = ChunkIndex()
    x = H2(1)
    chunks.add(x, 0)
-    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0, pack_id=x, obj_offset=0, obj_size=0)
    chunks.add(x, 2)  # updating size (we do not have a size yet)
-    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2, pack_id=x, obj_offset=0, obj_size=0)
    chunks.add(x, 2)
-    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2, pack_id=x, obj_offset=0, obj_size=0)
    with pytest.raises(AssertionError):
        chunks.add(x, 3)  # inconsistent size (we already have a different size)

@ -35,7 +35,7 @@ def test_keyerror():
    with pytest.raises(KeyError):
        chunks[x]
    with pytest.raises(struct.error):
-        chunks[x] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=2**33)
+        chunks[x] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=2**33, pack_id=x, obj_offset=0, obj_size=0)


 def test_new():
@ -43,8 +43,10 @@ def test_new():
        return list(chunks.iteritems(only_new=True))

    chunks = ChunkIndex()
-    key1, value1a = H2(1), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=23)
-    key2, value2a = H2(2), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=42)
+    key1 = H2(1)
+    value1a = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=23, pack_id=key1, obj_offset=0, obj_size=0)
+    key2 = H2(2)
+    value2a = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=42, pack_id=key2, obj_offset=0, obj_size=0)
    # Tracking of new entries
    assert new_chunks() == []
    chunks[key1] = value1a