From 8f28844a1e111fa5271e8fc050fbbc6520ec504f Mon Sep 17 00:00:00 2001
From: Mrityunjay Raj <mr.raj.earth@gmail.com>
Date: Wed, 3 Jun 2026 18:21:13 +0530
Subject: [PATCH] hashindex: add pack_id, obj_offset, obj_size to
 ChunkIndexEntry, refs #8572

Add three new fields to ChunkIndexEntry and update all call sites:
- pack_id (32 bytes): identifies the pack file containing the chunk
- obj_offset (uint32): byte offset of the chunk within the pack
- obj_size (uint32): stored (compressed) size of the chunk on disk

At N=1 (one chunk per pack), chunk_id == pack_id, obj_offset == 0,
and obj_size == pack_size. All sites use chunk_id as the ChunkIndex
key and extract pack_id as a separate variable with an N=1 comment.

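compact_cmd.py: use obj_size (stored size) in the repository_size sum; the
fresh index built when stats are requested now puts the stored size into
obj_size and sets size=0 instead of abusing the size field.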
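cache.py: preserve the pack fields (pack_id, obj_offset, obj_size) when
serializing the chunk index cache (flags and size are still reset), and
populate pack_id/obj_size when building the index from the repo listing.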
repository.py: populate pack_id/obj_size from borgstore object info.
archive.py: assign pack_id on its own line (pack_id == chunk id at N=1);
obj_offset and obj_size are set to 0 for now.
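hashindex.pyx: update add(), the namedtuples, the format string, and the
docstring. Note: add() always writes obj_offset=0 and obj_size=0, also when
it updates an entry that already exists.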
hashindex.pyi: add new fields to ChunkIndexEntry and CIE type alias.
testsuite/hashindex_test.py: update all ChunkIndexEntry constructions.
---
 src/borg/archive.py                  |  5 ++++-
 src/borg/archiver/compact_cmd.py     | 12 +++++++-----
 src/borg/cache.py                    | 16 +++++++++-------
 src/borg/hashindex.pyi               |  7 +++++--
 src/borg/hashindex.pyx               | 11 ++++++-----
 src/borg/repository.py               |  7 +++++--
 src/borg/testsuite/hashindex_test.py | 14 ++++++++------
 7 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index 305b7894f..81a5fe196 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -2013,7 +2013,10 @@ class ArchiveChecker:
             # either we already have this chunk in repo and chunks index or we add it now
             if id_ not in self.chunks:
                 assert cdata is not None
-                self.chunks[id_] = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=size)
+                pack_id = id_  # only correct for N=1
+                self.chunks[id_] = ChunkIndexEntry(
+                    flags=ChunkIndex.F_USED, size=size, pack_id=pack_id, obj_offset=0, obj_size=0
+                )
                 if self.repair:
                     self.repository.put(id_, cdata)
 
diff --git a/src/borg/archiver/compact_cmd.py b/src/borg/archiver/compact_cmd.py
index c25bb03e1..66ac4bfbc 100644
--- a/src/borg/archiver/compact_cmd.py
+++ b/src/borg/archiver/compact_cmd.py
@@ -35,7 +35,7 @@ class ArchiveGarbageCollector:
     def repository_size(self):
         if self.chunks is None or not self.stats:
             return None
-        return sum(entry.size for id, entry in self.chunks.iteritems())  # sum of stored sizes
+        return sum(entry.obj_size for id, entry in self.chunks.iteritems())  # sum of stored sizes
 
     def garbage_collect(self):
         """Removes unused chunks from a repository."""
@@ -53,12 +53,14 @@ class ArchiveGarbageCollector:
         if self.stats:  # slow method: build a fresh chunks index, with stored chunk sizes.
             logger.info("Getting object IDs present in the repository...")
             chunks = ChunkIndex()
-            for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
+            for pack_id, pack_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
                 # we add this id to the chunks index (as unused chunk), because
                 # we do not know yet whether it is actually referenced from some archives.
-                # we "abuse" the size field here. usually there is the plaintext size,
-                # but we use it for the size of the stored object here.
-                chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
+                chunk_id = pack_id  # N=1: chunk_id == pack_id
+                obj_size = pack_size  # true for N=1
+                chunks[chunk_id] = ChunkIndexEntry(
+                    flags=ChunkIndex.F_NONE, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size
+                )
         else:  # faster: rely on existing chunks index (with flags F_NONE and size 0).
             logger.info("Getting object IDs from cached chunks index...")
             chunks = build_chunkindex_from_repo(self.repository, cache_immediately=True)
diff --git a/src/borg/cache.py b/src/borg/cache.py
index c9b5793db..d6138db86 100644
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@@ -544,8 +544,7 @@ CHUNKINDEX_HASH_SEED = b"0001"  # increment seed to invalidate old chunk indexes
 def write_chunkindex_to_repo_cache(
     repository, chunks, *, incremental=True, clear=False, force_write=False, delete_other=False, delete_these=None
 ):
-    # for now, we don't want to serialize the flags or the size, just the keys (chunk IDs):
-    cleaned_value = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=0)
+    # for now, we do not serialize the flags or the size (both are reset below), but we keep the pack fields.
     chunks_to_write = ChunkIndex()
     # incremental==True:
     # the borghash code has no means to only serialize the F_NEW table entries,
@@ -553,8 +552,8 @@ def write_chunkindex_to_repo_cache(
     # incremental==False:
     # maybe copying the stuff into a new ChunkIndex is not needed here,
     # but for simplicity, we do it anyway.
-    for key, _ in chunks.iteritems(only_new=incremental):
-        chunks_to_write[key] = cleaned_value
+    for key, existing in chunks.iteritems(only_new=incremental):
+        chunks_to_write[key] = existing._replace(flags=ChunkIndex.F_NONE, size=0)
     with io.BytesIO() as f:
         chunks_to_write.write(f)
         data = f.getvalue()
@@ -644,10 +643,13 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
     num_chunks = 0
     # The repo says it has these chunks, so we assume they are referenced/used chunks.
     # We do not know the plaintext size (!= stored_size), thus we set size = 0.
-    init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
-    for id, stored_size in repo_lister(repository, limit=LIST_SCAN_LIMIT):
+    for pack_id, pack_size in repo_lister(repository, limit=LIST_SCAN_LIMIT):
         num_chunks += 1
-        chunks[id] = init_entry
+        chunk_id = pack_id  # N=1: chunk_id == pack_id
+        obj_size = pack_size  # true for N=1
+        chunks[chunk_id] = ChunkIndexEntry(
+            flags=ChunkIndex.F_USED, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size
+        )
     # Cache does not contain the manifest.
     if not isinstance(repository, (Repository, RemoteRepository)):
         del chunks[Manifest.MANIFEST_ID]
diff --git a/src/borg/hashindex.pyi b/src/borg/hashindex.pyi
index 7241de8eb..a978c587a 100644
--- a/src/borg/hashindex.pyi
+++ b/src/borg/hashindex.pyi
@@ -6,9 +6,12 @@ class HTProxyMixin(MutableMapping): ...
 
 class ChunkIndexEntry(NamedTuple):
     flags: int
-    size: int
+    size: int  # plaintext chunk size
+    pack_id: bytes
+    obj_offset: int
+    obj_size: int
 
-CIE = Tuple[int, int] | Type[ChunkIndexEntry]
+CIE = Tuple[int, int, bytes, int, int] | Type[ChunkIndexEntry]
 
 class ChunkIndex:
     F_NONE: int
diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx
index e5328b425..e4eb43274 100644
--- a/src/borg/hashindex.pyx
+++ b/src/borg/hashindex.pyx
@@ -34,14 +34,14 @@ class HTProxyMixin:
         self.ht.clear()
 
 
-ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size')
-ChunkIndexEntryFormatT = namedtuple('ChunkIndexEntryFormatT', 'flags size')
-ChunkIndexEntryFormat = ChunkIndexEntryFormatT(flags="I", size="I")
+ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size pack_id obj_offset obj_size')
+ChunkIndexEntryFormatT = namedtuple('ChunkIndexEntryFormatT', 'flags size pack_id obj_offset obj_size')
+ChunkIndexEntryFormat = ChunkIndexEntryFormatT(flags="I", size="I", pack_id="32s", obj_offset="I", obj_size="I")
 
 
 class ChunkIndex(HTProxyMixin, MutableMapping):
     """
-    Mapping from key256 to (flags32, size32) to track chunks in the repository.
+    Mapping from key256 to (flags32, size32, pack_id256, obj_offset32, obj_size32) to track chunks in the repository.
     """
     # .flags related values:
     F_NONE = 0  # all flags cleared
@@ -79,7 +79,8 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
         else:
             flags = v.flags | self.F_USED
             assert v.size == 0 or v.size == size
-        self[key] = ChunkIndexEntry(flags=flags, size=size)
+        pack_id = key  # N=1: chunk_id == pack_id
+        self[key] = ChunkIndexEntry(flags=flags, size=size, pack_id=pack_id, obj_offset=0, obj_size=0)
 
     def __getitem__(self, key):
         """Specialized __getitem__ that hides system flags."""
diff --git a/src/borg/repository.py b/src/borg/repository.py
index bba937e33..51e1af70d 100644
--- a/src/borg/repository.py
+++ b/src/borg/repository.py
@@ -372,7 +372,6 @@ class Repository:
         # is using this object, but we assume that this is the case.
         # As we don't do garbage collection here, this is not a problem.
         # We also don't know the plaintext size, so we set it to 0.
-        init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
         infos = self.store.list("packs")
         try:
             for info in infos:
@@ -410,8 +409,12 @@ class Repository:
                     # borg check: the index may have corrupted objects (we did not delete them)
                     # borg check --repair: the index will only have non-corrupted objects.
                     pack_id = hex_to_bin(info.name)
+                    pack_size = info.size
                     chunk_id = pack_id  # N=1: chunk_id == pack_id
-                    chunks[chunk_id] = init_entry
+                    obj_size = pack_size  # correct for N=1
+                    chunks[chunk_id] = ChunkIndexEntry(
+                        flags=ChunkIndex.F_USED, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size
+                    )
                 now = time.monotonic()
                 if now > t_last_checkpoint + 300:  # checkpoint every 5 mins
                     t_last_checkpoint = now
diff --git a/src/borg/testsuite/hashindex_test.py b/src/borg/testsuite/hashindex_test.py
index 2425921ae..8c9ec5be9 100644
--- a/src/borg/testsuite/hashindex_test.py
+++ b/src/borg/testsuite/hashindex_test.py
@@ -20,11 +20,11 @@ def test_chunkindex_add():
     chunks = ChunkIndex()
     x = H2(1)
     chunks.add(x, 0)
-    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0, pack_id=x, obj_offset=0, obj_size=0)
     chunks.add(x, 2)  # updating size (we do not have a size yet)
-    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2, pack_id=x, obj_offset=0, obj_size=0)
     chunks.add(x, 2)
-    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
+    assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2, pack_id=x, obj_offset=0, obj_size=0)
     with pytest.raises(AssertionError):
         chunks.add(x, 3)  # inconsistent size (we already have a different size)
 
@@ -35,7 +35,7 @@ def test_keyerror():
     with pytest.raises(KeyError):
         chunks[x]
     with pytest.raises(struct.error):
-        chunks[x] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=2**33)
+        chunks[x] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=2**33, pack_id=x, obj_offset=0, obj_size=0)
 
 
 def test_new():
@@ -43,8 +43,10 @@ def test_new():
         return list(chunks.iteritems(only_new=True))
 
     chunks = ChunkIndex()
-    key1, value1a = H2(1), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=23)
-    key2, value2a = H2(2), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=42)
+    key1 = H2(1)
+    value1a = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=23, pack_id=key1, obj_offset=0, obj_size=0)
+    key2 = H2(2)
+    value2a = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=42, pack_id=key2, obj_offset=0, obj_size=0)
     # Tracking of new entries
     assert new_chunks() == []
     chunks[key1] = value1a