mirror of
https://github.com/borgbackup/borg.git
synced 2026-06-09 08:51:54 -04:00
hashindex: add pack_id, obj_offset, obj_size to ChunkIndexEntry, refs #8572
Add three new fields to ChunkIndexEntry and update all call sites:

- pack_id (32 bytes): identifies the pack file containing the chunk
- obj_offset (uint32): byte offset of the chunk within the pack
- obj_size (uint32): stored (compressed) size of the chunk on disk

At N=1 (one chunk per pack), chunk_id == pack_id, obj_offset == 0, and obj_size == pack_size. All sites use chunk_id as the ChunkIndex key and extract pack_id as a separate variable with an N=1 comment.

compact_cmd.py: use obj_size (stored size) in repository_size sum.
cache.py: preserve pack fields when serializing the chunk index cache.
repository.py: populate pack_id/obj_size from borgstore object info.
archive.py: extract pack_id on its own line, obj_size=0 for now.
hashindex.pyx: update add(), namedtuple, format string, and docstring.
hashindex.pyi: add new fields to ChunkIndexEntry and CIE type alias.
testsuite/hashindex_test.py: update all ChunkIndexEntry constructions.
This commit is contained in:
parent
ec331057a9
commit
8f28844a1e
7 changed files with 44 additions and 28 deletions
|
|
@ -2013,7 +2013,10 @@ class ArchiveChecker:
|
|||
# either we already have this chunk in repo and chunks index or we add it now
|
||||
if id_ not in self.chunks:
|
||||
assert cdata is not None
|
||||
self.chunks[id_] = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=size)
|
||||
pack_id = id_ # only correct for N=1
|
||||
self.chunks[id_] = ChunkIndexEntry(
|
||||
flags=ChunkIndex.F_USED, size=size, pack_id=pack_id, obj_offset=0, obj_size=0
|
||||
)
|
||||
if self.repair:
|
||||
self.repository.put(id_, cdata)
|
||||
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ class ArchiveGarbageCollector:
|
|||
def repository_size(self):
|
||||
if self.chunks is None or not self.stats:
|
||||
return None
|
||||
return sum(entry.size for id, entry in self.chunks.iteritems()) # sum of stored sizes
|
||||
return sum(entry.obj_size for id, entry in self.chunks.iteritems()) # sum of stored sizes
|
||||
|
||||
def garbage_collect(self):
|
||||
"""Removes unused chunks from a repository."""
|
||||
|
|
@ -53,12 +53,14 @@ class ArchiveGarbageCollector:
|
|||
if self.stats: # slow method: build a fresh chunks index, with stored chunk sizes.
|
||||
logger.info("Getting object IDs present in the repository...")
|
||||
chunks = ChunkIndex()
|
||||
for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
|
||||
for pack_id, pack_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
|
||||
# we add this id to the chunks index (as unused chunk), because
|
||||
# we do not know yet whether it is actually referenced from some archives.
|
||||
# we "abuse" the size field here. usually there is the plaintext size,
|
||||
# but we use it for the size of the stored object here.
|
||||
chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
|
||||
chunk_id = pack_id # N=1: chunk_id == pack_id
|
||||
obj_size = pack_size # true for N=1
|
||||
chunks[chunk_id] = ChunkIndexEntry(
|
||||
flags=ChunkIndex.F_NONE, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size
|
||||
)
|
||||
else: # faster: rely on existing chunks index (with flags F_NONE and size 0).
|
||||
logger.info("Getting object IDs from cached chunks index...")
|
||||
chunks = build_chunkindex_from_repo(self.repository, cache_immediately=True)
|
||||
|
|
|
|||
|
|
@ -544,8 +544,7 @@ CHUNKINDEX_HASH_SEED = b"0001" # increment seed to invalidate old chunk indexes
|
|||
def write_chunkindex_to_repo_cache(
|
||||
repository, chunks, *, incremental=True, clear=False, force_write=False, delete_other=False, delete_these=None
|
||||
):
|
||||
# for now, we don't want to serialize the flags or the size, just the keys (chunk IDs):
|
||||
cleaned_value = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=0)
|
||||
# for now, we don't want to serialize the flags or the size:
|
||||
chunks_to_write = ChunkIndex()
|
||||
# incremental==True:
|
||||
# the borghash code has no means to only serialize the F_NEW table entries,
|
||||
|
|
@ -553,8 +552,8 @@ def write_chunkindex_to_repo_cache(
|
|||
# incremental==False:
|
||||
# maybe copying the stuff into a new ChunkIndex is not needed here,
|
||||
# but for simplicity, we do it anyway.
|
||||
for key, _ in chunks.iteritems(only_new=incremental):
|
||||
chunks_to_write[key] = cleaned_value
|
||||
for key, existing in chunks.iteritems(only_new=incremental):
|
||||
chunks_to_write[key] = existing._replace(flags=ChunkIndex.F_NONE, size=0)
|
||||
with io.BytesIO() as f:
|
||||
chunks_to_write.write(f)
|
||||
data = f.getvalue()
|
||||
|
|
@ -644,10 +643,13 @@ def build_chunkindex_from_repo(repository, *, disable_caches=False, cache_immedi
|
|||
num_chunks = 0
|
||||
# The repo says it has these chunks, so we assume they are referenced/used chunks.
|
||||
# We do not know the plaintext size (!= stored_size), thus we set size = 0.
|
||||
init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
|
||||
for id, stored_size in repo_lister(repository, limit=LIST_SCAN_LIMIT):
|
||||
for pack_id, pack_size in repo_lister(repository, limit=LIST_SCAN_LIMIT):
|
||||
num_chunks += 1
|
||||
chunks[id] = init_entry
|
||||
chunk_id = pack_id # N=1: chunk_id == pack_id
|
||||
obj_size = pack_size # true for N=1
|
||||
chunks[chunk_id] = ChunkIndexEntry(
|
||||
flags=ChunkIndex.F_USED, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size
|
||||
)
|
||||
# Cache does not contain the manifest.
|
||||
if not isinstance(repository, (Repository, RemoteRepository)):
|
||||
del chunks[Manifest.MANIFEST_ID]
|
||||
|
|
|
|||
|
|
@ -6,9 +6,12 @@ class HTProxyMixin(MutableMapping): ...
|
|||
|
||||
class ChunkIndexEntry(NamedTuple):
|
||||
flags: int
|
||||
size: int
|
||||
size: int # plaintext chunk size
|
||||
pack_id: bytes
|
||||
obj_offset: int
|
||||
obj_size: int
|
||||
|
||||
CIE = Tuple[int, int] | Type[ChunkIndexEntry]
|
||||
CIE = Tuple[int, int, bytes, int, int] | Type[ChunkIndexEntry]
|
||||
|
||||
class ChunkIndex:
|
||||
F_NONE: int
|
||||
|
|
|
|||
|
|
@ -34,14 +34,14 @@ class HTProxyMixin:
|
|||
self.ht.clear()
|
||||
|
||||
|
||||
ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size')
|
||||
ChunkIndexEntryFormatT = namedtuple('ChunkIndexEntryFormatT', 'flags size')
|
||||
ChunkIndexEntryFormat = ChunkIndexEntryFormatT(flags="I", size="I")
|
||||
ChunkIndexEntry = namedtuple('ChunkIndexEntry', 'flags size pack_id obj_offset obj_size')
|
||||
ChunkIndexEntryFormatT = namedtuple('ChunkIndexEntryFormatT', 'flags size pack_id obj_offset obj_size')
|
||||
ChunkIndexEntryFormat = ChunkIndexEntryFormatT(flags="I", size="I", pack_id="32s", obj_offset="I", obj_size="I")
|
||||
|
||||
|
||||
class ChunkIndex(HTProxyMixin, MutableMapping):
|
||||
"""
|
||||
Mapping from key256 to (flags32, size32) to track chunks in the repository.
|
||||
Mapping from key256 to (flags32, size32, pack_id256, obj_offset32, obj_size32) to track chunks in the repository.
|
||||
"""
|
||||
# .flags related values:
|
||||
F_NONE = 0 # all flags cleared
|
||||
|
|
@ -79,7 +79,8 @@ class ChunkIndex(HTProxyMixin, MutableMapping):
|
|||
else:
|
||||
flags = v.flags | self.F_USED
|
||||
assert v.size == 0 or v.size == size
|
||||
self[key] = ChunkIndexEntry(flags=flags, size=size)
|
||||
pack_id = key # N=1: chunk_id == pack_id
|
||||
self[key] = ChunkIndexEntry(flags=flags, size=size, pack_id=pack_id, obj_offset=0, obj_size=0)
|
||||
|
||||
def __getitem__(self, key):
|
||||
"""Specialized __getitem__ that hides system flags."""
|
||||
|
|
|
|||
|
|
@ -372,7 +372,6 @@ class Repository:
|
|||
# is using this object, but we assume that this is the case.
|
||||
# As we don't do garbage collection here, this is not a problem.
|
||||
# We also don't know the plaintext size, so we set it to 0.
|
||||
init_entry = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
|
||||
infos = self.store.list("packs")
|
||||
try:
|
||||
for info in infos:
|
||||
|
|
@ -410,8 +409,12 @@ class Repository:
|
|||
# borg check: the index may have corrupted objects (we did not delete them)
|
||||
# borg check --repair: the index will only have non-corrupted objects.
|
||||
pack_id = hex_to_bin(info.name)
|
||||
pack_size = info.size
|
||||
chunk_id = pack_id # N=1: chunk_id == pack_id
|
||||
chunks[chunk_id] = init_entry
|
||||
obj_size = pack_size # correct for N=1
|
||||
chunks[chunk_id] = ChunkIndexEntry(
|
||||
flags=ChunkIndex.F_USED, size=0, pack_id=pack_id, obj_offset=0, obj_size=obj_size
|
||||
)
|
||||
now = time.monotonic()
|
||||
if now > t_last_checkpoint + 300: # checkpoint every 5 mins
|
||||
t_last_checkpoint = now
|
||||
|
|
|
|||
|
|
@ -20,11 +20,11 @@ def test_chunkindex_add():
|
|||
chunks = ChunkIndex()
|
||||
x = H2(1)
|
||||
chunks.add(x, 0)
|
||||
assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0)
|
||||
assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=0, pack_id=x, obj_offset=0, obj_size=0)
|
||||
chunks.add(x, 2) # updating size (we do not have a size yet)
|
||||
assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
|
||||
assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2, pack_id=x, obj_offset=0, obj_size=0)
|
||||
chunks.add(x, 2)
|
||||
assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2)
|
||||
assert chunks[x] == ChunkIndexEntry(flags=ChunkIndex.F_USED, size=2, pack_id=x, obj_offset=0, obj_size=0)
|
||||
with pytest.raises(AssertionError):
|
||||
chunks.add(x, 3) # inconsistent size (we already have a different size)
|
||||
|
||||
|
|
@ -35,7 +35,7 @@ def test_keyerror():
|
|||
with pytest.raises(KeyError):
|
||||
chunks[x]
|
||||
with pytest.raises(struct.error):
|
||||
chunks[x] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=2**33)
|
||||
chunks[x] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=2**33, pack_id=x, obj_offset=0, obj_size=0)
|
||||
|
||||
|
||||
def test_new():
|
||||
|
|
@ -43,8 +43,10 @@ def test_new():
|
|||
return list(chunks.iteritems(only_new=True))
|
||||
|
||||
chunks = ChunkIndex()
|
||||
key1, value1a = H2(1), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=23)
|
||||
key2, value2a = H2(2), ChunkIndexEntry(flags=ChunkIndex.F_USED, size=42)
|
||||
key1 = H2(1)
|
||||
value1a = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=23, pack_id=key1, obj_offset=0, obj_size=0)
|
||||
key2 = H2(2)
|
||||
value2a = ChunkIndexEntry(flags=ChunkIndex.F_USED, size=42, pack_id=key2, obj_offset=0, obj_size=0)
|
||||
# Tracking of new entries
|
||||
assert new_chunks() == []
|
||||
chunks[key1] = value1a
|
||||
|
|
|
|||
Loading…
Reference in a new issue