From fb6192bf86af31109581828eb3de279b4882fdf2 Mon Sep 17 00:00:00 2001 From: Mrityunjay Raj Date: Sat, 23 May 2026 13:27:51 +0530 Subject: [PATCH] legacy: move NSIndex1 to borg.legacy.hashindex, refs #9556 --- src/borg/hashindex.pyi | 14 +-- src/borg/hashindex.pyx | 100 ------------------ src/borg/legacy/hashindex.py | 108 ++++++++++++++++++++ src/borg/legacy/repository.py | 2 +- src/borg/testsuite/legacyrepository_test.py | 2 +- 5 files changed, 113 insertions(+), 113 deletions(-) create mode 100644 src/borg/legacy/hashindex.py diff --git a/src/borg/hashindex.pyi b/src/borg/hashindex.pyi index 60c4fa8da..7241de8eb 100644 --- a/src/borg/hashindex.pyi +++ b/src/borg/hashindex.pyi @@ -1,7 +1,9 @@ -from typing import NamedTuple, Tuple, Type, IO, Iterator, Any +from typing import NamedTuple, Tuple, Type, IO, Iterator, Any, MutableMapping PATH_OR_FILE = str | IO +class HTProxyMixin(MutableMapping): ... + class ChunkIndexEntry(NamedTuple): flags: int size: int @@ -22,16 +24,6 @@ class ChunkIndex: def __getitem__(self, key: bytes) -> Type[ChunkIndexEntry]: ... def __setitem__(self, key: bytes, value: CIE) -> None: ... -class NSIndex1Entry(NamedTuple): - segment: int - offset: int - -class NSIndex1: # legacy - def iteritems(self, *args, **kwargs) -> Iterator: ... - def __contains__(self, key: bytes) -> bool: ... - def __getitem__(self, key: bytes) -> Any: ... - def __setitem__(self, key: bytes, value: Any) -> None: ... - class FuseVersionsIndexEntry(NamedTuple): version: int hash: bytes diff --git a/src/borg/hashindex.pyx b/src/borg/hashindex.pyx index a62a0491a..e5328b425 100644 --- a/src/borg/hashindex.pyx +++ b/src/borg/hashindex.pyx @@ -143,103 +143,3 @@ class FuseVersionsIndex(HTProxyMixin, MutableMapping): """ def __init__(self): self.ht = HashTableNT(key_size=16, value_type=FuseVersionsIndexEntry, value_format=FuseVersionsIndexEntryFormat) - - -NSIndex1Entry = namedtuple('NSIndex1Entry', 'segment offset') -NSIndex1EntryFormatT = namedtuple('NSIndex1EntryFormatT', 'segment offset') -NSIndex1EntryFormat = NSIndex1EntryFormatT(segment="I", offset="I") - - -class NSIndex1(HTProxyMixin, MutableMapping): - """ - Mapping from key256 to (segment32, offset32), as used by the legacy repository index of Borg 1.x. - """ - MAX_VALUE = 2**32 - 1 # borghash has the full uint32_t range - MAGIC = b"BORG_IDX" # borg 1.x - HEADER_FMT = "<8sIIBB" # magic, entries, buckets, ksize, vsize - KEY_SIZE = 32 - VALUE_SIZE = 8 - - def __init__(self, capacity=1000, path=None, usable=None): - if usable is not None: - capacity = usable * 2 # load factor 0.5 - self.ht = HashTableNT(key_size=self.KEY_SIZE, value_type=NSIndex1Entry, value_format=NSIndex1EntryFormat, - capacity=capacity) - if path: - self._read(path) - - def iteritems(self, marker=None): - do_yield = marker is None - for key, value in self.ht.items(): - if do_yield: - yield key, value - else: - do_yield = key == marker - - @classmethod - def read(cls, path): - return cls(path=path) - - def size(self): - return self.ht.size() # not quite correct as this is not the on-disk read-only format. - - def write(self, path): - if isinstance(path, str): - with open(path, 'wb') as fd: - self._write_fd(fd) - else: - self._write_fd(path) - - def _read(self, path): - if isinstance(path, str): - with open(path, 'rb') as fd: - self._read_fd(fd) - else: - self._read_fd(path) - - def _write_fd(self, fd): - used = len(self.ht) - header_bytes = struct.pack(self.HEADER_FMT, self.MAGIC, used, used, self.KEY_SIZE, self.VALUE_SIZE) - fd.write(header_bytes) - # record the header as a separate integrity-hash part if supported - hash_part = getattr(fd, "hash_part", None) - if hash_part: - hash_part("HashHeader") - count = 0 - for key, _ in self.ht.items(): - value = self.ht._get_raw(key) - fd.write(key) - fd.write(value) - count += 1 - assert count == used - - def _read_fd(self, fd): - header_size = struct.calcsize(self.HEADER_FMT) - header_bytes = fd.read(header_size) - if len(header_bytes) < header_size: - raise ValueError(f"Invalid file: file is too short (header).") - # verify the header as a separate integrity-hash part if supported - hash_part = getattr(fd, "hash_part", None) - if hash_part: - hash_part("HashHeader") - magic, entries, buckets, ksize, vsize = struct.unpack(self.HEADER_FMT, header_bytes) - if magic != self.MAGIC: - raise ValueError(f"Invalid file: magic {self.MAGIC.decode()} not found.") - assert ksize == self.KEY_SIZE, "invalid key size" - assert vsize == self.VALUE_SIZE, "invalid value size" - buckets_size = buckets * (ksize + vsize) - current_pos = fd.tell() - end_of_file = fd.seek(0, os.SEEK_END) - if current_pos + buckets_size != end_of_file: - raise ValueError(f"Invalid file: file size does not match (buckets).") - fd.seek(current_pos) - for i in range(buckets): - key = fd.read(ksize) - value = fd.read(vsize) - if value.startswith(b'\xFF\xFF\xFF\xFF'): # LE for 0xffffffff (empty/unused bucket) - continue - if value.startswith(b'\xFE\xFF\xFF\xFF'): # LE for 0xfffffffe (deleted/tombstone bucket) - continue - self.ht._set_raw(key, value) - pos = fd.tell() - assert pos == end_of_file diff --git a/src/borg/legacy/hashindex.py b/src/borg/legacy/hashindex.py new file mode 100644 index 000000000..2dad9db39 --- /dev/null +++ b/src/borg/legacy/hashindex.py @@ -0,0 +1,108 @@ +from collections.abc import MutableMapping +from collections import namedtuple +import os +import struct + +from borghash import HashTableNT + +from ..hashindex import HTProxyMixin + + +NSIndex1Entry = namedtuple("NSIndex1Entry", "segment offset") +NSIndex1EntryFormatT = namedtuple("NSIndex1EntryFormatT", "segment offset") +NSIndex1EntryFormat = NSIndex1EntryFormatT(segment="I", offset="I") + + +class NSIndex1(HTProxyMixin, MutableMapping): + """ + Mapping from key256 to (segment32, offset32), as used by the legacy repository index of Borg 1.x. + """ + + MAX_VALUE = 2**32 - 1 # borghash has the full uint32_t range + MAGIC = b"BORG_IDX" # borg 1.x + HEADER_FMT = "<8sIIBB" # magic, entries, buckets, ksize, vsize + KEY_SIZE = 32 + VALUE_SIZE = 8 + + def __init__(self, capacity=1000, path=None, usable=None): + if usable is not None: + capacity = usable * 2 # load factor 0.5 + self.ht = HashTableNT( + key_size=self.KEY_SIZE, value_type=NSIndex1Entry, value_format=NSIndex1EntryFormat, capacity=capacity + ) + if path: + self._read(path) + + def iteritems(self, marker=None): + do_yield = marker is None + for key, value in self.ht.items(): + if do_yield: + yield key, value + else: + do_yield = key == marker + + @classmethod + def read(cls, path): + return cls(path=path) + + def size(self): + return self.ht.size() # not quite correct as this is not the on-disk read-only format. + + def write(self, path): + if isinstance(path, str): + with open(path, "wb") as fd: + self._write_fd(fd) + else: + self._write_fd(path) + + def _read(self, path): + if isinstance(path, str): + with open(path, "rb") as fd: + self._read_fd(fd) + else: + self._read_fd(path) + + def _write_fd(self, fd): + used = len(self.ht) + header_bytes = struct.pack(self.HEADER_FMT, self.MAGIC, used, used, self.KEY_SIZE, self.VALUE_SIZE) + fd.write(header_bytes) + hash_part = getattr(fd, "hash_part", None) + if hash_part: + hash_part("HashHeader") + count = 0 + for key, _ in self.ht.items(): + value = self.ht._get_raw(key) + fd.write(key) + fd.write(value) + count += 1 + assert count == used + + def _read_fd(self, fd): + header_size = struct.calcsize(self.HEADER_FMT) + header_bytes = fd.read(header_size) + if len(header_bytes) < header_size: + raise ValueError("Invalid file: file is too short (header).") + hash_part = getattr(fd, "hash_part", None) + if hash_part: + hash_part("HashHeader") + magic, entries, buckets, ksize, vsize = struct.unpack(self.HEADER_FMT, header_bytes) + if magic != self.MAGIC: + raise ValueError(f"Invalid file: magic {self.MAGIC.decode()} not found.") + assert ksize == self.KEY_SIZE, "invalid key size" + assert vsize == self.VALUE_SIZE, "invalid value size" + buckets_size = buckets * (ksize + vsize) + current_pos = fd.tell() + end_of_file = fd.seek(0, os.SEEK_END) + if current_pos + buckets_size != end_of_file: + raise ValueError("Invalid file: file size does not match (buckets).") + fd.seek(current_pos) + for i in range(buckets): + key = fd.read(ksize) + value = fd.read(vsize) + if value.startswith(b"\xff\xff\xff\xff"): # LE for 0xffffffff (empty/unused bucket) + continue + if value.startswith(b"\xfe\xff\xff\xff"): # LE for 0xfffffffe (deleted/tombstone bucket) + continue + self.ht._set_raw(key, value) + pos = fd.tell() + assert pos == end_of_file diff --git a/src/borg/legacy/repository.py b/src/borg/legacy/repository.py index 60e375e3e..5ae48e774 100644 --- a/src/borg/legacy/repository.py +++ b/src/borg/legacy/repository.py @@ -16,7 +16,7 @@ from zlib import crc32 import xxhash from ..constants import * # NOQA -from ..hashindex import NSIndex1Entry, NSIndex1 +from .hashindex import NSIndex1Entry, NSIndex1 from ..helpers import Error, ErrorWithTraceback, IntegrityError, format_file_size, parse_file_size from ..helpers import Location from ..helpers import ProgressIndicatorPercent diff --git a/src/borg/testsuite/legacyrepository_test.py b/src/borg/testsuite/legacyrepository_test.py index 28e10adcf..a1da6248a 100644 --- a/src/borg/testsuite/legacyrepository_test.py +++ b/src/borg/testsuite/legacyrepository_test.py @@ -7,7 +7,7 @@ from unittest.mock import patch import pytest from xxhash import xxh64 -from ..hashindex import NSIndex1 +from ..legacy.hashindex import NSIndex1 from ..helpers import Location from ..helpers import IntegrityError from ..helpers import msgpack