Merge pull request #9672 from ThomasWaldmann/less-xxh64

Use less xxh64, add SHA256FileHashingWrapper
2026-06-10 09:21:44 -04:00 · 2026-05-28 20:37:48 +02:00 · 2026-05-28 20:37:48 +02:00 · 33afaa1f3c
commit 33afaa1f3c
parent 9389d98c6d caf4d25d2d
7 changed files with 24 additions and 60 deletions
--- a/src/borg/archiver/benchmark_cmd.py
+++ b/src/borg/archiver/benchmark_cmd.py
@ -213,22 +213,6 @@ class BenchmarkMixIn:
            else:
                print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s")

-        from xxhash import xxh64
-        from zlib import crc32
-
-        if not args.json:
-            print("Non-cryptographic checksums / hashes ===========================")
-        else:
-            result["checksums"] = []
-        size = 1000000000
-        tests = [("xxh64", lambda: xxh64(random_10M).digest()), ("crc32 (zlib)", lambda: crc32(random_10M))]
-        for spec, func in tests:
-            dt = timeit(func, number=number_default)
-            if args.json:
-                result["checksums"].append({"algo": spec, "size": size, "time": dt})
-            else:
-                print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s")
-
        from ..crypto.low_level import hmac_sha256, blake2b_256

        if not args.json:
--- a/src/borg/archiver/check_cmd.py
+++ b/src/borg/archiver/check_cmd.py
@ -40,7 +40,7 @@ class CheckMixIn:
        if args.repair and args.max_duration:
            raise CommandError("--repair does not allow --max-duration argument.")
        if args.max_duration and not args.repo_only:
-            # when doing a partial repo check, we can only check xxh64 hashes in repository files.
+            # when doing a partial repo check, we can only do a low-level check of the repository files.
            # archives check requires that a full repo check was done before and has built/cached a ChunkIndex.
            # also, there is no max_duration support in the archives check code anyway.
            raise CommandError("--repository-only is required for --max-duration support.")
@ -123,13 +123,12 @@ class CheckMixIn:
        for the same reason. Therefore, partial checks may be useful only with very large
        repositories where a full check would take too long.

-        The ``--verify-data`` option will perform a full integrity verification (as
-        opposed to checking just the xxh64) of data, which means reading the
-        data from the repository, decrypting and decompressing it. It is a complete
-        cryptographic verification and hence very time-consuming, but will detect any
-        accidental and malicious corruption. Tamper-resistance is only guaranteed for
-        encrypted repositories against attackers without access to the keys. You cannot
-        use ``--verify-data`` with ``--repository-only``.
+        The ``--verify-data`` option will perform a full integrity verification of data,
+        which means reading the data from the repository, decrypting and decompressing it.
+        It is a complete cryptographic verification and hence very time-consuming, but
+        will detect any accidental or malicious corruption. Tamper-resistance is only
+        guaranteed for encrypted repositories against attackers without access to the keys.
+        You cannot use ``--verify-data`` with ``--repository-only``.

        The ``--find-lost-archives`` option will also scan the whole repository, but
        tells Borg to search for lost archive metadata. If Borg encounters any archive
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@ -1,4 +1,5 @@
 import configparser
+import hashlib
 import io
 import os
 import shutil
@ -8,8 +9,6 @@ from datetime import datetime, timezone, timedelta
 from pathlib import Path
 from time import perf_counter

-from xxhash import xxh64
-
 from borgstore.backends.errors import PermissionDenied

 from .logger import create_logger
@ -51,7 +50,7 @@ def files_cache_name(archive_name, files_cache_name="files"):
    # when not, the user may manually do that by using the env var.
    if not suffix:
        # avoid issues with too complex or long archive_name by hashing it:
-        suffix = xxh64(archive_name.encode()).hexdigest()
+        suffix = hashlib.sha256(archive_name.encode()).hexdigest()
    return files_cache_name + "." + suffix


@ -539,7 +538,7 @@ def delete_chunkindex_cache(repository):
    logger.debug(f"cached chunk indexes deleted: {hashes}")


-CHUNKINDEX_HASH_SEED = 3
+CHUNKINDEX_HASH_SEED = b"0001"  # increment seed to invalidate old chunk indexes


 def write_chunkindex_to_repo_cache(
@ -564,11 +563,11 @@ def write_chunkindex_to_repo_cache(
    if clear:
        # if we don't need the in-memory chunks index anymore:
        chunks.clear()  # free memory, immediately
-    new_hash = xxh64(data, seed=CHUNKINDEX_HASH_SEED).hexdigest()
+    new_hash = hashlib.sha256(data + CHUNKINDEX_HASH_SEED).hexdigest()
    cached_hashes = list_chunkindex_hashes(repository)
    if force_write or new_hash not in cached_hashes:
        # when an updated chunks index is stored into the cache, we also store its hash as part of the name.
-        # when a client is loading the chunks index from a cache, it has to compare its xxh64
+        # when a client is loading the chunks index from a cache, it has to compare its content
        # hash against the hash in its name. if it is the same, the cache is valid.
        # if it is different, the cache is either corrupted or out of date and has to be discarded.
        # when some functionality is DELETING chunks from the repository, it has to delete
@ -605,7 +604,7 @@ def read_chunkindex_from_repo_cache(repository, hash):
    except StoreObjectNotFound:
        logger.debug(f"{cache_name} not found in the repository.")
    else:
-        if xxh64(chunks_data, seed=CHUNKINDEX_HASH_SEED).digest() == hex_to_bin(hash):
+        if hashlib.sha256(chunks_data + CHUNKINDEX_HASH_SEED).digest() == hex_to_bin(hash):
            logger.debug(f"{cache_name} is valid.")
            with io.BytesIO(chunks_data) as f:
                chunks = ChunkIndex.read(f)
--- a/src/borg/crypto/file_integrity.py
+++ b/src/borg/crypto/file_integrity.py
@ -5,8 +5,6 @@ from hmac import compare_digest
 from collections.abc import Callable
 from pathlib import Path

-from xxhash import xxh64
-
 from ..helpers import IntegrityError
 from ..logger import create_logger

@ -106,20 +104,12 @@ class FileHashingWrapper(FileLikeWrapper):
        self.hash.update(str(self.tell()).encode())


-class SHA512FileHashingWrapper(FileHashingWrapper):
-    ALGORITHM = "SHA512"
-    FACTORY = hashlib.sha512
+class SHA256FileHashingWrapper(FileHashingWrapper):
+    ALGORITHM = "SHA256"
+    FACTORY = hashlib.sha256


-class XXH64FileHashingWrapper(FileHashingWrapper):
-    ALGORITHM = "XXH64"
-    FACTORY = xxh64
-
-
-SUPPORTED_ALGORITHMS = {
-    SHA512FileHashingWrapper.ALGORITHM: SHA512FileHashingWrapper,
-    XXH64FileHashingWrapper.ALGORITHM: XXH64FileHashingWrapper,
-}
+SUPPORTED_ALGORITHMS = {SHA256FileHashingWrapper.ALGORITHM: SHA256FileHashingWrapper}


 class FileIntegrityError(IntegrityError):
@ -137,7 +127,7 @@ class IntegrityCheckedFile(FileLikeWrapper):
        self.file_opened = override_fd is None
        self.digests = {}

-        hash_cls = XXH64FileHashingWrapper
+        hash_cls = SHA256FileHashingWrapper

        if not write:
            algorithm_and_digests = self.load_integrity_data(path, integrity_data)
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@ -968,9 +968,9 @@ class ArchiveFormatter(BaseFormatter):


 class ItemFormatter(BaseFormatter):
-    # we provide the hash algos from python stdlib (except shake_*) and additionally xxh64.
+    # we provide the hash algos from python stdlib (except shake_*).
    # shake_* is not provided because it uses an incompatible .digest() method to support variable length.
-    hash_algorithms = set(hashlib.algorithms_guaranteed).union({"xxh64"}).difference({"shake_128", "shake_256"})
+    hash_algorithms = set(hashlib.algorithms_guaranteed).difference({"shake_128", "shake_256"})
    KEY_DESCRIPTIONS = {
        "type": "file type (file, dir, symlink, ...)",
        "mode": "file mode (as in stat)",
@ -992,7 +992,6 @@ class ItemFormatter(BaseFormatter):
        "isomtime": "file modification time (ISO 8601 format)",
        "isoctime": "file change time (ISO 8601 format)",
        "isoatime": "file access time (ISO 8601 format)",
-        "xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
        "fingerprint": "Fingerprint of the file content (may have false negatives), format: H(conditions)-H(chunk_ids)",
        "archiveid": "internal ID of the archive",
        "archivename": "name of the archive",
@ -1013,11 +1012,8 @@ class ItemFormatter(BaseFormatter):
        return any(key in cls.KEYS_REQUIRING_CACHE for key in format_keys)

    def __init__(self, archive, format):
-        from xxhash import xxh64
-
        static_data = {"archivename": archive.name, "archiveid": archive.fpr} | self.FIXED_KEYS
        super().__init__(format, static_data)
-        self.xxh64 = xxh64
        self.archive = archive
        # track which keys were requested in the format string
        self.format_keys = {f[1] for f in Formatter().parse(format)}
@ -1104,9 +1100,7 @@ class ItemFormatter(BaseFormatter):
    def hash_item(self, hash_function, item):
        if "chunks" not in item:
            return ""
-        if hash_function == "xxh64":
-            hash = self.xxh64()
-        elif hash_function in self.hash_algorithms:
+        if hash_function in self.hash_algorithms:
            hash = hashlib.new(hash_function)
        for data in self.archive.pipeline.fetch_many(item.chunks, ro_type=ROBJ_FILE_STREAM):
            hash.update(data)
--- a/src/borg/storelocking.py
+++ b/src/borg/storelocking.py
@ -1,10 +1,9 @@
 import datetime
+import hashlib
 import json
 import random
 import time

-from xxhash import xxh64
-
 from borgstore.store import ObjectNotFound

 from . import platform
@ -101,7 +100,7 @@ class Lock:
        timestamp = now.isoformat(timespec="milliseconds")
        lock = dict(exclusive=exclusive, hostid=self.id[0], processid=self.id[1], threadid=self.id[2], time=timestamp)
        value = json.dumps(lock).encode("utf-8")
-        key = xxh64(value).hexdigest()
+        key = hashlib.sha256(value).hexdigest()
        logger.debug(f"LOCK-CREATE: creating lock in store. key: {key}, lock: {lock}.")
        self.store.store(f"locks/{key}", value)
        if update_last_refresh:
--- a/src/borg/testsuite/archiver/benchmark_cmd_test.py
+++ b/src/borg/testsuite/archiver/benchmark_cmd_test.py
@ -50,7 +50,6 @@ def test_benchmark_cpu(archiver, monkeypatch):
    output = cmd(archiver, "benchmark", "cpu")
    # verify all section headers appear in the plain-text output
    assert "Chunkers" in output
-    assert "Non-cryptographic checksums / hashes" in output
    assert "Cryptographic hashes / MACs" in output
    assert "Encryption" in output
    assert "Compression" in output
@ -63,7 +62,7 @@ def test_benchmark_cpu_json(archiver, monkeypatch):
    result = json.loads(output)
    assert isinstance(result, dict)
    # categories with "size" field (bytes)
-    for category in ["chunkers", "checksums", "hashes", "encryption"]:
+    for category in ["chunkers", "hashes", "encryption"]:
        assert isinstance(result[category], list)
        assert len(result[category]) > 0
        for entry in result[category]: