Merge pull request #9672 from ThomasWaldmann/less-xxh64

Use less xxh64, add SHA256FileHashingWrapper
This commit is contained in:
TW 2026-05-28 20:37:48 +02:00 committed by GitHub
commit 33afaa1f3c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 24 additions and 60 deletions

View file

@ -213,22 +213,6 @@ class BenchmarkMixIn:
else:
print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s")
from xxhash import xxh64
from zlib import crc32
if not args.json:
print("Non-cryptographic checksums / hashes ===========================")
else:
result["checksums"] = []
size = 1000000000
tests = [("xxh64", lambda: xxh64(random_10M).digest()), ("crc32 (zlib)", lambda: crc32(random_10M))]
for spec, func in tests:
dt = timeit(func, number=number_default)
if args.json:
result["checksums"].append({"algo": spec, "size": size, "time": dt})
else:
print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s")
from ..crypto.low_level import hmac_sha256, blake2b_256
if not args.json:

View file

@ -40,7 +40,7 @@ class CheckMixIn:
if args.repair and args.max_duration:
raise CommandError("--repair does not allow --max-duration argument.")
if args.max_duration and not args.repo_only:
# when doing a partial repo check, we can only check xxh64 hashes in repository files.
# when doing a partial repo check, we can only do a low-level check of the repository files.
# archives check requires that a full repo check was done before and has built/cached a ChunkIndex.
# also, there is no max_duration support in the archives check code anyway.
raise CommandError("--repository-only is required for --max-duration support.")
@ -123,13 +123,12 @@ class CheckMixIn:
for the same reason. Therefore, partial checks may be useful only with very large
repositories where a full check would take too long.
The ``--verify-data`` option will perform a full integrity verification (as
opposed to checking just the xxh64) of data, which means reading the
data from the repository, decrypting and decompressing it. It is a complete
cryptographic verification and hence very time-consuming, but will detect any
accidental and malicious corruption. Tamper-resistance is only guaranteed for
encrypted repositories against attackers without access to the keys. You cannot
use ``--verify-data`` with ``--repository-only``.
The ``--verify-data`` option will perform a full integrity verification of data,
which means reading the data from the repository, decrypting and decompressing it.
It is a complete cryptographic verification and hence very time-consuming, but
will detect any accidental and malicious corruption. Tamper-resistance is only
guaranteed for encrypted repositories against attackers without access to the keys.
You cannot use ``--verify-data`` with ``--repository-only``.
The ``--find-lost-archives`` option will also scan the whole repository, but
tells Borg to search for lost archive metadata. If Borg encounters any archive

View file

@ -1,4 +1,5 @@
import configparser
import hashlib
import io
import os
import shutil
@ -8,8 +9,6 @@ from datetime import datetime, timezone, timedelta
from pathlib import Path
from time import perf_counter
from xxhash import xxh64
from borgstore.backends.errors import PermissionDenied
from .logger import create_logger
@ -51,7 +50,7 @@ def files_cache_name(archive_name, files_cache_name="files"):
# when not, the user may manually do that by using the env var.
if not suffix:
# avoid issues with too complex or long archive_name by hashing it:
suffix = xxh64(archive_name.encode()).hexdigest()
suffix = hashlib.sha256(archive_name.encode()).hexdigest()
return files_cache_name + "." + suffix
@ -539,7 +538,7 @@ def delete_chunkindex_cache(repository):
logger.debug(f"cached chunk indexes deleted: {hashes}")
CHUNKINDEX_HASH_SEED = 3
CHUNKINDEX_HASH_SEED = b"0001" # increment seed to invalidate old chunk indexes
def write_chunkindex_to_repo_cache(
@ -564,11 +563,11 @@ def write_chunkindex_to_repo_cache(
if clear:
# if we don't need the in-memory chunks index anymore:
chunks.clear() # free memory, immediately
new_hash = xxh64(data, seed=CHUNKINDEX_HASH_SEED).hexdigest()
new_hash = hashlib.sha256(data + CHUNKINDEX_HASH_SEED).hexdigest()
cached_hashes = list_chunkindex_hashes(repository)
if force_write or new_hash not in cached_hashes:
# when an updated chunks index is stored into the cache, we also store its hash as part of the name.
# when a client is loading the chunks index from a cache, it has to compare its xxh64
# when a client is loading the chunks index from a cache, it has to compare its content
# hash against the hash in its name. if it is the same, the cache is valid.
# if it is different, the cache is either corrupted or out of date and has to be discarded.
# when some functionality is DELETING chunks from the repository, it has to delete
@ -605,7 +604,7 @@ def read_chunkindex_from_repo_cache(repository, hash):
except StoreObjectNotFound:
logger.debug(f"{cache_name} not found in the repository.")
else:
if xxh64(chunks_data, seed=CHUNKINDEX_HASH_SEED).digest() == hex_to_bin(hash):
if hashlib.sha256(chunks_data + CHUNKINDEX_HASH_SEED).digest() == hex_to_bin(hash):
logger.debug(f"{cache_name} is valid.")
with io.BytesIO(chunks_data) as f:
chunks = ChunkIndex.read(f)

View file

@ -5,8 +5,6 @@ from hmac import compare_digest
from collections.abc import Callable
from pathlib import Path
from xxhash import xxh64
from ..helpers import IntegrityError
from ..logger import create_logger
@ -106,20 +104,12 @@ class FileHashingWrapper(FileLikeWrapper):
self.hash.update(str(self.tell()).encode())
class SHA512FileHashingWrapper(FileHashingWrapper):
ALGORITHM = "SHA512"
FACTORY = hashlib.sha512
class SHA256FileHashingWrapper(FileHashingWrapper):
ALGORITHM = "SHA256"
FACTORY = hashlib.sha256
class XXH64FileHashingWrapper(FileHashingWrapper):
ALGORITHM = "XXH64"
FACTORY = xxh64
SUPPORTED_ALGORITHMS = {
SHA512FileHashingWrapper.ALGORITHM: SHA512FileHashingWrapper,
XXH64FileHashingWrapper.ALGORITHM: XXH64FileHashingWrapper,
}
SUPPORTED_ALGORITHMS = {SHA256FileHashingWrapper.ALGORITHM: SHA256FileHashingWrapper}
class FileIntegrityError(IntegrityError):
@ -137,7 +127,7 @@ class IntegrityCheckedFile(FileLikeWrapper):
self.file_opened = override_fd is None
self.digests = {}
hash_cls = XXH64FileHashingWrapper
hash_cls = SHA256FileHashingWrapper
if not write:
algorithm_and_digests = self.load_integrity_data(path, integrity_data)

View file

@ -968,9 +968,9 @@ class ArchiveFormatter(BaseFormatter):
class ItemFormatter(BaseFormatter):
# we provide the hash algos from python stdlib (except shake_*) and additionally xxh64.
# we provide the hash algos from python stdlib (except shake_*).
# shake_* is not provided because it uses an incompatible .digest() method to support variable length.
hash_algorithms = set(hashlib.algorithms_guaranteed).union({"xxh64"}).difference({"shake_128", "shake_256"})
hash_algorithms = set(hashlib.algorithms_guaranteed).difference({"shake_128", "shake_256"})
KEY_DESCRIPTIONS = {
"type": "file type (file, dir, symlink, ...)",
"mode": "file mode (as in stat)",
@ -992,7 +992,6 @@ class ItemFormatter(BaseFormatter):
"isomtime": "file modification time (ISO 8601 format)",
"isoctime": "file change time (ISO 8601 format)",
"isoatime": "file access time (ISO 8601 format)",
"xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
"fingerprint": "Fingerprint of the file content (may have false negatives), format: H(conditions)-H(chunk_ids)",
"archiveid": "internal ID of the archive",
"archivename": "name of the archive",
@ -1013,11 +1012,8 @@ class ItemFormatter(BaseFormatter):
return any(key in cls.KEYS_REQUIRING_CACHE for key in format_keys)
def __init__(self, archive, format):
from xxhash import xxh64
static_data = {"archivename": archive.name, "archiveid": archive.fpr} | self.FIXED_KEYS
super().__init__(format, static_data)
self.xxh64 = xxh64
self.archive = archive
# track which keys were requested in the format string
self.format_keys = {f[1] for f in Formatter().parse(format)}
@ -1104,9 +1100,7 @@ class ItemFormatter(BaseFormatter):
def hash_item(self, hash_function, item):
if "chunks" not in item:
return ""
if hash_function == "xxh64":
hash = self.xxh64()
elif hash_function in self.hash_algorithms:
if hash_function in self.hash_algorithms:
hash = hashlib.new(hash_function)
for data in self.archive.pipeline.fetch_many(item.chunks, ro_type=ROBJ_FILE_STREAM):
hash.update(data)

View file

@ -1,10 +1,9 @@
import datetime
import hashlib
import json
import random
import time
from xxhash import xxh64
from borgstore.store import ObjectNotFound
from . import platform
@ -101,7 +100,7 @@ class Lock:
timestamp = now.isoformat(timespec="milliseconds")
lock = dict(exclusive=exclusive, hostid=self.id[0], processid=self.id[1], threadid=self.id[2], time=timestamp)
value = json.dumps(lock).encode("utf-8")
key = xxh64(value).hexdigest()
key = hashlib.sha256(value).hexdigest()
logger.debug(f"LOCK-CREATE: creating lock in store. key: {key}, lock: {lock}.")
self.store.store(f"locks/{key}", value)
if update_last_refresh:

View file

@ -50,7 +50,6 @@ def test_benchmark_cpu(archiver, monkeypatch):
output = cmd(archiver, "benchmark", "cpu")
# verify all section headers appear in the plain-text output
assert "Chunkers" in output
assert "Non-cryptographic checksums / hashes" in output
assert "Cryptographic hashes / MACs" in output
assert "Encryption" in output
assert "Compression" in output
@ -63,7 +62,7 @@ def test_benchmark_cpu_json(archiver, monkeypatch):
result = json.loads(output)
assert isinstance(result, dict)
# categories with "size" field (bytes)
for category in ["chunkers", "checksums", "hashes", "encryption"]:
for category in ["chunkers", "hashes", "encryption"]:
assert isinstance(result[category], list)
assert len(result[category]) > 0
for entry in result[category]: