diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py index b81b763f6..1eb6bdba9 100644 --- a/src/borg/archiver/benchmark_cmd.py +++ b/src/borg/archiver/benchmark_cmd.py @@ -213,22 +213,6 @@ class BenchmarkMixIn: else: print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s") - from xxhash import xxh64 - from zlib import crc32 - - if not args.json: - print("Non-cryptographic checksums / hashes ===========================") - else: - result["checksums"] = [] - size = 1000000000 - tests = [("xxh64", lambda: xxh64(random_10M).digest()), ("crc32 (zlib)", lambda: crc32(random_10M))] - for spec, func in tests: - dt = timeit(func, number=number_default) - if args.json: - result["checksums"].append({"algo": spec, "size": size, "time": dt}) - else: - print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s") - from ..crypto.low_level import hmac_sha256, blake2b_256 if not args.json: diff --git a/src/borg/archiver/check_cmd.py b/src/borg/archiver/check_cmd.py index 83d1f6e29..353306bb9 100644 --- a/src/borg/archiver/check_cmd.py +++ b/src/borg/archiver/check_cmd.py @@ -40,7 +40,7 @@ class CheckMixIn: if args.repair and args.max_duration: raise CommandError("--repair does not allow --max-duration argument.") if args.max_duration and not args.repo_only: - # when doing a partial repo check, we can only check xxh64 hashes in repository files. + # when doing a partial repo check, we can only do a low-level check of the repository files. # archives check requires that a full repo check was done before and has built/cached a ChunkIndex. # also, there is no max_duration support in the archives check code anyway. raise CommandError("--repository-only is required for --max-duration support.") @@ -123,13 +123,12 @@ class CheckMixIn: for the same reason. Therefore, partial checks may be useful only with very large repositories where a full check would take too long. - The ``--verify-data`` option will perform a full integrity verification (as - opposed to checking just the xxh64) of data, which means reading the - data from the repository, decrypting and decompressing it. It is a complete - cryptographic verification and hence very time-consuming, but will detect any - accidental and malicious corruption. Tamper-resistance is only guaranteed for - encrypted repositories against attackers without access to the keys. You cannot - use ``--verify-data`` with ``--repository-only``. + The ``--verify-data`` option will perform a full integrity verification of data, + which means reading the data from the repository, decrypting and decompressing it. + It is a complete cryptographic verification and hence very time-consuming, but + will detect any accidental and malicious corruption. Tamper-resistance is only + guaranteed for encrypted repositories against attackers without access to the keys. + You cannot use ``--verify-data`` with ``--repository-only``. The ``--find-lost-archives`` option will also scan the whole repository, but tells Borg to search for lost archive metadata. If Borg encounters any archive diff --git a/src/borg/cache.py b/src/borg/cache.py index 464311d24..c9b5793db 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -1,4 +1,5 @@ import configparser +import hashlib import io import os import shutil @@ -8,8 +9,6 @@ from datetime import datetime, timezone, timedelta from pathlib import Path from time import perf_counter -from xxhash import xxh64 - from borgstore.backends.errors import PermissionDenied from .logger import create_logger @@ -51,7 +50,7 @@ def files_cache_name(archive_name, files_cache_name="files"): # when not, the user may manually do that by using the env var. if not suffix: # avoid issues with too complex or long archive_name by hashing it: - suffix = xxh64(archive_name.encode()).hexdigest() + suffix = hashlib.sha256(archive_name.encode()).hexdigest() return files_cache_name + "." + suffix @@ -539,7 +538,7 @@ def delete_chunkindex_cache(repository): logger.debug(f"cached chunk indexes deleted: {hashes}") -CHUNKINDEX_HASH_SEED = 3 +CHUNKINDEX_HASH_SEED = b"0001" # increment seed to invalidate old chunk indexes def write_chunkindex_to_repo_cache( @@ -564,11 +563,11 @@ def write_chunkindex_to_repo_cache( if clear: # if we don't need the in-memory chunks index anymore: chunks.clear() # free memory, immediately - new_hash = xxh64(data, seed=CHUNKINDEX_HASH_SEED).hexdigest() + new_hash = hashlib.sha256(data + CHUNKINDEX_HASH_SEED).hexdigest() cached_hashes = list_chunkindex_hashes(repository) if force_write or new_hash not in cached_hashes: # when an updated chunks index is stored into the cache, we also store its hash as part of the name. - # when a client is loading the chunks index from a cache, it has to compare its xxh64 + # when a client is loading the chunks index from a cache, it has to compare its content # hash against the hash in its name. if it is the same, the cache is valid. # if it is different, the cache is either corrupted or out of date and has to be discarded. # when some functionality is DELETING chunks from the repository, it has to delete @@ -605,7 +604,7 @@ def read_chunkindex_from_repo_cache(repository, hash): except StoreObjectNotFound: logger.debug(f"{cache_name} not found in the repository.") else: - if xxh64(chunks_data, seed=CHUNKINDEX_HASH_SEED).digest() == hex_to_bin(hash): + if hashlib.sha256(chunks_data + CHUNKINDEX_HASH_SEED).digest() == hex_to_bin(hash): logger.debug(f"{cache_name} is valid.") with io.BytesIO(chunks_data) as f: chunks = ChunkIndex.read(f) diff --git a/src/borg/crypto/file_integrity.py b/src/borg/crypto/file_integrity.py index e5310a136..701c7f9a2 100644 --- a/src/borg/crypto/file_integrity.py +++ b/src/borg/crypto/file_integrity.py @@ -5,8 +5,6 @@ from hmac import compare_digest from collections.abc import Callable from pathlib import Path -from xxhash import xxh64 - from ..helpers import IntegrityError from ..logger import create_logger @@ -106,20 +104,12 @@ class FileHashingWrapper(FileLikeWrapper): self.hash.update(str(self.tell()).encode()) -class SHA512FileHashingWrapper(FileHashingWrapper): - ALGORITHM = "SHA512" - FACTORY = hashlib.sha512 +class SHA256FileHashingWrapper(FileHashingWrapper): + ALGORITHM = "SHA256" + FACTORY = hashlib.sha256 -class XXH64FileHashingWrapper(FileHashingWrapper): - ALGORITHM = "XXH64" - FACTORY = xxh64 - - -SUPPORTED_ALGORITHMS = { - SHA512FileHashingWrapper.ALGORITHM: SHA512FileHashingWrapper, - XXH64FileHashingWrapper.ALGORITHM: XXH64FileHashingWrapper, -} +SUPPORTED_ALGORITHMS = {SHA256FileHashingWrapper.ALGORITHM: SHA256FileHashingWrapper} class FileIntegrityError(IntegrityError): @@ -137,7 +127,7 @@ class IntegrityCheckedFile(FileLikeWrapper): self.file_opened = override_fd is None self.digests = {} - hash_cls = XXH64FileHashingWrapper + hash_cls = SHA256FileHashingWrapper if not write: algorithm_and_digests = self.load_integrity_data(path, integrity_data) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 38762a485..9f6d8445f 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -968,9 +968,9 @@ class ArchiveFormatter(BaseFormatter): class ItemFormatter(BaseFormatter): - # we provide the hash algos from python stdlib (except shake_*) and additionally xxh64. + # we provide the hash algos from python stdlib (except shake_*). # shake_* is not provided because it uses an incompatible .digest() method to support variable length. - hash_algorithms = set(hashlib.algorithms_guaranteed).union({"xxh64"}).difference({"shake_128", "shake_256"}) + hash_algorithms = set(hashlib.algorithms_guaranteed).difference({"shake_128", "shake_256"}) KEY_DESCRIPTIONS = { "type": "file type (file, dir, symlink, ...)", "mode": "file mode (as in stat)", @@ -992,7 +992,6 @@ class ItemFormatter(BaseFormatter): "isomtime": "file modification time (ISO 8601 format)", "isoctime": "file change time (ISO 8601 format)", "isoatime": "file access time (ISO 8601 format)", - "xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)", "fingerprint": "Fingerprint of the file content (may have false negatives), format: H(conditions)-H(chunk_ids)", "archiveid": "internal ID of the archive", "archivename": "name of the archive", @@ -1013,11 +1012,8 @@ class ItemFormatter(BaseFormatter): return any(key in cls.KEYS_REQUIRING_CACHE for key in format_keys) def __init__(self, archive, format): - from xxhash import xxh64 - static_data = {"archivename": archive.name, "archiveid": archive.fpr} | self.FIXED_KEYS super().__init__(format, static_data) - self.xxh64 = xxh64 self.archive = archive # track which keys were requested in the format string self.format_keys = {f[1] for f in Formatter().parse(format)} @@ -1104,9 +1100,7 @@ class ItemFormatter(BaseFormatter): def hash_item(self, hash_function, item): if "chunks" not in item: return "" - if hash_function == "xxh64": - hash = self.xxh64() - elif hash_function in self.hash_algorithms: + if hash_function in self.hash_algorithms: hash = hashlib.new(hash_function) for data in self.archive.pipeline.fetch_many(item.chunks, ro_type=ROBJ_FILE_STREAM): hash.update(data) diff --git a/src/borg/storelocking.py b/src/borg/storelocking.py index ce30bdbe6..6417c0981 100644 --- a/src/borg/storelocking.py +++ b/src/borg/storelocking.py @@ -1,10 +1,9 @@ import datetime +import hashlib import json import random import time -from xxhash import xxh64 - from borgstore.store import ObjectNotFound from . import platform @@ -101,7 +100,7 @@ class Lock: timestamp = now.isoformat(timespec="milliseconds") lock = dict(exclusive=exclusive, hostid=self.id[0], processid=self.id[1], threadid=self.id[2], time=timestamp) value = json.dumps(lock).encode("utf-8") - key = xxh64(value).hexdigest() + key = hashlib.sha256(value).hexdigest() logger.debug(f"LOCK-CREATE: creating lock in store. key: {key}, lock: {lock}.") self.store.store(f"locks/{key}", value) if update_last_refresh: diff --git a/src/borg/testsuite/archiver/benchmark_cmd_test.py b/src/borg/testsuite/archiver/benchmark_cmd_test.py index 5d01b1cbd..5e0375fa2 100644 --- a/src/borg/testsuite/archiver/benchmark_cmd_test.py +++ b/src/borg/testsuite/archiver/benchmark_cmd_test.py @@ -50,7 +50,6 @@ def test_benchmark_cpu(archiver, monkeypatch): output = cmd(archiver, "benchmark", "cpu") # verify all section headers appear in the plain-text output assert "Chunkers" in output - assert "Non-cryptographic checksums / hashes" in output assert "Cryptographic hashes / MACs" in output assert "Encryption" in output assert "Compression" in output @@ -63,7 +62,7 @@ def test_benchmark_cpu_json(archiver, monkeypatch): result = json.loads(output) assert isinstance(result, dict) # categories with "size" field (bytes) - for category in ["chunkers", "checksums", "hashes", "encryption"]: + for category in ["chunkers", "hashes", "encryption"]: assert isinstance(result[category], list) assert len(result[category]) > 0 for entry in result[category]: