mirror of
https://github.com/borgbackup/borg.git
synced 2026-06-10 09:21:44 -04:00
Merge pull request #9672 from ThomasWaldmann/less-xxh64
Use less xxh64, add SHA256FileHashingWrapper
This commit is contained in:
commit
33afaa1f3c
7 changed files with 24 additions and 60 deletions
|
|
@ -213,22 +213,6 @@ class BenchmarkMixIn:
|
|||
else:
|
||||
print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s")
|
||||
|
||||
from xxhash import xxh64
|
||||
from zlib import crc32
|
||||
|
||||
if not args.json:
|
||||
print("Non-cryptographic checksums / hashes ===========================")
|
||||
else:
|
||||
result["checksums"] = []
|
||||
size = 1000000000
|
||||
tests = [("xxh64", lambda: xxh64(random_10M).digest()), ("crc32 (zlib)", lambda: crc32(random_10M))]
|
||||
for spec, func in tests:
|
||||
dt = timeit(func, number=number_default)
|
||||
if args.json:
|
||||
result["checksums"].append({"algo": spec, "size": size, "time": dt})
|
||||
else:
|
||||
print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s")
|
||||
|
||||
from ..crypto.low_level import hmac_sha256, blake2b_256
|
||||
|
||||
if not args.json:
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ class CheckMixIn:
|
|||
if args.repair and args.max_duration:
|
||||
raise CommandError("--repair does not allow --max-duration argument.")
|
||||
if args.max_duration and not args.repo_only:
|
||||
# when doing a partial repo check, we can only check xxh64 hashes in repository files.
|
||||
# when doing a partial repo check, we can only do a low-level check of the repository files.
|
||||
# archives check requires that a full repo check was done before and has built/cached a ChunkIndex.
|
||||
# also, there is no max_duration support in the archives check code anyway.
|
||||
raise CommandError("--repository-only is required for --max-duration support.")
|
||||
|
|
@ -123,13 +123,12 @@ class CheckMixIn:
|
|||
for the same reason. Therefore, partial checks may be useful only with very large
|
||||
repositories where a full check would take too long.
|
||||
|
||||
The ``--verify-data`` option will perform a full integrity verification (as
|
||||
opposed to checking just the xxh64) of data, which means reading the
|
||||
data from the repository, decrypting and decompressing it. It is a complete
|
||||
cryptographic verification and hence very time-consuming, but will detect any
|
||||
accidental and malicious corruption. Tamper-resistance is only guaranteed for
|
||||
encrypted repositories against attackers without access to the keys. You cannot
|
||||
use ``--verify-data`` with ``--repository-only``.
|
||||
The ``--verify-data`` option will perform a full integrity verification of data,
|
||||
which means reading the data from the repository, decrypting and decompressing it.
|
||||
It is a complete cryptographic verification and hence very time-consuming, but
|
||||
will detect any accidental and malicious corruption. Tamper-resistance is only
|
||||
guaranteed for encrypted repositories against attackers without access to the keys.
|
||||
You cannot use ``--verify-data`` with ``--repository-only``.
|
||||
|
||||
The ``--find-lost-archives`` option will also scan the whole repository, but
|
||||
tells Borg to search for lost archive metadata. If Borg encounters any archive
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import configparser
|
||||
import hashlib
|
||||
import io
|
||||
import os
|
||||
import shutil
|
||||
|
|
@ -8,8 +9,6 @@ from datetime import datetime, timezone, timedelta
|
|||
from pathlib import Path
|
||||
from time import perf_counter
|
||||
|
||||
from xxhash import xxh64
|
||||
|
||||
from borgstore.backends.errors import PermissionDenied
|
||||
|
||||
from .logger import create_logger
|
||||
|
|
@ -51,7 +50,7 @@ def files_cache_name(archive_name, files_cache_name="files"):
|
|||
# when not, the user may manually do that by using the env var.
|
||||
if not suffix:
|
||||
# avoid issues with too complex or long archive_name by hashing it:
|
||||
suffix = xxh64(archive_name.encode()).hexdigest()
|
||||
suffix = hashlib.sha256(archive_name.encode()).hexdigest()
|
||||
return files_cache_name + "." + suffix
|
||||
|
||||
|
||||
|
|
@ -539,7 +538,7 @@ def delete_chunkindex_cache(repository):
|
|||
logger.debug(f"cached chunk indexes deleted: {hashes}")
|
||||
|
||||
|
||||
CHUNKINDEX_HASH_SEED = 3
|
||||
CHUNKINDEX_HASH_SEED = b"0001" # increment seed to invalidate old chunk indexes
|
||||
|
||||
|
||||
def write_chunkindex_to_repo_cache(
|
||||
|
|
@ -564,11 +563,11 @@ def write_chunkindex_to_repo_cache(
|
|||
if clear:
|
||||
# if we don't need the in-memory chunks index anymore:
|
||||
chunks.clear() # free memory, immediately
|
||||
new_hash = xxh64(data, seed=CHUNKINDEX_HASH_SEED).hexdigest()
|
||||
new_hash = hashlib.sha256(data + CHUNKINDEX_HASH_SEED).hexdigest()
|
||||
cached_hashes = list_chunkindex_hashes(repository)
|
||||
if force_write or new_hash not in cached_hashes:
|
||||
# when an updated chunks index is stored into the cache, we also store its hash as part of the name.
|
||||
# when a client is loading the chunks index from a cache, it has to compare its xxh64
|
||||
# when a client is loading the chunks index from a cache, it has to compare its content
|
||||
# hash against the hash in its name. if it is the same, the cache is valid.
|
||||
# if it is different, the cache is either corrupted or out of date and has to be discarded.
|
||||
# when some functionality is DELETING chunks from the repository, it has to delete
|
||||
|
|
@ -605,7 +604,7 @@ def read_chunkindex_from_repo_cache(repository, hash):
|
|||
except StoreObjectNotFound:
|
||||
logger.debug(f"{cache_name} not found in the repository.")
|
||||
else:
|
||||
if xxh64(chunks_data, seed=CHUNKINDEX_HASH_SEED).digest() == hex_to_bin(hash):
|
||||
if hashlib.sha256(chunks_data + CHUNKINDEX_HASH_SEED).digest() == hex_to_bin(hash):
|
||||
logger.debug(f"{cache_name} is valid.")
|
||||
with io.BytesIO(chunks_data) as f:
|
||||
chunks = ChunkIndex.read(f)
|
||||
|
|
|
|||
|
|
@ -5,8 +5,6 @@ from hmac import compare_digest
|
|||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
from xxhash import xxh64
|
||||
|
||||
from ..helpers import IntegrityError
|
||||
from ..logger import create_logger
|
||||
|
||||
|
|
@ -106,20 +104,12 @@ class FileHashingWrapper(FileLikeWrapper):
|
|||
self.hash.update(str(self.tell()).encode())
|
||||
|
||||
|
||||
class SHA512FileHashingWrapper(FileHashingWrapper):
|
||||
ALGORITHM = "SHA512"
|
||||
FACTORY = hashlib.sha512
|
||||
class SHA256FileHashingWrapper(FileHashingWrapper):
|
||||
ALGORITHM = "SHA256"
|
||||
FACTORY = hashlib.sha256
|
||||
|
||||
|
||||
class XXH64FileHashingWrapper(FileHashingWrapper):
|
||||
ALGORITHM = "XXH64"
|
||||
FACTORY = xxh64
|
||||
|
||||
|
||||
SUPPORTED_ALGORITHMS = {
|
||||
SHA512FileHashingWrapper.ALGORITHM: SHA512FileHashingWrapper,
|
||||
XXH64FileHashingWrapper.ALGORITHM: XXH64FileHashingWrapper,
|
||||
}
|
||||
SUPPORTED_ALGORITHMS = {SHA256FileHashingWrapper.ALGORITHM: SHA256FileHashingWrapper}
|
||||
|
||||
|
||||
class FileIntegrityError(IntegrityError):
|
||||
|
|
@ -137,7 +127,7 @@ class IntegrityCheckedFile(FileLikeWrapper):
|
|||
self.file_opened = override_fd is None
|
||||
self.digests = {}
|
||||
|
||||
hash_cls = XXH64FileHashingWrapper
|
||||
hash_cls = SHA256FileHashingWrapper
|
||||
|
||||
if not write:
|
||||
algorithm_and_digests = self.load_integrity_data(path, integrity_data)
|
||||
|
|
|
|||
|
|
@ -968,9 +968,9 @@ class ArchiveFormatter(BaseFormatter):
|
|||
|
||||
|
||||
class ItemFormatter(BaseFormatter):
|
||||
# we provide the hash algos from python stdlib (except shake_*) and additionally xxh64.
|
||||
# we provide the hash algos from python stdlib (except shake_*).
|
||||
# shake_* is not provided because it uses an incompatible .digest() method to support variable length.
|
||||
hash_algorithms = set(hashlib.algorithms_guaranteed).union({"xxh64"}).difference({"shake_128", "shake_256"})
|
||||
hash_algorithms = set(hashlib.algorithms_guaranteed).difference({"shake_128", "shake_256"})
|
||||
KEY_DESCRIPTIONS = {
|
||||
"type": "file type (file, dir, symlink, ...)",
|
||||
"mode": "file mode (as in stat)",
|
||||
|
|
@ -992,7 +992,6 @@ class ItemFormatter(BaseFormatter):
|
|||
"isomtime": "file modification time (ISO 8601 format)",
|
||||
"isoctime": "file change time (ISO 8601 format)",
|
||||
"isoatime": "file access time (ISO 8601 format)",
|
||||
"xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
|
||||
"fingerprint": "Fingerprint of the file content (may have false negatives), format: H(conditions)-H(chunk_ids)",
|
||||
"archiveid": "internal ID of the archive",
|
||||
"archivename": "name of the archive",
|
||||
|
|
@ -1013,11 +1012,8 @@ class ItemFormatter(BaseFormatter):
|
|||
return any(key in cls.KEYS_REQUIRING_CACHE for key in format_keys)
|
||||
|
||||
def __init__(self, archive, format):
|
||||
from xxhash import xxh64
|
||||
|
||||
static_data = {"archivename": archive.name, "archiveid": archive.fpr} | self.FIXED_KEYS
|
||||
super().__init__(format, static_data)
|
||||
self.xxh64 = xxh64
|
||||
self.archive = archive
|
||||
# track which keys were requested in the format string
|
||||
self.format_keys = {f[1] for f in Formatter().parse(format)}
|
||||
|
|
@ -1104,9 +1100,7 @@ class ItemFormatter(BaseFormatter):
|
|||
def hash_item(self, hash_function, item):
|
||||
if "chunks" not in item:
|
||||
return ""
|
||||
if hash_function == "xxh64":
|
||||
hash = self.xxh64()
|
||||
elif hash_function in self.hash_algorithms:
|
||||
if hash_function in self.hash_algorithms:
|
||||
hash = hashlib.new(hash_function)
|
||||
for data in self.archive.pipeline.fetch_many(item.chunks, ro_type=ROBJ_FILE_STREAM):
|
||||
hash.update(data)
|
||||
|
|
|
|||
|
|
@ -1,10 +1,9 @@
|
|||
import datetime
|
||||
import hashlib
|
||||
import json
|
||||
import random
|
||||
import time
|
||||
|
||||
from xxhash import xxh64
|
||||
|
||||
from borgstore.store import ObjectNotFound
|
||||
|
||||
from . import platform
|
||||
|
|
@ -101,7 +100,7 @@ class Lock:
|
|||
timestamp = now.isoformat(timespec="milliseconds")
|
||||
lock = dict(exclusive=exclusive, hostid=self.id[0], processid=self.id[1], threadid=self.id[2], time=timestamp)
|
||||
value = json.dumps(lock).encode("utf-8")
|
||||
key = xxh64(value).hexdigest()
|
||||
key = hashlib.sha256(value).hexdigest()
|
||||
logger.debug(f"LOCK-CREATE: creating lock in store. key: {key}, lock: {lock}.")
|
||||
self.store.store(f"locks/{key}", value)
|
||||
if update_last_refresh:
|
||||
|
|
|
|||
|
|
@ -50,7 +50,6 @@ def test_benchmark_cpu(archiver, monkeypatch):
|
|||
output = cmd(archiver, "benchmark", "cpu")
|
||||
# verify all section headers appear in the plain-text output
|
||||
assert "Chunkers" in output
|
||||
assert "Non-cryptographic checksums / hashes" in output
|
||||
assert "Cryptographic hashes / MACs" in output
|
||||
assert "Encryption" in output
|
||||
assert "Compression" in output
|
||||
|
|
@ -63,7 +62,7 @@ def test_benchmark_cpu_json(archiver, monkeypatch):
|
|||
result = json.loads(output)
|
||||
assert isinstance(result, dict)
|
||||
# categories with "size" field (bytes)
|
||||
for category in ["chunkers", "checksums", "hashes", "encryption"]:
|
||||
for category in ["chunkers", "hashes", "encryption"]:
|
||||
assert isinstance(result[category], list)
|
||||
assert len(result[category]) > 0
|
||||
for entry in result[category]:
|
||||
|
|
|
|||
Loading…
Reference in a new issue