mirror of
https://github.com/borgbackup/borg.git
synced 2026-06-11 01:41:57 -04:00
list --format: add fingerprint placeholder
This allows users to compare file content efficiently without reading the full file data, by exposing a hash of the chunk IDs together with a hash of the conditions that must match for a comparison to be valid (chunker params, chunker seed/key, id key, key type, etc.).

This is based on PR #5167 by @hrehfeld (code + discussion), with some changes:

- the conditions hash now includes more relevant input params
- a single value that is composed of 2 parts is returned
- tests (including the new buzhash64)

Example output (different files in the same archive):

1e88bfb02d0a5320-a539587200c33b857f9827d01fcb7dabacf30501c83929e7308668d43f4a6302 file1
1e88bfb02d0a5320-9ed78a4c14d0506d9ae75d914cca90db64655ddea22647dd1c89f19e2fc080ae file2

The fingerprint has 2 parts:

First part: same hash, which indicates the same chunking / chunk ID generation params, meaning that the second parts are valid to compare.
Second part: different hash, because the file content is different. The same hash here would mean the same content.
This commit is contained in:
parent
d680ee0feb
commit
baed0bc5d0
2 changed files with 81 additions and 1 deletions
|
|
@ -15,6 +15,7 @@ from typing import ClassVar, Any, TYPE_CHECKING, Literal
|
|||
from collections import OrderedDict
|
||||
from datetime import datetime, timezone
|
||||
from functools import partial
|
||||
from hashlib import sha256
|
||||
from string import Formatter
|
||||
|
||||
from ..logger import create_logger
|
||||
|
|
@ -876,6 +877,7 @@ class ItemFormatter(BaseFormatter):
|
|||
"isoctime": "file change time (ISO 8601 format)",
|
||||
"isoatime": "file access time (ISO 8601 format)",
|
||||
"xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
|
||||
"fingerprint": "Fingerprint of the file content (may have false negatives), format: H(conditions)-H(chunk_ids)",
|
||||
"archiveid": "internal ID of the archive",
|
||||
"archivename": "name of the archive",
|
||||
}
|
||||
|
|
@ -883,7 +885,7 @@ class ItemFormatter(BaseFormatter):
|
|||
("type", "mode", "uid", "gid", "user", "group", "path", "target", "hlid", "inode", "flags"),
|
||||
("size", "num_chunks"),
|
||||
("mtime", "ctime", "atime", "isomtime", "isoctime", "isoatime"),
|
||||
tuple(sorted(hash_algorithms)),
|
||||
tuple(["fingerprint"] + sorted(hash_algorithms)),
|
||||
("archiveid", "archivename", "extra"),
|
||||
)
|
||||
|
||||
|
|
@ -903,6 +905,15 @@ class ItemFormatter(BaseFormatter):
|
|||
self.archive = archive
|
||||
# track which keys were requested in the format string
|
||||
self.format_keys = {f[1] for f in Formatter().parse(format)}
|
||||
|
||||
# we want a hash over the conditions that influence the chunk ID list for a given file content:
|
||||
# - the id algorithm and key
|
||||
# - the chunker seed (if any - buzhash64 derives seed from id_key)
|
||||
# - the chunker params
|
||||
key = archive.key
|
||||
conditions = f"{key.TYPE_STR!r}{key.id_key!r}{key.chunk_seed!r}{archive.metadata.get('chunker_params')!r}"
|
||||
self.conditions_hash = sha256(conditions.encode()).hexdigest()
|
||||
|
||||
self.call_keys = {
|
||||
"size": self.calculate_size,
|
||||
"num_chunks": self.calculate_num_chunks,
|
||||
|
|
@ -912,6 +923,7 @@ class ItemFormatter(BaseFormatter):
|
|||
"mtime": partial(self.format_time, "mtime"),
|
||||
"ctime": partial(self.format_time, "ctime"),
|
||||
"atime": partial(self.format_time, "atime"),
|
||||
"fingerprint": self.calculate_fingerprint,
|
||||
}
|
||||
for hash_function in self.hash_algorithms:
|
||||
self.call_keys[hash_function] = partial(self.hash_item, hash_function)
|
||||
|
|
@ -963,6 +975,16 @@ class ItemFormatter(BaseFormatter):
|
|||
# note: does not support hard link slaves, they will be size 0
|
||||
return item.get_size()
|
||||
|
||||
def calculate_fingerprint(self, item):
    """Return a fast content fingerprint for *item*, or "" if it has no chunks.

    The result has the form ``<conditions>-<chunks>``:

    - ``<conditions>`` is the first 16 hex digits (64 bits) of
      ``self.conditions_hash``.
    - ``<chunks>`` is the SHA-256 hex digest of the item's chunk IDs,
      concatenated in order.

    Only chunk IDs are hashed, the file data itself is never read.
    """
    chunk_list = item.get("chunks")
    if chunk_list is None:
        # no chunk list available for this item -> nothing to fingerprint
        return ""
    all_ids = b"".join(entry.id for entry in chunk_list)
    content_part = sha256(all_ids).hexdigest()
    # We do not encounter many different conditions hashes, so the collision
    # probability is low and a 64 bit prefix is enough to tell them apart.
    conditions_part = self.conditions_hash[:16]
    return f"{conditions_part}-{content_part}"
|
||||
|
||||
def hash_item(self, hash_function, item):
|
||||
if "chunks" not in item:
|
||||
return ""
|
||||
|
|
|
|||
|
|
@ -201,3 +201,61 @@ def test_list_inode_hardlinks(archivers, request):
|
|||
assert inodes["input/fileA"] != inodes["input/fileC"]
|
||||
else:
|
||||
pytest.skip("Platform does not provide inode numbers for items")
|
||||
|
||||
|
||||
def _list_fingerprints(archiver, archive_name):
    """Return a ``{path: fingerprint}`` dict for all items listed in *archive_name*."""
    output = cmd(archiver, "list", archive_name, "--format={fingerprint} {path}{NL}")
    fingerprints = {}
    for line in output.splitlines():
        # split only once: the fingerprint contains no spaces, but the path may
        fp, path = line.split(" ", 1)
        fingerprints[path] = fp
    return fingerprints


def test_fingerprint(archivers, request):
    """Check that the {fingerprint} placeholder tracks content and chunking conditions."""
    archiver = request.getfixturevalue(archivers)
    cmd(archiver, "repo-create", RK_ENCRYPTION)
    create_regular_file(archiver.input_path, "file1", contents=b"content")
    create_regular_file(archiver.input_path, "file2", contents=b"other")
    cmd(archiver, "create", "test1", "input")
    fingerprints1 = _list_fingerprints(archiver, "test1")

    # Same content, same chunker params -> same fingerprint
    cmd(archiver, "create", "test2", "input")
    fingerprints2 = _list_fingerprints(archiver, "test2")
    assert fingerprints1 == fingerprints2

    # Modified content -> different fingerprint
    create_regular_file(archiver.input_path, "file1", contents=b"modification")
    cmd(archiver, "create", "test3", "input")
    fingerprints3 = _list_fingerprints(archiver, "test3")
    assert fingerprints1["input/file1"] != fingerprints3["input/file1"]
    # Unmodified file should still match
    assert fingerprints1["input/file2"] == fingerprints3["input/file2"]

    # Different chunker params -> different fingerprint
    # We can use the same repo but specify different chunker params for a new archive
    cmd(archiver, "create", "--chunker-params=fixed,4096", "test4", "input")
    fingerprints4 = _list_fingerprints(archiver, "test4")
    # Even unmodified files should have different fingerprints because conditions_hash changed
    assert fingerprints1["input/file2"] != fingerprints4["input/file2"]

    # Also try with buzhash64
    cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095", "test5", "input")
    fingerprints5 = _list_fingerprints(archiver, "test5")
    # Even unmodified files should have different fingerprints because conditions_hash changed
    assert fingerprints1["input/file2"] != fingerprints5["input/file2"]
|
||||
|
|
|
|||
Loading…
Reference in a new issue