list --format: add fingerprint placeholder

This allows users to compare file content efficiently without reading the
full file data, by exposing a hash of the chunk IDs and the relevant
conditions for valid comparisons, like chunker params, chunker seed/key,
id key, key type, etc.

This is based on PR #5167 by @hrehfeld, code + discussion, with some changes:
- the conditions hash now includes more relevant input params
- returning a single value that is composed of 2 parts
- tests (including new buzhash64)

Example output (different files in same archive):

1e88bfb02d0a5320-a539587200c33b857f9827d01fcb7dabacf30501c83929e7308668d43f4a6302 file1
1e88bfb02d0a5320-9ed78a4c14d0506d9ae75d914cca90db64655ddea22647dd1c89f19e2fc080ae file2

The fingerprint has 2 parts:

First part: identical for both files in the example above. This indicates the
            same chunking / chunk ID generation params, meaning that the
            second parts are valid to be compared.

Second part: differs in the example above, because the file content is different.
             An identical second part would mean identical content.
This commit is contained in:
Thomas Waldmann 2026-02-10 08:07:47 +01:00
parent d680ee0feb
commit baed0bc5d0
No known key found for this signature in database
GPG key ID: 243ACFA951F78E01
2 changed files with 81 additions and 1 deletions

View file

@ -15,6 +15,7 @@ from typing import ClassVar, Any, TYPE_CHECKING, Literal
from collections import OrderedDict
from datetime import datetime, timezone
from functools import partial
from hashlib import sha256
from string import Formatter
from ..logger import create_logger
@ -876,6 +877,7 @@ class ItemFormatter(BaseFormatter):
"isoctime": "file change time (ISO 8601 format)",
"isoatime": "file access time (ISO 8601 format)",
"xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
"fingerprint": "Fingerprint of the file content (may have false negatives), format: H(conditions)-H(chunk_ids)",
"archiveid": "internal ID of the archive",
"archivename": "name of the archive",
}
@ -883,7 +885,7 @@ class ItemFormatter(BaseFormatter):
("type", "mode", "uid", "gid", "user", "group", "path", "target", "hlid", "inode", "flags"),
("size", "num_chunks"),
("mtime", "ctime", "atime", "isomtime", "isoctime", "isoatime"),
tuple(sorted(hash_algorithms)),
tuple(["fingerprint"] + sorted(hash_algorithms)),
("archiveid", "archivename", "extra"),
)
@ -903,6 +905,15 @@ class ItemFormatter(BaseFormatter):
self.archive = archive
# track which keys were requested in the format string
self.format_keys = {f[1] for f in Formatter().parse(format)}
# we want a hash over the conditions that influence the chunk ID list for a given file content:
# - the id algorithm and key
# - the chunker seed (if any - buzhash64 derives seed from id_key)
# - the chunker params
key = archive.key
conditions = f"{key.TYPE_STR!r}{key.id_key!r}{key.chunk_seed!r}{archive.metadata.get('chunker_params')!r}"
self.conditions_hash = sha256(conditions.encode()).hexdigest()
self.call_keys = {
"size": self.calculate_size,
"num_chunks": self.calculate_num_chunks,
@ -912,6 +923,7 @@ class ItemFormatter(BaseFormatter):
"mtime": partial(self.format_time, "mtime"),
"ctime": partial(self.format_time, "ctime"),
"atime": partial(self.format_time, "atime"),
"fingerprint": self.calculate_fingerprint,
}
for hash_function in self.hash_algorithms:
self.call_keys[hash_function] = partial(self.hash_item, hash_function)
@ -963,6 +975,16 @@ class ItemFormatter(BaseFormatter):
# note: does not support hard link slaves, they will be size 0
return item.get_size()
def calculate_fingerprint(self, item):
    """Return a cheap content fingerprint for *item*, built from its chunk IDs only.

    The result has the form ``<conditions>-<chunks>``:

    - ``<conditions>`` is the first 16 hex digits (64 bits) of ``self.conditions_hash``.
    - ``<chunks>`` is the SHA-256 hex digest of all chunk IDs of the item, concatenated in order.

    An empty string is returned when ``item.get("chunks")`` gives ``None``.
    """
    chunk_list = item.get("chunks")
    if chunk_list is None:
        return ""
    # No file data is read here, only the chunk IDs already stored with the item get hashed.
    all_ids = b"".join(entry.id for entry in chunk_list)
    content_part = sha256(all_ids).hexdigest()
    # Only few distinct conditions hashes are expected to show up, so collisions are unlikely
    # even when just a 64 bit prefix of the conditions hash is emitted.
    conditions_part = self.conditions_hash[:16]
    return "-".join((conditions_part, content_part))
def hash_item(self, hash_function, item):
if "chunks" not in item:
return ""

View file

@ -201,3 +201,61 @@ def test_list_inode_hardlinks(archivers, request):
assert inodes["input/fileA"] != inodes["input/fileC"]
else:
pytest.skip("Platform does not provide inode numbers for items")
def test_fingerprint(archivers, request):
    """Exercise the ``{fingerprint}`` placeholder of ``list --format``.

    Several archives of the same input directory are created and their listed
    fingerprints are compared: an unchanged re-run, a run after modifying one
    file, and runs with different ``--chunker-params`` (fixed and buzhash64).
    """
    archiver = request.getfixturevalue(archivers)

    def list_fingerprints(archive_name):
        # List the archive as "<fingerprint> <path>" lines and return a {path: fingerprint} dict.
        # Splitting at the first space only keeps paths that contain spaces intact.
        output = cmd(archiver, "list", archive_name, "--format={fingerprint} {path}{NL}")
        fingerprints = {}
        for line in output.splitlines():
            fp, path = line.split(" ", 1)
            fingerprints[path] = fp
        return fingerprints

    cmd(archiver, "repo-create", RK_ENCRYPTION)
    create_regular_file(archiver.input_path, "file1", contents=b"content")
    create_regular_file(archiver.input_path, "file2", contents=b"other")
    cmd(archiver, "create", "test1", "input")
    fingerprints1 = list_fingerprints("test1")

    # Same content, same chunker params -> same fingerprint
    cmd(archiver, "create", "test2", "input")
    fingerprints2 = list_fingerprints("test2")
    assert fingerprints1 == fingerprints2

    # Modified content -> different fingerprint
    create_regular_file(archiver.input_path, "file1", contents=b"modification")
    cmd(archiver, "create", "test3", "input")
    fingerprints3 = list_fingerprints("test3")
    assert fingerprints1["input/file1"] != fingerprints3["input/file1"]
    # Unmodified file should still match
    assert fingerprints1["input/file2"] == fingerprints3["input/file2"]

    # Different chunker params -> different fingerprint
    # We can use the same repo but specify different chunker params for a new archive
    cmd(archiver, "create", "--chunker-params=fixed,4096", "test4", "input")
    fingerprints4 = list_fingerprints("test4")
    # Even unmodified files should have different fingerprints because conditions_hash changed
    assert fingerprints1["input/file2"] != fingerprints4["input/file2"]

    # Also try with buzhash64
    cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095", "test5", "input")
    fingerprints5 = list_fingerprints("test5")
    # Even unmodified files should have different fingerprints because conditions_hash changed
    assert fingerprints1["input/file2"] != fingerprints5["input/file2"]