list --format: add fingerprint placeholder

This allows users to compare file content efficiently without reading the full file data, by exposing a hash of the chunk IDs together with a hash of the conditions that must match for a comparison to be valid (chunker params, chunker seed/key, id key, key type, etc.).

This is based on PR #5167 by @hrehfeld (code + discussion), with some changes:

- the conditions hash now includes more relevant input params
- a single value composed of 2 parts is returned
- tests (including the new buzhash64)

Example output (different files in the same archive):

1e88bfb02d0a5320-a539587200c33b857f9827d01fcb7dabacf30501c83929e7308668d43f4a6302 file1
1e88bfb02d0a5320-9ed78a4c14d0506d9ae75d914cca90db64655ddea22647dd1c89f19e2fc080ae file2

The fingerprint has 2 parts:

First part: same hash, which indicates the same chunking / chunk ID generation params, meaning that the second parts are valid to compare.
Second part: different hash, because the file content is different. The same hash here would mean the same content.
2026-06-11 01:41:57 -04:00 · 2026-02-10 08:07:47 +01:00 · 2026-02-10 08:07:47 +01:00 · baed0bc5d0
commit baed0bc5d0
parent d680ee0feb
2 changed files with 81 additions and 1 deletions
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@ -15,6 +15,7 @@ from typing import ClassVar, Any, TYPE_CHECKING, Literal
 from collections import OrderedDict
 from datetime import datetime, timezone
 from functools import partial
+from hashlib import sha256
 from string import Formatter

 from ..logger import create_logger
@ -876,6 +877,7 @@ class ItemFormatter(BaseFormatter):
        "isoctime": "file change time (ISO 8601 format)",
        "isoatime": "file access time (ISO 8601 format)",
        "xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
+        "fingerprint": "Fingerprint of the file content (may have false negatives), format: H(conditions)-H(chunk_ids)",
        "archiveid": "internal ID of the archive",
        "archivename": "name of the archive",
    }
@ -883,7 +885,7 @@ class ItemFormatter(BaseFormatter):
        ("type", "mode", "uid", "gid", "user", "group", "path", "target", "hlid", "inode", "flags"),
        ("size", "num_chunks"),
        ("mtime", "ctime", "atime", "isomtime", "isoctime", "isoatime"),
-        tuple(sorted(hash_algorithms)),
+        tuple(["fingerprint"] + sorted(hash_algorithms)),
        ("archiveid", "archivename", "extra"),
    )

@ -903,6 +905,15 @@ class ItemFormatter(BaseFormatter):
        self.archive = archive
        # track which keys were requested in the format string
        self.format_keys = {f[1] for f in Formatter().parse(format)}
+
+        # we want a hash over the conditions that influence the chunk ID list for a given file content:
+        # - the id algorithm and key
+        # - the chunker seed (if any - buzhash64 derives seed from id_key)
+        # - the chunker params
+        key = archive.key
+        conditions = f"{key.TYPE_STR!r}{key.id_key!r}{key.chunk_seed!r}{archive.metadata.get('chunker_params')!r}"
+        self.conditions_hash = sha256(conditions.encode()).hexdigest()
+
        self.call_keys = {
            "size": self.calculate_size,
            "num_chunks": self.calculate_num_chunks,
@ -912,6 +923,7 @@ class ItemFormatter(BaseFormatter):
            "mtime": partial(self.format_time, "mtime"),
            "ctime": partial(self.format_time, "ctime"),
            "atime": partial(self.format_time, "atime"),
+            "fingerprint": self.calculate_fingerprint,
        }
        for hash_function in self.hash_algorithms:
            self.call_keys[hash_function] = partial(self.hash_item, hash_function)
@ -963,6 +975,16 @@ class ItemFormatter(BaseFormatter):
        # note: does not support hard link slaves, they will be size 0
        return item.get_size()

+    def calculate_fingerprint(self, item):
+        """Return a cheap content fingerprint for *item*, or "" if it has no "chunks" entry.
+
+        The result has the form ``<conditions>-<chunks>``: the first 16 hex digits of
+        ``self.conditions_hash``, a dash, then the SHA-256 hex digest of the item's chunk IDs
+        taken in order. Only the chunk ID list is consulted; no file data is read.
+        """
+        chunk_list = item.get("chunks")
+        if chunk_list is None:
+            return ""
+        # feeding the IDs one by one yields the same digest as hashing their concatenation
+        digest = sha256()
+        for chunk in chunk_list:
+            digest.update(chunk.id)
+        # only few distinct conditions hashes are expected to be encountered, so collisions are
+        # unlikely and 64 bits (16 hex digits) of the conditions hash are enough.
+        conditions_part = self.conditions_hash[:16]
+        return f"{conditions_part}-{digest.hexdigest()}"
+
    def hash_item(self, hash_function, item):
        if "chunks" not in item:
            return ""
--- a/src/borg/testsuite/archiver/list_cmd_test.py
+++ b/src/borg/testsuite/archiver/list_cmd_test.py
@ -201,3 +201,61 @@ def test_list_inode_hardlinks(archivers, request):
        assert inodes["input/fileA"] != inodes["input/fileC"]
    else:
        pytest.skip("Platform does not provide inode numbers for items")
+
+
+def test_fingerprint(archivers, request):
+    """Check that {fingerprint} tracks file content and the chunking conditions.
+
+    Same content with the same chunker params must give the same fingerprint, changed content
+    must change it, and different chunker params must change it even for unmodified files.
+    """
+    archiver = request.getfixturevalue(archivers)
+
+    def list_fingerprints(archive_name):
+        # map each listed path to the fingerprint printed in front of it
+        listing = cmd(archiver, "list", archive_name, "--format={fingerprint} {path}{NL}")
+        return {path: fp for fp, path in (line.split(" ", 1) for line in listing.splitlines())}
+
+    cmd(archiver, "repo-create", RK_ENCRYPTION)
+    create_regular_file(archiver.input_path, "file1", contents=b"content")
+    create_regular_file(archiver.input_path, "file2", contents=b"other")
+    cmd(archiver, "create", "test1", "input")
+    baseline = list_fingerprints("test1")
+
+    # a second archive of the identical input, same chunker params -> identical fingerprints
+    cmd(archiver, "create", "test2", "input")
+    assert baseline == list_fingerprints("test2")
+
+    # rewriting file1 must change its fingerprint, while the untouched file2 keeps its own
+    create_regular_file(archiver.input_path, "file1", contents=b"modification")
+    cmd(archiver, "create", "test3", "input")
+    after_modification = list_fingerprints("test3")
+    assert baseline["input/file1"] != after_modification["input/file1"]
+    assert baseline["input/file2"] == after_modification["input/file2"]
+
+    # other chunker params (same repo, new archive) change the conditions hash,
+    # so even the unmodified file2 must get a different fingerprint
+    cmd(archiver, "create", "--chunker-params=fixed,4096", "test4", "input")
+    with_fixed_chunker = list_fingerprints("test4")
+    assert baseline["input/file2"] != with_fixed_chunker["input/file2"]
+
+    # the same must hold for the buzhash64 chunker
+    cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095", "test5", "input")
+    with_buzhash64 = list_fingerprints("test5")
+    assert baseline["input/file2"] != with_buzhash64["input/file2"]