diff --sort-by: enhanced sorting, fixes #8998

use borg diff --sort-by=spec1,spec2,spec2 for enhanced sorting.

remove legacy --sort behaviour (sort by path), this was deprecated
since 1.4.2.

Co-authored-by: Daniel Rudolf <github.com@daniel-rudolf.de>

This is a port of #9005 to master branch.
This commit is contained in:
Thomas Waldmann 2025-10-12 17:37:14 +02:00
parent 51ddfaf9c1
commit 5c44dad125
No known key found for this signature in database
GPG key ID: 243ACFA951F78E01
4 changed files with 279 additions and 7 deletions

View file

@ -16,3 +16,11 @@ Examples
{"path": "file4", "changes": [{"type": "added", "size": 0}]}
{"path": "file3", "changes": [{"type": "removed", "size": 0}]}
# Use --sort-by with a comma-separated list; sorts apply stably from last to first.
# Here: primary by net size change descending, tie-breaker by path ascending
$ borg diff --sort-by=">size_diff,path" archive1 archive2
+17 B -5 B [-rw-r--r-- -> -rwxr-xr-x] file1
removed 0 B file3
added 0 B file4
+135 B -252 B file2

View file

@ -8,6 +8,8 @@ from ._common import with_repository, build_matcher, Highlander
from ..archive import Archive
from ..constants import * # NOQA
from ..helpers import BaseFormatter, DiffFormatter, archivename_validator, PathSpec, BorgJsonEncoder
from ..helpers import IncludePatternNeverMatchedWarning, remove_surrogates
from ..item import ItemDiff
from ..manifest import Manifest
from ..logger import create_logger
@ -87,11 +89,75 @@ class DiffMixIn:
diffs_iter = Archive.compare_archives_iter(
archive1, archive2, matcher, can_compare_chunk_ids=can_compare_chunk_ids
)
# Conversion to string and filtering for diff.equal to save memory if sorting
# Filter out equal items early (keep as generator; listify only if sorting)
diffs = (diff for diff in diffs_iter if not diff.equal(args.content_only))
if args.sort:
diffs = sorted(diffs, key=lambda diff: diff.path)
sort_specs = []
if args.sort_by:
for spec in args.sort_by.split(","):
spec = spec.strip()
if spec:
sort_specs.append(spec)
def key_for(field: str, d: "ItemDiff"):
# strip direction markers if present
if field and field[0] in ("<", ">"):
field = field[1:]
# path
if field in (None, "", "path"):
return remove_surrogates(d.path)
# compute size_* from changes
if field in ("size_diff", "size_added", "size_removed"):
added = removed = 0
ch = d.changes().get("content")
if ch is not None:
info = ch.to_dict()
t = info.get("type")
if t == "modified":
added = info.get("added", 0)
removed = info.get("removed", 0)
elif t and t.startswith("added"):
added = info.get("added", info.get("size", 0))
removed = 0
elif t and t.startswith("removed"):
added = 0
removed = info.get("removed", info.get("size", 0))
if field == "size_diff":
return added - removed
if field == "size_added":
return added
if field == "size_removed":
return removed
# timestamp diffs
if field in ("ctime_diff", "mtime_diff"):
ts = field.split("_")[0]
t1 = d._item1.get(ts, 0)
t2 = d._item2.get(ts, 0)
return t2 - t1
# size of item in archive2
if field == "size":
it = d._item2
if it is None or it.get("deleted"):
return 0
return it.get_size()
# direct attributes from current item (prefer item2)
it = d._item2 or d._item1
attr_defaults = {"user": "", "group": "", "uid": -1, "gid": -1, "ctime": 0, "mtime": 0}
if field in attr_defaults:
if it is None:
return attr_defaults[field]
return it.get(field, attr_defaults[field])
raise ValueError(f"Invalid field name: {field}")
if sort_specs:
diffs = list(diffs)
# Apply stable sorts from last to first
for spec in reversed(sort_specs):
desc = False
field = spec
if field and field[0] in ("<", ">"):
desc = field[0] == ">"
diffs.sort(key=lambda di: key_for(field, di), reverse=desc)
formatter = DiffFormatter(format, args.content_only)
for diff in diffs:
@ -112,7 +178,7 @@ class DiffMixIn:
"""
This command finds differences (file contents, metadata) between ARCHIVE1 and ARCHIVE2.
For more help on include/exclude patterns, see the :ref:`borg_patterns` command output.
For more help on include/exclude patterns, see the output of the :ref:`borg_patterns` command.
.. man NOTES
@ -149,7 +215,84 @@ class DiffMixIn:
"""
)
+ DiffFormatter.keys_help()
+ textwrap.dedent(
"""
What is compared
+++++++++++++++++
For each matching item in both archives, Borg reports:
- Content changes: total added/removed bytes within files. If chunker parameters are comparable,
Borg compares chunk IDs quickly; otherwise, it compares the content.
- Metadata changes: user, group, mode, and other metadata shown inline, like
"[old_mode -> new_mode]" for mode changes. Use ``--content-only`` to suppress metadata changes.
- Added/removed items: printed as "added SIZE path" or "removed SIZE path".
Output formats
++++++++++++++
The default (text) output shows one line per changed path, e.g.::
+135 B -252 B [ -rw-r--r-- -> -rwxr-xr-x ] path/to/file
JSON Lines output (``--json-lines``) prints one JSON object per changed path, e.g.::
{"path": "PATH", "changes": [
{"type": "modified", "added": BYTES, "removed": BYTES},
{"type": "mode", "old_mode": "-rw-r--r--", "new_mode": "-rwxr-xr-x"},
{"type": "added", "size": SIZE},
{"type": "removed", "size": SIZE}
]}
Sorting
++++++++
Use ``--sort-by FIELDS`` where FIELDS is a comma-separated list of fields.
Sorts are applied stably from last to first in the given list. Prepend ">" for
descending, "<" (or no prefix) for ascending, for example ``--sort-by=">size_added,path"``.
Supported fields include:
- path: the item path
- size_added: total bytes added for the item content
- size_removed: total bytes removed for the item content
- size_diff: size_added - size_removed (net content change)
- size: size of the item as stored in ARCHIVE2 (0 for removed items)
- user, group, uid, gid, ctime, mtime: taken from the item state in ARCHIVE2 when present
- ctime_diff, mtime_diff: timestamp difference (ARCHIVE2 - ARCHIVE1)
Performance considerations
++++++++++++++++++++++++++
diff automatically detects whether the archives were created with the same chunker
parameters. If so, only chunk IDs are compared, which is very fast.
"""
)
)
def diff_sort_spec_validator(s):
if not isinstance(s, str):
raise argparse.ArgumentTypeError("unsupported sort field (not a string)")
allowed = {
"path",
"size_added",
"size_removed",
"size_diff",
"size",
"user",
"group",
"uid",
"gid",
"ctime",
"mtime",
"ctime_diff",
"mtime_diff",
}
parts = [p.strip() for p in s.split(",") if p.strip()]
if not parts:
raise argparse.ArgumentTypeError("unsupported sort field: empty spec")
for spec in parts:
field = spec[1:] if spec and spec[0] in (">", "<") else spec
if field not in allowed:
raise argparse.ArgumentTypeError(f"unsupported sort field: {field}")
return ",".join(parts)
subparser = subparsers.add_parser(
"diff",
parents=[common_parser],
@ -172,7 +315,6 @@ class DiffMixIn:
action="store_true",
help="override the check of chunker parameters",
)
subparser.add_argument("--sort", dest="sort", action="store_true", help="Sort the output lines by file path.")
subparser.add_argument(
"--format",
metavar="FORMAT",
@ -181,6 +323,12 @@ class DiffMixIn:
help='specify format for differences between archives (default: "{change} {path}{NL}")',
)
subparser.add_argument("--json-lines", action="store_true", help="Format output as JSON Lines.")
subparser.add_argument(
"--sort-by",
dest="sort_by",
type=diff_sort_spec_validator,
help="Sort output by comma-separated fields (e.g., '>size_added,path').",
)
subparser.add_argument(
"--content-only",
action="store_true",

View file

@ -263,6 +263,12 @@ class DiffChange:
class ItemDiff:
path: str
_item1: Item
_item2: Item
_chunk_1: Iterator
_chunk_2: Iterator
_numeric_ids: bool
_can_compare_chunk_ids: bool
def __init__(
self,
path: str,

View file

@ -293,7 +293,7 @@ def test_time_diffs(archivers, request):
assert "ctime" not in output
def test_sort_option(archivers, request):
def test_sort_by_option(archivers, request):
archiver = request.getfixturevalue(archivers)
cmd(archiver, "repo-create", RK_ENCRYPTION)
@ -313,7 +313,7 @@ def test_sort_option(archivers, request):
create_regular_file(archiver.input_path, "d_file_added", size=256)
cmd(archiver, "create", "test1", "input")
output = cmd(archiver, "diff", "test0", "test1", "--sort", "--content-only")
output = cmd(archiver, "diff", "test0", "test1", "--sort-by=path", "--content-only")
expected = ["a_file_removed", "b_file_added", "c_file_changed", "d_file_added", "e_file_changed", "f_file_removed"]
assert isinstance(output, str)
outputs = output.splitlines()
@ -321,6 +321,116 @@ def test_sort_option(archivers, request):
assert all(x in line for x, line in zip(expected, outputs))
def test_sort_by_invalid_field_is_rejected(archivers, request):
archiver = request.getfixturevalue(archivers)
cmd(archiver, "repo-create", RK_ENCRYPTION)
create_regular_file(archiver.input_path, "file", size=1)
cmd(archiver, "create", "a1", "input")
create_regular_file(archiver.input_path, "file", size=2)
cmd(archiver, "create", "a2", "input")
# Unsupported field should cause argument parsing error
cmd(archiver, "diff", "a1", "a2", "--sort-by=not_a_field", exit_code=EXIT_ERROR)
def test_sort_by_size_added_then_path(archivers, request):
archiver = request.getfixturevalue(archivers)
cmd(archiver, "repo-create", RK_ENCRYPTION)
# Base archive with two files that will be removed later
create_regular_file(archiver.input_path, "r_big_removed", size=50)
create_regular_file(archiver.input_path, "r_small_removed", size=5)
cmd(archiver, "create", "base", "input")
# Second archive: remove both above and add two new files of different sizes
os.unlink("input/r_big_removed")
os.unlink("input/r_small_removed")
create_regular_file(archiver.input_path, "a_small_added", size=10)
create_regular_file(archiver.input_path, "b_large_added", size=30)
cmd(archiver, "create", "next", "input")
# Sort by size added (ascending), then path to break ties deterministically
output = cmd(archiver, "diff", "base", "next", "--sort-by=size_added,path", "--content-only")
lines = output.splitlines()
# Expect removed entries first (size_added=0), ordered by path, then added entries by increasing size
expected_order = [
"removed:.*input/r_big_removed", # size_added=0
"removed:.*input/r_small_removed", # size_added=0
"added:.*10 B.*input/a_small_added",
"added:.*30 B.*input/b_large_added",
]
assert len(lines) == len(expected_order)
for pattern, line in zip(expected_order, lines):
assert_line_exists([line], pattern)
@pytest.mark.parametrize(
"sort_key",
[
"path",
"size",
"size_added",
"size_removed",
"size_diff",
"user",
"group",
"uid",
"gid",
"ctime",
"mtime",
"ctime_diff",
"mtime_diff",
],
)
def test_sort_by_all_keys_with_directions(archivers, request, sort_key):
archiver = request.getfixturevalue(archivers)
cmd(archiver, "repo-create", RK_ENCRYPTION)
# Prepare initial files
create_regular_file(archiver.input_path, "a_removed", size=11)
create_regular_file(archiver.input_path, "f_removed", size=22)
create_regular_file(archiver.input_path, "c_changed", size=33)
create_regular_file(archiver.input_path, "e_changed", size=44)
cmd(archiver, "create", "s0", "input")
# Ensure that subsequent modifications happen on a later timestamp tick than s0
time.sleep(1.0 if is_darwin else 0.1) # HFS+ has ~1s timestamp granularity on macOS
# Create differences for second archive
os.unlink("input/a_removed")
os.unlink("input/f_removed")
os.unlink("input/c_changed")
os.unlink("input/e_changed")
# Recreate changed files with different sizes
create_regular_file(archiver.input_path, "c_changed", size=333)
create_regular_file(archiver.input_path, "e_changed", size=444)
# Added files
create_regular_file(archiver.input_path, "b_added", size=55)
create_regular_file(archiver.input_path, "d_added", size=66)
cmd(archiver, "create", "s1", "input")
expected_paths = {
"input/a_removed",
"input/b_added",
"input/c_changed",
"input/d_added",
"input/e_changed",
"input/f_removed",
}
# Exercise both ascending and descending for each key.
for direction in ("<", ">"):
sort_spec = f"{direction}{sort_key},path"
output = cmd(archiver, "diff", "s0", "s1", f"--sort-by={sort_spec}", "--content-only")
lines = output.splitlines()
assert len(lines) == len(expected_paths)
# Validate that we got exactly the expected items regardless of order.
# As we do not test the order, this is mostly for test coverage.
seen_paths = {line.split()[-1] for line in lines}
assert seen_paths == expected_paths
@pytest.mark.skipif(not are_hardlinks_supported(), reason="hardlinks not supported")
def test_hard_link_deletion_and_replacement(archivers, request):
archiver = request.getfixturevalue(archivers)