From 179aadc5ccf3a83c3ff4053152e6cbc6836e31a6 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 2 Nov 2024 11:33:31 +0100 Subject: [PATCH 01/11] repository: add store_move for soft-delete --- src/borg/remote.py | 5 +++++ src/borg/repository.py | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/src/borg/remote.py b/src/borg/remote.py index 27ec9f68b..150a58068 100644 --- a/src/borg/remote.py +++ b/src/borg/remote.py @@ -180,6 +180,7 @@ class RepositoryServer: # pragma: no cover "store_load", "store_store", "store_delete", + "store_move", ) def __init__(self, restrict_to_paths, restrict_to_repositories, append_only, storage_quota, use_socket): @@ -1093,6 +1094,10 @@ class RemoteRepository: def store_delete(self, name): """actual remoting is done via self.call in the @api decorator""" + @api(since=parse_version("2.0.0b13")) # TODO -> b14 + def store_move(self, name, new_name=None, *, delete=False, undelete=False, deleted=False): + """actual remoting is done via self.call in the @api decorator""" + class RepositoryNoCache: """A not caching Repository wrapper, passes through to repository. diff --git a/src/borg/repository.py b/src/borg/repository.py index 93c5a4f74..8b52f45ca 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -537,3 +537,7 @@ class Repository: def store_delete(self, name): self._lock_refresh() return self.store.delete(name) + + def store_move(self, name, new_name=None, *, delete=False, undelete=False, deleted=False): + self._lock_refresh() + return self.store.move(name, new_name, delete=delete, undelete=undelete, deleted=deleted) From c8bce0329dce1e9a2687a9313913fcde55536460 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 2 Nov 2024 11:40:00 +0100 Subject: [PATCH 02/11] archive deletion: use store soft-deletion This keeps the object, just renames it to "*.del". 
--- src/borg/manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/borg/manifest.py b/src/borg/manifest.py index 9586b5895..6df834cb5 100644 --- a/src/borg/manifest.py +++ b/src/borg/manifest.py @@ -327,7 +327,7 @@ class Archives: # delete an archive assert isinstance(id, bytes) assert not self.legacy - self.repository.store_delete(f"archives/{bin_to_hex(id)}") + self.repository.store_move(f"archives/{bin_to_hex(id)}", delete=True) # soft-delete def list( self, From 088d59d8147af6bc575d6a40ea377392bfd58ac1 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 2 Nov 2024 11:50:48 +0100 Subject: [PATCH 03/11] repository: store_list: add deleted param --- src/borg/remote.py | 6 ++++-- src/borg/repository.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/borg/remote.py b/src/borg/remote.py index 150a58068..72382a98c 100644 --- a/src/borg/remote.py +++ b/src/borg/remote.py @@ -1078,8 +1078,10 @@ class RemoteRepository: def put_manifest(self, data): """actual remoting is done via self.call in the @api decorator""" - @api(since=parse_version("2.0.0b8")) - def store_list(self, name): + @api( + since=parse_version("2.0.0b8"), deleted={"since": parse_version("2.0.0b13"), "previously": False} # TODO -> b14 + ) + def store_list(self, name, *, deleted=False): """actual remoting is done via self.call in the @api decorator""" @api(since=parse_version("2.0.0b8")) diff --git a/src/borg/repository.py b/src/borg/repository.py index 8b52f45ca..68f9c3814 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -519,10 +519,10 @@ class Repository: self._lock_refresh() return self.store.store("config/manifest", data) - def store_list(self, name): + def store_list(self, name, *, deleted=False): self._lock_refresh() try: - return list(self.store.list(name)) + return list(self.store.list(name, deleted=deleted)) except StoreObjectNotFound: return [] From 7ed8ed56d217765c68b8ed9e46d46585cb4a7754 Mon Sep 17 00:00:00 2001 
From: Thomas Waldmann Date: Sat, 2 Nov 2024 13:32:00 +0100 Subject: [PATCH 04/11] repo-list: --deleted lists deleted archives --- src/borg/archive.py | 6 ++-- src/borg/archiver/_common.py | 9 ++++- src/borg/archiver/repo_list_cmd.py | 4 +-- src/borg/helpers/parseformat.py | 5 +-- src/borg/manifest.py | 33 ++++++++++++------- .../testsuite/archiver/repo_list_cmd_test.py | 20 +++++++++++ 6 files changed, 58 insertions(+), 19 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 24411f15b..68cf30bd2 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -458,6 +458,7 @@ class Archive: end=None, log_json=False, iec=False, + deleted=False, ): name_is_id = isinstance(name, bytes) self.cwd = os.getcwd() @@ -499,8 +500,9 @@ class Archive: self.tags = set() else: if name_is_id: - # we also go over the manifest here to avoid quick&dirty deleted archives - info = self.manifest.archives.get_by_id(name) + # we also go over the manifest here to avoid quick&dirty deleted archives, + # except if we explicitly request one via deleted=True. + info = self.manifest.archives.get_by_id(name, deleted=deleted) else: info = self.manifest.archives.get(name) if info is None: diff --git a/src/borg/archiver/_common.py b/src/borg/archiver/_common.py index f8223eb73..068a1fba5 100644 --- a/src/borg/archiver/_common.py +++ b/src/borg/archiver/_common.py @@ -369,7 +369,9 @@ def define_exclusion_group(subparser, **kwargs): return exclude_group -def define_archive_filters_group(subparser, *, sort_by=True, first_last=True, oldest_newest=True, older_newer=True): +def define_archive_filters_group( + subparser, *, sort_by=True, first_last=True, oldest_newest=True, older_newer=True, deleted=False +): filters_group = subparser.add_argument_group( "Archive filters", "Archive filters can be applied to repository targets." ) @@ -456,6 +458,11 @@ def define_archive_filters_group(subparser, *, sort_by=True, first_last=True, ol help="consider archives newer than (now - TIMESPAN), e.g. 
7d or 12m.", ) + if deleted: + filters_group.add_argument( + "--deleted", dest="deleted", action="store_true", help="consider only deleted archives." + ) + return filters_group diff --git a/src/borg/archiver/repo_list_cmd.py b/src/borg/archiver/repo_list_cmd.py index 752b706f2..fbdb327d3 100644 --- a/src/borg/archiver/repo_list_cmd.py +++ b/src/borg/archiver/repo_list_cmd.py @@ -26,7 +26,7 @@ class RepoListMixIn: "BORG_RLIST_FORMAT", "{id:.8} {time} {archive:<15} {tags:<10} {username:<10} {hostname:<10} {comment:.40}{NL}", ) - formatter = ArchiveFormatter(format, repository, manifest, manifest.key, iec=args.iec) + formatter = ArchiveFormatter(format, repository, manifest, manifest.key, iec=args.iec, deleted=args.deleted) output_data = [] @@ -113,4 +113,4 @@ class RepoListMixIn: "but keys used in it are added to the JSON output. " "Some keys are always present. Note: JSON can only represent text.", ) - define_archive_filters_group(subparser) + define_archive_filters_group(subparser, deleted=True) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index f995c8467..0f3f397da 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -718,7 +718,7 @@ class ArchiveFormatter(BaseFormatter): ("size", "nfiles"), ) - def __init__(self, format, repository, manifest, key, *, iec=False): + def __init__(self, format, repository, manifest, key, *, iec=False, deleted=False): static_data = {} # here could be stuff on repo level, above archive level static_data.update(self.FIXED_KEYS) super().__init__(format, static_data) @@ -728,6 +728,7 @@ class ArchiveFormatter(BaseFormatter): self.name = None self.id = None self._archive = None + self.deleted = deleted # True if we want to deal with deleted archives. 
self.iec = iec self.format_keys = {f[1] for f in Formatter().parse(format)} self.call_keys = { @@ -772,7 +773,7 @@ class ArchiveFormatter(BaseFormatter): if self._archive is None or self._archive.id != self.id: from ..archive import Archive - self._archive = Archive(self.manifest, self.id, iec=self.iec) + self._archive = Archive(self.manifest, self.id, iec=self.iec, deleted=self.deleted) return self._archive def get_meta(self, key, default=None): diff --git a/src/borg/manifest.py b/src/borg/manifest.py index 6df834cb5..51785af78 100644 --- a/src/borg/manifest.py +++ b/src/borg/manifest.py @@ -101,11 +101,17 @@ class Archives: manifest_archives = StableDict(self._get_raw_dict()) return manifest_archives - def ids(self): + def ids(self, *, deleted=False): # yield the binary IDs of all archives if not self.legacy: try: - infos = list(self.repository.store_list("archives")) + infos = list(self.repository.store_list("archives", deleted=deleted)) + if deleted: + # hack: store_list(deleted=True) yields deleted AND not deleted items, + # guess this should be fixed in a future borgstore release. 
+ # for now, we remove the not-deleted archives here: + not_deleted_infos = set(self.repository.store_list("archives", deleted=False)) + infos = [info for info in infos if info not in not_deleted_infos] except ObjectNotFound: infos = [] for info in infos: @@ -156,13 +162,13 @@ class Archives: ) return metadata - def _infos(self): + def _infos(self, *, deleted=False): # yield the infos of all archives - for id in self.ids(): + for id in self.ids(deleted=deleted): yield self._get_archive_meta(id) - def _info_tuples(self): - for info in self._infos(): + def _info_tuples(self, *, deleted=False): + for info in self._infos(deleted=deleted): yield ArchiveInfo( name=info["name"], id=info["id"], @@ -172,8 +178,8 @@ class Archives: host=info["hostname"], ) - def _matching_info_tuples(self, match_patterns, match_end): - archive_infos = list(self._info_tuples()) + def _matching_info_tuples(self, match_patterns, match_end, *, deleted=False): + archive_infos = list(self._info_tuples(deleted=deleted)) if match_patterns: assert isinstance(match_patterns, list), f"match_pattern is a {type(match_patterns)}" for match in match_patterns: @@ -279,13 +285,14 @@ class Archives: else: return dict(name=name, id=values["id"], time=values["time"]) - def get_by_id(self, id, raw=False): + def get_by_id(self, id, raw=False, *, deleted=False): assert isinstance(id, bytes) if not self.legacy: - if id in self.ids(): # check directory + if id in self.ids(deleted=deleted): # check directory # looks like this archive id is in the archives directory, thus it is NOT deleted. + # OR we have explicitly requested a soft-deleted archive via deleted=True. archive_info = self._get_archive_meta(id) - if archive_info["exists"]: + if archive_info["exists"]: # True means we have found Archive metadata in the repo. 
if not raw: ts = parse_timestamp(archive_info["time"]) archive_info = ArchiveInfo( @@ -342,6 +349,7 @@ class Archives: newer=None, oldest=None, newest=None, + deleted=False, ): """ Return list of ArchiveInfo instances according to the parameters. @@ -363,7 +371,7 @@ class Archives: if isinstance(sort_by, (str, bytes)): raise TypeError("sort_by must be a sequence of str") - archive_infos = self._matching_info_tuples(match, match_end) + archive_infos = self._matching_info_tuples(match, match_end, deleted=deleted) if any([oldest, newest, older, newer]): archive_infos = filter_archives_by_date( @@ -397,6 +405,7 @@ class Archives: newer=getattr(args, "newer", None), oldest=getattr(args, "oldest", None), newest=getattr(args, "newest", None), + deleted=getattr(args, "deleted", False), ) def get_one(self, match, *, match_end=r"\Z"): diff --git a/src/borg/testsuite/archiver/repo_list_cmd_test.py b/src/borg/testsuite/archiver/repo_list_cmd_test.py index ffce5af00..66cc9bf63 100644 --- a/src/borg/testsuite/archiver/repo_list_cmd_test.py +++ b/src/borg/testsuite/archiver/repo_list_cmd_test.py @@ -98,3 +98,23 @@ def test_repo_list_json(archivers, request): assert "keyfile" not in list_repo["encryption"] archive0 = list_repo["archives"][0] checkts(archive0["time"]) + + +def test_repo_list_deleted(archivers, request): + archiver = request.getfixturevalue(archivers) + cmd(archiver, "repo-create", RK_ENCRYPTION) + cmd(archiver, "create", "normal1", src_dir) + cmd(archiver, "create", "deleted1", src_dir) + cmd(archiver, "create", "normal2", src_dir) + cmd(archiver, "create", "deleted2", src_dir) + cmd(archiver, "delete", "-a", "sh:deleted*") + output = cmd(archiver, "repo-list") + assert "normal1" in output + assert "normal2" in output + assert "deleted1" not in output + assert "deleted2" not in output + output = cmd(archiver, "repo-list", "--deleted") + assert "normal1" not in output + assert "normal2" not in output + assert "deleted1" in output + assert "deleted2" in output From 
9fabc19e6b82c246f151a9ad999b7312933e8182 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 2 Nov 2024 14:52:40 +0100 Subject: [PATCH 05/11] undelete: undelete soft-deleted archives, fixes #8500 --- src/borg/archive.py | 2 +- src/borg/archiver/__init__.py | 3 + src/borg/archiver/undelete_cmd.py | 90 +++++++++++++++++++ src/borg/manifest.py | 10 ++- .../testsuite/archiver/undelete_cmd_test.py | 67 ++++++++++++++ 5 files changed, 169 insertions(+), 3 deletions(-) create mode 100644 src/borg/archiver/undelete_cmd.py create mode 100644 src/borg/testsuite/archiver/undelete_cmd_test.py diff --git a/src/borg/archive.py b/src/borg/archive.py index 68cf30bd2..ccd469e12 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -500,7 +500,7 @@ class Archive: self.tags = set() else: if name_is_id: - # we also go over the manifest here to avoid quick&dirty deleted archives, + # we also go over the manifest here to avoid soft-deleted archives, # except if we explicitly request one via deleted=True. 
info = self.manifest.archives.get_by_id(name, deleted=deleted) else: diff --git a/src/borg/archiver/__init__.py b/src/borg/archiver/__init__.py index add009bea..f8939f894 100644 --- a/src/borg/archiver/__init__.py +++ b/src/borg/archiver/__init__.py @@ -92,6 +92,7 @@ from .serve_cmd import ServeMixIn from .tag_cmd import TagMixIn from .tar_cmds import TarMixIn from .transfer_cmd import TransferMixIn +from .undelete_cmd import UnDeleteMixIn from .version_cmd import VersionMixIn @@ -124,6 +125,7 @@ class Archiver( TagMixIn, TarMixIn, TransferMixIn, + UnDeleteMixIn, VersionMixIn, ): def __init__(self, lock_wait=None, prog=None): @@ -364,6 +366,7 @@ class Archiver( self.build_parser_tag(subparsers, common_parser, mid_common_parser) self.build_parser_tar(subparsers, common_parser, mid_common_parser) self.build_parser_transfer(subparsers, common_parser, mid_common_parser) + self.build_parser_undelete(subparsers, common_parser, mid_common_parser) self.build_parser_version(subparsers, common_parser, mid_common_parser) return parser diff --git a/src/borg/archiver/undelete_cmd.py b/src/borg/archiver/undelete_cmd.py new file mode 100644 index 000000000..31e38ca51 --- /dev/null +++ b/src/borg/archiver/undelete_cmd.py @@ -0,0 +1,90 @@ +import argparse +import logging + +from ._common import with_repository +from ..constants import * # NOQA +from ..helpers import format_archive, CommandError, bin_to_hex, archivename_validator +from ..manifest import Manifest + +from ..logger import create_logger + +logger = create_logger() + + +class UnDeleteMixIn: + @with_repository(manifest=False) + def do_undelete(self, args, repository): + """Undelete archives""" + self.output_list = args.output_list + dry_run = args.dry_run + manifest = Manifest.load(repository, (Manifest.Operation.DELETE,)) + if args.name: + archive_infos = [manifest.archives.get_one([args.name], deleted=True)] + else: + args.deleted = True + archive_infos = manifest.archives.list_considering(args) + count = 
len(archive_infos) + if count == 0: + return + if not args.name and not args.match_archives and args.first == 0 and args.last == 0: + raise CommandError("Aborting: if you really want to undelete all archives, please use -a 'sh:*'.") + + undeleted = False + logger_list = logging.getLogger("borg.output.list") + for i, archive_info in enumerate(archive_infos, 1): + name, id, hex_id = archive_info.name, archive_info.id, bin_to_hex(archive_info.id) + try: + if not dry_run: + manifest.archives.undelete_by_id(id) + except KeyError: + self.print_warning(f"Archive {name} {hex_id} not found ({i}/{count}).") + else: + undeleted = True + if self.output_list: + msg = "Would undelete: {} ({}/{})" if dry_run else "Undeleted archive: {} ({}/{})" + logger_list.info(msg.format(format_archive(archive_info), i, count)) + if dry_run: + logger.info("Finished dry-run.") + elif undeleted: + manifest.write() + self.print_warning("Done.", wc=None) + else: + self.print_warning("Aborted.", wc=None) + return + + def build_parser_undelete(self, subparsers, common_parser, mid_common_parser): + from ._common import process_epilog, define_archive_filters_group + + undelete_epilog = process_epilog( + """ + This command undeletes archives in the repository. + + Important: Undeleting archives is only possible before compacting. + Once ``borg compact`` has run, all disk space occupied only by the + deleted archives will be freed and undelete is not possible anymore. + + When in doubt, use ``--dry-run --list`` to see what would be undeleted. + + You can undelete multiple archives by specifying a matching pattern, + using the ``--match-archives PATTERN`` option (for more info on these patterns, + see :ref:`borg_patterns`). 
+ """ + ) + subparser = subparsers.add_parser( + "undelete", + parents=[common_parser], + add_help=False, + description=self.do_undelete.__doc__, + epilog=undelete_epilog, + formatter_class=argparse.RawDescriptionHelpFormatter, + help="undelete archive", + ) + subparser.set_defaults(func=self.do_undelete) + subparser.add_argument("-n", "--dry-run", dest="dry_run", action="store_true", help="do not change repository") + subparser.add_argument( + "--list", dest="output_list", action="store_true", help="output verbose list of archives" + ) + define_archive_filters_group(subparser) + subparser.add_argument( + "name", metavar="NAME", nargs="?", type=archivename_validator, help="specify the archive name" + ) diff --git a/src/borg/manifest.py b/src/borg/manifest.py index 51785af78..7091c2907 100644 --- a/src/borg/manifest.py +++ b/src/borg/manifest.py @@ -336,6 +336,12 @@ class Archives: assert not self.legacy self.repository.store_move(f"archives/{bin_to_hex(id)}", delete=True) # soft-delete + def undelete_by_id(self, id): + # undelete an archive + assert isinstance(id, bytes) + assert not self.legacy + self.repository.store_move(f"archives/{bin_to_hex(id)}", undelete=True) + def list( self, *, @@ -408,10 +414,10 @@ class Archives: deleted=getattr(args, "deleted", False), ) - def get_one(self, match, *, match_end=r"\Z"): + def get_one(self, match, *, match_end=r"\Z", deleted=False): """get exactly one archive matching """ assert match is not None - archive_infos = self._matching_info_tuples(match, match_end) + archive_infos = self._matching_info_tuples(match, match_end, deleted=deleted) if len(archive_infos) != 1: raise CommandError(f"{match} needed to match precisely one archive, but matched {len(archive_infos)}.") return archive_infos[0] diff --git a/src/borg/testsuite/archiver/undelete_cmd_test.py b/src/borg/testsuite/archiver/undelete_cmd_test.py new file mode 100644 index 000000000..8cbc18a93 --- /dev/null +++ b/src/borg/testsuite/archiver/undelete_cmd_test.py @@ 
-0,0 +1,67 @@ +from ...constants import * # NOQA +from . import cmd, create_regular_file, generate_archiver_tests, RK_ENCRYPTION + +pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA + + +def test_undelete_single(archivers, request): + archiver = request.getfixturevalue(archivers) + create_regular_file(archiver.input_path, "file1", size=1024 * 80) + cmd(archiver, "repo-create", RK_ENCRYPTION) + cmd(archiver, "create", "normal", "input") + cmd(archiver, "create", "deleted", "input") + cmd(archiver, "delete", "deleted") + output = cmd(archiver, "repo-list") + assert "normal" in output + assert "deleted" not in output + cmd(archiver, "undelete", "deleted") + output = cmd(archiver, "repo-list") + assert "normal" in output + assert "deleted" in output # it's back! + cmd(archiver, "check") + + +def test_undelete_multiple_dryrun(archivers, request): + archiver = request.getfixturevalue(archivers) + create_regular_file(archiver.input_path, "file1", size=1024 * 80) + cmd(archiver, "repo-create", RK_ENCRYPTION) + cmd(archiver, "create", "normal", "input") + cmd(archiver, "create", "deleted1", "input") + cmd(archiver, "create", "deleted2", "input") + cmd(archiver, "delete", "deleted1") + cmd(archiver, "delete", "deleted2") + output = cmd(archiver, "repo-list") + assert "normal" in output + assert "deleted1" not in output + assert "deleted2" not in output + output = cmd(archiver, "undelete", "--dry-run", "--list", "-a", "sh:*") + assert "normal" not in output # not a candidate for undeletion + assert "deleted1" in output # candidate for undeletion + assert "deleted2" in output # candidate for undeletion + output = cmd(archiver, "repo-list") # nothing changed, it was a dry-run + assert "normal" in output + assert "deleted1" not in output + assert "deleted2" not in output + + +def test_undelete_multiple_run(archivers, request): + archiver = request.getfixturevalue(archivers) + create_regular_file(archiver.input_path, 
"file1", size=1024 * 80) + cmd(archiver, "repo-create", RK_ENCRYPTION) + cmd(archiver, "create", "normal", "input") + cmd(archiver, "create", "deleted1", "input") + cmd(archiver, "create", "deleted2", "input") + cmd(archiver, "delete", "deleted1") + cmd(archiver, "delete", "deleted2") + output = cmd(archiver, "repo-list") + assert "normal" in output + assert "deleted1" not in output + assert "deleted2" not in output + output = cmd(archiver, "undelete", "--list", "-a", "sh:*") + assert "normal" not in output # not undeleted + assert "deleted1" in output # undeleted + assert "deleted2" in output # undeleted + output = cmd(archiver, "repo-list") # the deleted archives are back, this was not a dry-run + assert "normal" in output + assert "deleted1" in output + assert "deleted2" in output From 299c05287fe6ba13671f43120b0be73d70029fe0 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 2 Nov 2024 15:17:29 +0100 Subject: [PATCH 06/11] compact: remove soft-deleted archives/* entries, docs --- src/borg/archiver/compact_cmd.py | 25 ++++++++++++++++++++++--- src/borg/manifest.py | 8 +++++++- src/borg/remote.py | 6 ++++-- src/borg/repository.py | 4 ++-- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/src/borg/archiver/compact_cmd.py b/src/borg/archiver/compact_cmd.py index 20c9fa480..067a099c4 100644 --- a/src/borg/archiver/compact_cmd.py +++ b/src/borg/archiver/compact_cmd.py @@ -127,6 +127,15 @@ class ArchiveGarbageCollector: logger.warning(f"{len(self.reappeared_chunks)} previously missing objects re-appeared!" 
+ run_repair) set_ec(EXIT_WARNING) + logger.info("Cleaning archives directory from deleted archives...") + archive_infos = self.manifest.archives.list(sort_by=["ts"], deleted=True) + for archive_info in archive_infos: + name, id, hex_id = archive_info.name, archive_info.id, bin_to_hex(archive_info.id) + try: + self.manifest.archives.nuke_by_id(id) + except KeyError: + self.print_warning(f"Archive {name} {hex_id} not found.") + repo_size_before = self.repository_size logger.info("Determining unused objects...") unused = set() @@ -166,9 +175,19 @@ class CompactMixIn: """ Free repository space by deleting unused chunks. - borg compact analyzes all existing archives to find out which chunks are - actually used. There might be unused chunks resulting from borg delete or prune, - which can be removed to free space in the repository. + borg compact analyzes all existing archives to find out which repository + objects are actually used (referenced). It then removes all unused objects + to free repository space. + + Unused objects may result from: + + - borg delete or prune usage + - interrupted backups (maybe retry the backup first before running compact!) + - backup of source files that had an I/O error in the middle of their contents + and that were skipped due to this. + + Important: after compacting it is not possible anymore to use ``borg undelete`` + to recover previously deleted archives. Differently than borg 1.x, borg2's compact needs the borg key if the repo is encrypted. 
diff --git a/src/borg/manifest.py b/src/borg/manifest.py index 7091c2907..9dc9e818c 100644 --- a/src/borg/manifest.py +++ b/src/borg/manifest.py @@ -331,7 +331,7 @@ class Archives: self._archives[name] = {"id": id, "time": ts} def delete_by_id(self, id): - # delete an archive + # soft-delete an archive assert isinstance(id, bytes) assert not self.legacy self.repository.store_move(f"archives/{bin_to_hex(id)}", delete=True) # soft-delete @@ -342,6 +342,12 @@ class Archives: assert not self.legacy self.repository.store_move(f"archives/{bin_to_hex(id)}", undelete=True) + def nuke_by_id(self, id): + # really delete an already soft-deleted archive + assert isinstance(id, bytes) + assert not self.legacy + self.repository.store_delete(f"archives/{bin_to_hex(id)}", deleted=True) + def list( self, *, diff --git a/src/borg/remote.py b/src/borg/remote.py index 72382a98c..fcf6a116f 100644 --- a/src/borg/remote.py +++ b/src/borg/remote.py @@ -1092,8 +1092,10 @@ class RemoteRepository: def store_store(self, name, value): """actual remoting is done via self.call in the @api decorator""" - @api(since=parse_version("2.0.0b8")) - def store_delete(self, name): + @api( + since=parse_version("2.0.0b8"), deleted={"since": parse_version("2.0.0b13"), "previously": False} + ) # TODO -> b14 + def store_delete(self, name, *, deleted=False): """actual remoting is done via self.call in the @api decorator""" @api(since=parse_version("2.0.0b13")) # TODO -> b14 diff --git a/src/borg/repository.py b/src/borg/repository.py index 68f9c3814..5f7ac27e6 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -534,9 +534,9 @@ class Repository: self._lock_refresh() return self.store.store(name, value) - def store_delete(self, name): + def store_delete(self, name, *, deleted=False): self._lock_refresh() - return self.store.delete(name) + return self.store.delete(name, deleted=deleted) def store_move(self, name, new_name=None, *, delete=False, undelete=False, deleted=False): 
self._lock_refresh() From c35cbc9028c5ee536185b80d92f01aea1fb6541b Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 2 Nov 2024 20:16:03 +0100 Subject: [PATCH 07/11] rebuild_archives_directory: accelerate by only reading metadata We are only interested in archive metadata objects here, thus for most repo objects it is enough to read the repoobj's metadata and determine the object's type. Only if it is the right type of object, we need to read the full object (metadata and data). --- src/borg/archive.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index ccd469e12..f3b10e588 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1833,6 +1833,16 @@ class ArchiveChecker: ) for chunk_id, _ in self.chunks.iteritems(): pi.show() + cdata = self.repository.get(chunk_id, read_data=False) # only get metadata + try: + meta = self.repo_objs.parse_meta(chunk_id, cdata, ro_type=ROBJ_DONTCARE) + except IntegrityErrorBase as exc: + logger.error("Skipping corrupted chunk: %s", exc) + self.error_found = True + continue + if meta["type"] != ROBJ_ARCHIVE_META: + continue + # now we know it is an archive metadata chunk, load the full object from the repo: cdata = self.repository.get(chunk_id) try: meta, data = self.repo_objs.parse(chunk_id, cdata, ro_type=ROBJ_DONTCARE) @@ -1841,7 +1851,7 @@ class ArchiveChecker: self.error_found = True continue if meta["type"] != ROBJ_ARCHIVE_META: - continue + continue # should never happen try: archive = msgpack.unpackb(data) # Ignore exceptions that might be raised when feeding msgpack with invalid data From a48a8d2bea07e3c41b650f6286a83b2d035fe63e Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 2 Nov 2024 18:10:12 +0100 Subject: [PATCH 08/11] check --find-lost-archives (was: --undelete-archives) Consider soft-deleted archives/ directory entries, but only create a new archives/ directory entry if: - there is no entry for that archive ID - there is 
no soft-deleted entry for that archive ID either Support running with or without --repair. Without --repair, it can be used to detect such inconsistencies and return with rc != 0. --repository-only contradicts --find-lost-archives. --- src/borg/archive.py | 28 ++++++++++++------- src/borg/archiver/check_cmd.py | 20 ++++++------- src/borg/manifest.py | 8 ++++++ src/borg/testsuite/archiver/check_cmd_test.py | 22 +++++++++------ 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index f3b10e588..c07cec08c 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1635,7 +1635,7 @@ class ArchiveChecker: *, verify_data=False, repair=False, - undelete_archives=False, + find_lost_archives=False, match=None, sort_by="", first=0, @@ -1648,7 +1648,7 @@ class ArchiveChecker: """Perform a set of checks on 'repository' :param repair: enable repair mode, write updated or corrected data into repository - :param undelete_archives: create archive directory entries that are missing + :param find_lost_archives: create archive directory entries that are missing :param first/last/sort_by: only check this number of first/last archives ordered by sort_by :param match: only check archives matching this pattern :param older/newer: only check archives older/newer than timedelta from now @@ -1685,7 +1685,7 @@ class ArchiveChecker: rebuild_manifest = True if rebuild_manifest: self.manifest = self.rebuild_manifest() - if undelete_archives: + if find_lost_archives: self.rebuild_archives_directory() self.rebuild_archives( match=match, first=first, last=last, sort_by=sort_by, older=older, oldest=oldest, newer=newer, newest=newest @@ -1815,8 +1815,10 @@ class ArchiveChecker: """Rebuild the archives directory, undeleting archives. Iterates through all objects in the repository looking for archive metadata blocks. 
- When finding some that do not have a corresponding archives directory entry, it will - create that entry (undeleting all archives). + When finding some that do not have a corresponding archives directory entry (either + a normal entry for an "existing" archive, or a soft-deleted entry for a "deleted" + archive), it will create that entry (making the archives directory consistent with + the repository). """ def valid_archive(obj): @@ -1862,12 +1864,18 @@ class ArchiveChecker: archive = ArchiveItem(internal_dict=archive) name = archive.name archive_id, archive_id_hex = chunk_id, bin_to_hex(chunk_id) - logger.info(f"Found archive {name} {archive_id_hex}.") - if self.manifest.archives.exists_name_and_id(name, archive_id): - logger.info("We already have an archives directory entry for this.") + if self.manifest.archives.exists_id(archive_id, deleted=False): + logger.debug(f"We already have an archives directory entry for {name} {archive_id_hex}.") + elif self.manifest.archives.exists_id(archive_id, deleted=True): + logger.debug(f"We already have a deleted archives directory entry for {name} {archive_id_hex}.") else: - logger.warning(f"Creating archives directory entry for {name} {archive_id_hex}.") - self.manifest.archives.create(name, archive_id, archive.time) + self.error_found = True + if self.repair: + logger.warning(f"Creating archives directory entry for {name} {archive_id_hex}.") + self.manifest.archives.create(name, archive_id, archive.time) + else: + logger.warning(f"Would create archives directory entry for {name} {archive_id_hex}.") + pi.finish() logger.info("Rebuilding missing archives directory entries completed.") diff --git a/src/borg/archiver/check_cmd.py b/src/borg/archiver/check_cmd.py index a7d0ea990..7fe962bff 100644 --- a/src/borg/archiver/check_cmd.py +++ b/src/borg/archiver/check_cmd.py @@ -35,10 +35,10 @@ class CheckMixIn: raise CommandError( "--repository-only contradicts --first, --last, -a / --match-archives and --verify-data arguments." 
) + if args.repo_only and args.find_lost_archives: + raise CommandError("--repository-only contradicts the --find-lost-archives option.") if args.repair and args.max_duration: raise CommandError("--repair does not allow --max-duration argument.") - if args.undelete_archives and not args.repair: - raise CommandError("--undelete-archives requires --repair argument.") if args.max_duration and not args.repo_only: # when doing a partial repo check, we can only check xxh64 hashes in repository files. # archives check requires that a full repo check was done before and has built/cached a ChunkIndex. @@ -51,7 +51,7 @@ class CheckMixIn: repository, verify_data=args.verify_data, repair=args.repair, - undelete_archives=args.undelete_archives, + find_lost_archives=args.find_lost_archives, match=args.match_archives, sort_by=args.sort_by or "ts", first=args.first, @@ -180,11 +180,12 @@ class CheckMixIn: Consequently, if lost chunks were repaired earlier, it is advised to run ``--repair`` a second time after creating some new backups. - If ``--repair --undelete-archives`` is given, Borg will scan the repository + If ``--repair --find-lost-archives`` is given, Borg will scan the repository for archive metadata and if it finds some where no corresponding archives - directory entry exists, it will create the entries. This is basically undoing - ``borg delete archive`` or ``borg prune ...`` commands and only possible before - ``borg compact`` would remove the archives' data completely. + directory entry exists, it will create one. + This will make archives reappear for which the directory entry was lost. + This is only possible before ``borg compact`` would remove the archives' + data completely. 
""" ) subparser = subparsers.add_parser( @@ -213,10 +214,7 @@ class CheckMixIn: "--repair", dest="repair", action="store_true", help="attempt to repair any inconsistencies found" ) subparser.add_argument( - "--undelete-archives", - dest="undelete_archives", - action="store_true", - help="attempt to undelete archives (use with --repair)", + "--find-lost-archives", dest="find_lost_archives", action="store_true", help="attempt to find lost archives" ) subparser.add_argument( "--max-duration", diff --git a/src/borg/manifest.py b/src/borg/manifest.py index 9dc9e818c..608bfcaab 100644 --- a/src/borg/manifest.py +++ b/src/borg/manifest.py @@ -222,6 +222,14 @@ class Archives: else: return name in self._archives + def exists_id(self, id, *, deleted=False): + # check if an archive with this id exists + assert isinstance(id, bytes) + if not self.legacy: + return id in self.ids(deleted=deleted) + else: + raise NotImplementedError + def exists_name_and_id(self, name, id): # check if an archive with this name AND id exists assert isinstance(name, str) diff --git a/src/borg/testsuite/archiver/check_cmd_test.py b/src/borg/testsuite/archiver/check_cmd_test.py index 0eb19e7f1..cc2ee31e5 100644 --- a/src/borg/testsuite/archiver/check_cmd_test.py +++ b/src/borg/testsuite/archiver/check_cmd_test.py @@ -1,4 +1,5 @@ from datetime import datetime, timezone, timedelta +from pathlib import Path import shutil from unittest.mock import patch @@ -270,18 +271,21 @@ def test_manifest_rebuild_corrupted_chunk(archivers, request): def test_check_undelete_archives(archivers, request): archiver = request.getfixturevalue(archivers) check_cmd_setup(archiver) # creates archive1 and archive2 - # borg delete does it rather quick and dirty: it only kills the archives directory entry - cmd(archiver, "delete", "archive1") - cmd(archiver, "delete", "archive2") - output = cmd(archiver, "repo-list") - assert "archive1" not in output - assert "archive2" not in output - # borg check will re-discover archive1 and 
archive2 and new archives directory entries - # will be created because we requested undeleting archives. - cmd(archiver, "check", "--repair", "--undelete-archives", exit_code=0) + existing_archive_ids = set(cmd(archiver, "repo-list", "--short").splitlines()) + create_src_archive(archiver, "archive3") + archive_ids = set(cmd(archiver, "repo-list", "--short").splitlines()) + new_archive_id_hex = (archive_ids - existing_archive_ids).pop() + (Path(archiver.repository_path) / "archives" / new_archive_id_hex).unlink() # lose the entry for archive3 output = cmd(archiver, "repo-list") assert "archive1" in output assert "archive2" in output + assert "archive3" not in output + # borg check will re-discover archive3 and create a new archives directory entry. + cmd(archiver, "check", "--repair", "--find-lost-archives", exit_code=0) + output = cmd(archiver, "repo-list") + assert "archive1" in output + assert "archive2" in output + assert "archive3" in output def test_spoofed_archive(archivers, request): From 72b1a8ea049a45691becc2d67e6dfc07b62d2aeb Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 3 Nov 2024 13:48:43 +0100 Subject: [PATCH 09/11] delete/prune: add hint about undelete --- src/borg/archiver/delete_cmd.py | 7 +++++-- src/borg/archiver/prune_cmd.py | 6 +++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/borg/archiver/delete_cmd.py b/src/borg/archiver/delete_cmd.py index 5ff05d33d..ee411428b 100644 --- a/src/borg/archiver/delete_cmd.py +++ b/src/borg/archiver/delete_cmd.py @@ -64,8 +64,11 @@ class DeleteMixIn: """ This command deletes archives from the repository. - Important: When deleting archives, repository disk space is **not** freed until - you run ``borg compact``. + Important: + + - Repository disk space is **not** freed until you run ``borg compact``. + - You can use ``borg undelete`` to undelete archives, but only until + you run ``borg compact``. When in doubt, use ``--dry-run --list`` to see what would be deleted. 
diff --git a/src/borg/archiver/prune_cmd.py b/src/borg/archiver/prune_cmd.py index b970bc36e..654d89fcc 100644 --- a/src/borg/archiver/prune_cmd.py +++ b/src/borg/archiver/prune_cmd.py @@ -215,7 +215,11 @@ class PruneMixIn: The prune command prunes a repository by deleting all archives not matching any of the specified retention options. - Important: Repository disk space is **not** freed until you run ``borg compact``. + Important: + + - Repository disk space is **not** freed until you run ``borg compact``. + - You can use ``borg undelete`` to undelete archives, but only until + you run ``borg compact``. This command is normally used by automated backup scripts wanting to keep a certain number of historic backups. This retention policy is commonly referred to as From 7cd048f53ad93023121d626d5ccb442a9e323129 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 5 Nov 2024 21:25:24 +0100 Subject: [PATCH 10/11] compact: explain more --- src/borg/archiver/compact_cmd.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/borg/archiver/compact_cmd.py b/src/borg/archiver/compact_cmd.py index 067a099c4..310296fdb 100644 --- a/src/borg/archiver/compact_cmd.py +++ b/src/borg/archiver/compact_cmd.py @@ -185,9 +185,23 @@ class CompactMixIn: - interrupted backups (maybe retry the backup first before running compact!) - backup of source files that had an I/O error in the middle of their contents and that were skipped due to this. + - corruption of the repository (e.g. the archives directory having lost entries) - Important: after compacting it is not possible anymore to use ``borg undelete`` - to recover previously deleted archives. + You usually don't want to run ``borg compact`` after every write operation, but + either regularly (e.g. once a month, possibly together with ``borg check``) or + when disk space needs to be freed. 
+ + **Important:** + + After compacting it is not possible anymore to use ``borg undelete`` to recover + previously deleted archives. + + ``borg compact`` might also delete data from archives that were "lost" due to + archives directory corruption. Such archives could potentially be restored with + ``borg check --find-lost-archives [--repair]``, which is slow, so you + usually don't want to do that unless there are signs of lost archives + (e.g. when seeing fatal errors when creating backups or when archives are + missing in ``borg list``). Differently than borg 1.x, borg2's compact needs the borg key if the repo is encrypted. From 142a739105684bd59d7564a44bf8146f6cc9fbad Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 5 Nov 2024 21:38:46 +0100 Subject: [PATCH 11/11] check: improve docs --- src/borg/archiver/check_cmd.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/borg/archiver/check_cmd.py b/src/borg/archiver/check_cmd.py index 7fe962bff..a5b5bf09f 100644 --- a/src/borg/archiver/check_cmd.py +++ b/src/borg/archiver/check_cmd.py @@ -85,12 +85,14 @@ class CheckMixIn: archive data (requires ``--verify-data``). This includes ensuring that the repository manifest exists, the archive metadata chunk is present, and that all chunks referencing files (items) in the archive exist. This requires - reading archive and file metadata, but not data. To cryptographically verify - the file (content) data integrity pass ``--verify-data``, but keep in mind - that this requires reading all data and is hence very time consuming. When - checking archives of a remote repository, archive checks run on the client - machine because they require decrypting data and therefore the encryption - key. + reading archive and file metadata, but not data. To scan for archives whose + entries were lost from the archives directory, pass ``--find-lost-archives``.
+ It requires reading all data and is hence very time consuming. + To additionally cryptographically verify the file (content) data integrity, + pass ``--verify-data``, which is even more time consuming. + + When checking archives of a remote repository, archive checks run on the client + machine because they require decrypting data and therefore the encryption key. Both steps can also be run independently. Pass ``--repository-only`` to run the repository checks only, or pass ``--archives-only`` to run the archive checks @@ -122,6 +124,15 @@ class CheckMixIn: encrypted repositories against attackers without access to the keys. You can not use ``--verify-data`` with ``--repository-only``. + The ``--find-lost-archives`` option will also scan the whole repository, but + tells Borg to search for lost archive metadata. If Borg encounters any archive + metadata that doesn't match an archives directory entry, it means that an + entry was lost. + As long as ``borg compact`` has not been run, these archives can be fully restored with + ``--repair``. Please note that ``--find-lost-archives`` must read a lot of + data from the repository and is thus very time consuming. You can not use + ``--find-lost-archives`` with ``--repository-only``. + About repair mode +++++++++++++++++ @@ -180,12 +191,9 @@ class CheckMixIn: Consequently, if lost chunks were repaired earlier, it is advised to run ``--repair`` a second time after creating some new backups. - If ``--repair --find-lost-archives`` is given, Borg will scan the repository - for archive metadata and if it finds some where no corresponding archives - directory entry exists, it will create one. - This will make archives reappear for which the directory entry was lost. - This is only possible before ``borg compact`` would remove the archives' - data completely. + If ``--repair --find-lost-archives`` is given, previously lost entries will + be recreated in the archives directory.
This is only possible before + ``borg compact`` would remove the archives' data completely. """ ) subparser = subparsers.add_parser(