From 318c94d750f844436caf264fab2e6889f0b42e98 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 19 May 2025 22:39:05 +0200 Subject: [PATCH] compact: also clean up files cache, fixes #8852 --- src/borg/archiver/compact_cmd.py | 41 ++++++++++++++ src/borg/cache.py | 39 ++++++++++--- .../testsuite/archiver/compact_cmd_test.py | 55 +++++++++++++++++++ 3 files changed, 127 insertions(+), 8 deletions(-) diff --git a/src/borg/archiver/compact_cmd.py b/src/borg/archiver/compact_cmd.py index e758739ff..ace891b7b 100644 --- a/src/borg/archiver/compact_cmd.py +++ b/src/borg/archiver/compact_cmd.py @@ -1,9 +1,12 @@ import argparse +import os from typing import Tuple, Set from ._common import with_repository from ..archive import Archive from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo +from ..cache import files_cache_name, discover_files_cache_names +from ..helpers import get_cache_dir from ..constants import * # NOQA from ..hashindex import ChunkIndex, ChunkIndexEntry from ..helpers import set_ec, EXIT_ERROR, format_file_size, bin_to_hex @@ -43,6 +46,7 @@ class ArchiveGarbageCollector: (self.missing_chunks, self.total_files, self.total_size, self.archives_count) = self.analyze_archives() self.report_and_delete() self.save_chunk_index() + self.cleanup_files_cache() logger.info("Finished compaction / garbage collection...") def get_repository_chunks(self) -> ChunkIndex: @@ -71,6 +75,43 @@ class ArchiveGarbageCollector: ) self.chunks = None # nothing there (cleared!) + def cleanup_files_cache(self): + """ + Clean up files cache files for archive series names that no longer exist in the repository. + + Note: this only works perfectly if the files cache filename suffixes are automatically generated + and the user does not manually control them via more than one BORG_FILES_CACHE_SUFFIX env var value. + """ + logger.info("Cleaning up files cache...") + + cache_dir = os.path.join(get_cache_dir(), self.repository.id_str) + if not os.path.exists(cache_dir): + logger.debug("Cache directory does not exist, skipping files cache cleanup") + return + + # Get all existing archive series names + existing_series = set(self.manifest.archives.names()) + logger.debug(f"Found {len(existing_series)} existing archive series.") + + # Get the set of all existing files cache file names. + try: + files_cache_names = set(discover_files_cache_names(cache_dir)) + logger.debug(f"Found {len(files_cache_names)} files cache files.") + except (FileNotFoundError, PermissionError) as e: + logger.warning(f"Could not access cache directory: {e}") + return + + used_files_cache_names = set(files_cache_name(series_name) for series_name in existing_series) + unused_files_cache_names = files_cache_names - used_files_cache_names + + for cache_filename in unused_files_cache_names: + cache_path = os.path.join(cache_dir, cache_filename) + try: + os.unlink(cache_path) + except (FileNotFoundError, PermissionError) as e: + logger.warning(f"Could not access cache file: {e}") + logger.info(f"Removed {len(unused_files_cache_names)} unused files cache files.") + def analyze_archives(self) -> Tuple[Set, int, int, int]: """Iterate over all items in all archives, create the dicts id -> size of all used chunks.""" diff --git a/src/borg/cache.py b/src/borg/cache.py index 621a0e279..2583f6f51 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -37,6 +37,35 @@ from .platform import SaveFile from .remote import RemoteRepository from .repository import LIST_SCAN_LIMIT, Repository, StoreObjectNotFound, repo_lister + +def files_cache_name(archive_name, files_cache_name="files"): + """ + Return the name of the files cache file for the given archive name. + + :param archive_name: name of the archive (ideally a series name) + :param files_cache_name: base name of the files cache file + :return: name of the files cache file + """ + suffix = os.environ.get("BORG_FILES_CACHE_SUFFIX", "") + # when using archive series, we automatically make up a separate cache file per series. + # when not, the user may manually do that by using the env var. + if not suffix: + # avoid issues with too complex or long archive_name by hashing it: + suffix = bin_to_hex(xxh64(archive_name.encode())) + return files_cache_name + "." + suffix + + +def discover_files_cache_names(path, files_cache_name="files"): + """ + Return a list of all files cache file names in the given directory. + + :param path: path to the directory to search in + :param files_cache_name: base name of the files cache files + :return: list of files cache file names + """ + return [fn for fn in os.listdir(path) if fn.startswith(files_cache_name + ".")] + + # chunks is a list of ChunkListEntry FileCacheEntry = namedtuple("FileCacheEntry", "age inode size ctime mtime chunks") @@ -495,16 +524,10 @@ class FilesCacheMixin: return files def files_cache_name(self): - suffix = os.environ.get("BORG_FILES_CACHE_SUFFIX", "") - # when using archive series, we automatically make up a separate cache file per series. - # when not, the user may manually do that by using the env var. - if not suffix: - # avoid issues with too complex or long archive_name by hashing it: - suffix = bin_to_hex(xxh64(self.archive_name.encode())) - return self.FILES_CACHE_NAME + "." + suffix + return files_cache_name(self.archive_name, self.FILES_CACHE_NAME) def discover_files_cache_names(self, path): - return [fn for fn in os.listdir(path) if fn.startswith(self.FILES_CACHE_NAME + ".")] + return discover_files_cache_names(path, self.FILES_CACHE_NAME) def _read_files_cache(self): """read files cache from cache directory""" diff --git a/src/borg/testsuite/archiver/compact_cmd_test.py b/src/borg/testsuite/archiver/compact_cmd_test.py index 65524fc90..802645b4c 100644 --- a/src/borg/testsuite/archiver/compact_cmd_test.py +++ b/src/borg/testsuite/archiver/compact_cmd_test.py @@ -1,6 +1,9 @@ +import os import pytest from ...constants import * # NOQA +from ...helpers import get_cache_dir +from ...cache import files_cache_name, discover_files_cache_names from . import cmd, create_src_archive, generate_archiver_tests, RK_ENCRYPTION pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA @@ -79,3 +82,55 @@ def test_compact_index_corruption(archivers, request): output = cmd(archiver, "compact", "-v", "--stats", exit_code=0) assert "missing objects" not in output + + +def test_compact_files_cache_cleanup(archivers, request): + """Test that files cache files for deleted archives are removed during compact.""" + archiver = request.getfixturevalue(archivers) + + # Create repository and archives + cmd(archiver, "repo-create", RK_ENCRYPTION) + create_src_archive(archiver, "archive1") + create_src_archive(archiver, "archive2") + create_src_archive(archiver, "archive3") + + # Get repository ID + output = cmd(archiver, "repo-info") + for line in output.splitlines(): + if "Repository ID:" in line: + repo_id = line.split(":", 1)[1].strip() + break + else: + pytest.fail("Could not find repository ID in info output") + + # Check cache directory for files cache files + cache_dir = os.path.join(get_cache_dir(), repo_id) + if not os.path.exists(cache_dir): + pytest.skip("Cache directory does not exist, skipping test") + + # Get initial files cache files + try: + initial_cache_files = set(discover_files_cache_names(cache_dir)) + except (FileNotFoundError, PermissionError): + pytest.skip("Could not access cache directory, skipping test") + + # Get expected cache files for remaining archives + expected_cache_files = {files_cache_name(name) for name in ["archive1", "archive2", "archive3"]} + assert expected_cache_files == initial_cache_files, "Unexpected cache files found" + + # Delete one archive + cmd(archiver, "delete", "-a", "archive2") + + # Run compact + output = cmd(archiver, "compact", "-v") + assert "Cleaning up files cache" in output + + # Check that files cache for deleted archive is removed + try: + remaining_cache_files = set(discover_files_cache_names(cache_dir)) + except (FileNotFoundError, PermissionError): + pytest.fail("Could not access cache directory after compact") + + # Get expected cache files for remaining archives + expected_cache_files = {files_cache_name(name) for name in ["archive1", "archive3"]} + assert expected_cache_files == remaining_cache_files, "Unexpected cache files found"