Merge pull request #8854 from ThomasWaldmann/files-cache-cleanup

compact: also clean up files cache, fixes #8852
This commit is contained in:
TW 2025-05-20 18:42:07 +02:00 committed by GitHub
commit 80624d1460
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 127 additions and 8 deletions

View file

@ -1,9 +1,12 @@
import argparse
import os
from typing import Tuple, Set
from ._common import with_repository
from ..archive import Archive
from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo
from ..cache import files_cache_name, discover_files_cache_names
from ..helpers import get_cache_dir
from ..constants import * # NOQA
from ..hashindex import ChunkIndex, ChunkIndexEntry
from ..helpers import set_ec, EXIT_ERROR, format_file_size, bin_to_hex
@ -43,6 +46,7 @@ class ArchiveGarbageCollector:
(self.missing_chunks, self.total_files, self.total_size, self.archives_count) = self.analyze_archives()
self.report_and_delete()
self.save_chunk_index()
self.cleanup_files_cache()
logger.info("Finished compaction / garbage collection...")
def get_repository_chunks(self) -> ChunkIndex:
@ -71,6 +75,43 @@ class ArchiveGarbageCollector:
)
self.chunks = None # nothing there (cleared!)
def cleanup_files_cache(self):
"""
Clean up files cache files for archive series names that no longer exist in the repository.
Note: this only works perfectly if the files cache filename suffixes are automatically generated
and the user does not manually control them via more than one BORG_FILES_CACHE_SUFFIX env var value.
"""
logger.info("Cleaning up files cache...")
cache_dir = os.path.join(get_cache_dir(), self.repository.id_str)
if not os.path.exists(cache_dir):
logger.debug("Cache directory does not exist, skipping files cache cleanup")
return
# Get all existing archive series names
existing_series = set(self.manifest.archives.names())
logger.debug(f"Found {len(existing_series)} existing archive series.")
# Get the set of all existing files cache file names.
try:
files_cache_names = set(discover_files_cache_names(cache_dir))
logger.debug(f"Found {len(files_cache_names)} files cache files.")
except (FileNotFoundError, PermissionError) as e:
logger.warning(f"Could not access cache directory: {e}")
return
used_files_cache_names = set(files_cache_name(series_name) for series_name in existing_series)
unused_files_cache_names = files_cache_names - used_files_cache_names
for cache_filename in unused_files_cache_names:
cache_path = os.path.join(cache_dir, cache_filename)
try:
os.unlink(cache_path)
except (FileNotFoundError, PermissionError) as e:
logger.warning(f"Could not access cache file: {e}")
logger.info(f"Removed {len(unused_files_cache_names)} unused files cache files.")
def analyze_archives(self) -> Tuple[Set, int, int, int]:
"""Iterate over all items in all archives, create the dicts id -> size of all used chunks."""

View file

@ -37,6 +37,35 @@ from .platform import SaveFile
from .remote import RemoteRepository
from .repository import LIST_SCAN_LIMIT, Repository, StoreObjectNotFound, repo_lister
def files_cache_name(archive_name, files_cache_name="files"):
"""
Return the name of the files cache file for the given archive name.
:param archive_name: name of the archive (ideally a series name)
:param files_cache_name: base name of the files cache file
:return: name of the files cache file
"""
suffix = os.environ.get("BORG_FILES_CACHE_SUFFIX", "")
# when using archive series, we automatically make up a separate cache file per series.
# when not, the user may manually do that by using the env var.
if not suffix:
# avoid issues with too complex or long archive_name by hashing it:
suffix = bin_to_hex(xxh64(archive_name.encode()))
return files_cache_name + "." + suffix
def discover_files_cache_names(path, files_cache_name="files"):
"""
Return a list of all files cache file names in the given directory.
:param path: path to the directory to search in
:param files_cache_name: base name of the files cache files
:return: list of files cache file names
"""
return [fn for fn in os.listdir(path) if fn.startswith(files_cache_name + ".")]
# chunks is a list of ChunkListEntry
FileCacheEntry = namedtuple("FileCacheEntry", "age inode size ctime mtime chunks")
@ -495,16 +524,10 @@ class FilesCacheMixin:
return files
def files_cache_name(self):
suffix = os.environ.get("BORG_FILES_CACHE_SUFFIX", "")
# when using archive series, we automatically make up a separate cache file per series.
# when not, the user may manually do that by using the env var.
if not suffix:
# avoid issues with too complex or long archive_name by hashing it:
suffix = bin_to_hex(xxh64(self.archive_name.encode()))
return self.FILES_CACHE_NAME + "." + suffix
return files_cache_name(self.archive_name, self.FILES_CACHE_NAME)
def discover_files_cache_names(self, path):
return [fn for fn in os.listdir(path) if fn.startswith(self.FILES_CACHE_NAME + ".")]
return discover_files_cache_names(path, self.FILES_CACHE_NAME)
def _read_files_cache(self):
"""read files cache from cache directory"""

View file

@ -1,6 +1,9 @@
import os
import pytest
from ...constants import * # NOQA
from ...helpers import get_cache_dir
from ...cache import files_cache_name, discover_files_cache_names
from . import cmd, create_src_archive, generate_archiver_tests, RK_ENCRYPTION
pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA
@ -79,3 +82,55 @@ def test_compact_index_corruption(archivers, request):
output = cmd(archiver, "compact", "-v", "--stats", exit_code=0)
assert "missing objects" not in output
def test_compact_files_cache_cleanup(archivers, request):
"""Test that files cache files for deleted archives are removed during compact."""
archiver = request.getfixturevalue(archivers)
# Create repository and archives
cmd(archiver, "repo-create", RK_ENCRYPTION)
create_src_archive(archiver, "archive1")
create_src_archive(archiver, "archive2")
create_src_archive(archiver, "archive3")
# Get repository ID
output = cmd(archiver, "repo-info")
for line in output.splitlines():
if "Repository ID:" in line:
repo_id = line.split(":", 1)[1].strip()
break
else:
pytest.fail("Could not find repository ID in info output")
# Check cache directory for files cache files
cache_dir = os.path.join(get_cache_dir(), repo_id)
if not os.path.exists(cache_dir):
pytest.skip("Cache directory does not exist, skipping test")
# Get initial files cache files
try:
initial_cache_files = set(discover_files_cache_names(cache_dir))
except (FileNotFoundError, PermissionError):
pytest.skip("Could not access cache directory, skipping test")
# Get expected cache files for remaining archives
expected_cache_files = {files_cache_name(name) for name in ["archive1", "archive2", "archive3"]}
assert expected_cache_files == initial_cache_files, "Unexpected cache files found"
# Delete one archive
cmd(archiver, "delete", "-a", "archive2")
# Run compact
output = cmd(archiver, "compact", "-v")
assert "Cleaning up files cache" in output
# Check that files cache for deleted archive is removed
try:
remaining_cache_files = set(discover_files_cache_names(cache_dir))
except (FileNotFoundError, PermissionError):
pytest.fail("Could not access cache directory after compact")
# Get expected cache files for remaining archives
expected_cache_files = {files_cache_name(name) for name in ["archive1", "archive3"]}
assert expected_cache_files == remaining_cache_files, "Unexpected cache files found"