Merge pull request #9644 from mr-raj12/legacy-archives-extract
Some checks failed
CI / lint (push) Waiting to run
CI / security (push) Waiting to run
CI / asan_ubsan (push) Blocked by required conditions
CI / native_tests (push) Blocked by required conditions
CI / vm_tests (Haiku, false, haiku, r1beta5) (push) Blocked by required conditions
CI / vm_tests (NetBSD, false, netbsd, 10.1) (push) Blocked by required conditions
CI / vm_tests (OmniOS, false, omnios, r151056) (push) Blocked by required conditions
CI / vm_tests (OpenBSD, false, openbsd, 7.8) (push) Blocked by required conditions
CI / vm_tests (borg-freebsd-14-x86_64-gh, FreeBSD, true, freebsd, 14.3) (push) Blocked by required conditions
CI / windows_tests (push) Blocked by required conditions
Lint / lint (push) Has been cancelled
CodeQL / Analyze (push) Has been cancelled

legacy: extract LegacyArchives into legacy/archives.py, add ArchivesInterface, refs #9556
This commit is contained in:
TW 2026-05-14 20:10:38 +02:00 committed by GitHub
commit 0a1570f2a8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 385 additions and 123 deletions

275
src/borg/legacy/archives.py Normal file
View file

@ -0,0 +1,275 @@
"""Legacy archive list management for Borg 1.x repositories.
In Borg 1.x the list of archives is embedded directly in the manifest blob
as a dict {name: {"id": bytes, "time": str}}. This class manages that dict.
Used by ``borg transfer --from-borg1``.
This module can be removed entirely when Borg 1.x support is dropped.
"""
import re
from datetime import datetime
from operator import attrgetter
from ..constants import * # NOQA
from ..helpers.datastruct import StableDict
from ..helpers.errors import CommandError, Error
from ..helpers.parseformat import bin_to_hex
from ..helpers.time import parse_timestamp
from ..item import ArchiveItem
from ..patterns import get_regex_from_pattern
# ArchiveInfo and filter_archives_by_date are imported from ..manifest below.
# These module-level imports are safe because legacy/archives.py is only ever
# imported from inside Manifest.__init__ — by that point manifest.py is fully
# loaded and present in sys.modules.
from ..manifest import ArchiveInfo, filter_archives_by_date
class LegacyArchives:
    """
    Manage the list of archives for a Borg 1.x repository.

    The archive registry lives inside the manifest blob itself:

        {name: {"id": <bytes>, "time": <iso-timestamp-str>}}

    Manifest.__init__ chooses this class over Archives when the repository is a
    LegacyRepository. It can be deleted entirely when Borg 1.x support is dropped.

    Borg 1.x repositories have no soft-delete, so there are never any "deleted"
    archives: queries with ``deleted=True`` produce no results, and the
    delete / undelete / nuke operations raise NotImplementedError.
    """

    def __init__(self, repository, manifest):
        self.repository = repository
        self.manifest = manifest
        # key: str archive name, value: dict("id": bytes_id, "time": str_iso_ts)
        self._archives = {}

    def prepare(self, manifest, m):
        """Load the archive registry from ``m.archives`` (the unpacked manifest)."""
        self._set_raw_dict(m.archives)

    def finish(self, manifest):
        """Return the archive registry as a StableDict, ready to be stored in the manifest."""
        return StableDict(self._get_raw_dict())

    def ids(self, *, deleted=False):
        """Yield the binary ID of every archive in the registry.

        With ``deleted=True`` nothing is yielded: a Borg 1.x repository has no
        soft-deleted archives, so reporting the live ones would be wrong.
        """
        if deleted:
            return
        for archive_info in self._archives.values():
            yield archive_info["id"]

    def _get_archive_meta(self, id: bytes) -> dict:
        """Return a metadata dict for archive ``id``, read from its ArchiveItem in the repo.

        If the repository has no object for ``id``, a placeholder dict with
        ``exists=False`` is returned instead of raising.

        Raises Exception if the archive item has an unknown metadata version.
        """
        # local import, as in the original code
        from .repository import LegacyRepository

        try:
            cdata = self.repository.get(id)
        except LegacyRepository.ObjectNotFound:
            # We have the pointer, but the repo does not have the archive item.
            # The placeholder carries the same keys as a real result, so callers
            # can access any of them without a KeyError.
            return dict(
                id=id,
                name="archive-does-not-exist",
                time="1970-01-01T00:00:00.000000",
                exists=False,
                username="",
                hostname="",
                size=0,
                nfiles=0,
                comment="",
                tags=(),
            )
        _, data = self.manifest.repo_objs.parse(id, cdata, ro_type=ROBJ_ARCHIVE_META)
        archive_dict = self.manifest.key.unpack_archive(data)
        archive_item = ArchiveItem(internal_dict=archive_dict)
        if archive_item.version not in (1, 2):
            raise Exception("Unknown archive metadata version")
        return dict(
            id=id,
            name=archive_item.name,
            time=archive_item.time,
            exists=True,
            username=archive_item.username,
            hostname=archive_item.hostname,
            size=archive_item.get("size", 0),
            nfiles=archive_item.get("nfiles", 0),
            comment=archive_item.get("comment", ""),
            tags=tuple(sorted(getattr(archive_item, "tags", []))),
        )

    def _infos(self, *, deleted=False):
        """Yield the metadata dict (see _get_archive_meta) of every archive."""
        for id in self.ids(deleted=deleted):
            yield self._get_archive_meta(id)

    def _info_tuples(self, *, deleted=False):
        """Yield an ArchiveInfo (with parsed timestamp) for every archive."""
        for info in self._infos(deleted=deleted):
            yield ArchiveInfo(
                name=info["name"],
                id=info["id"],
                ts=parse_timestamp(info["time"]),
                tags=info["tags"],
                user=info["username"],
                host=info["hostname"],
            )

    def _matching_info_tuples(self, match_patterns, match_end, *, deleted=False):
        """Return the list of ArchiveInfo tuples matching all of ``match_patterns``.

        Each pattern narrows the result of the previous one. Supported prefixes:
        ``aid:`` (archive ID hex prefix, must match exactly one archive),
        ``tags:`` (comma separated, all must be present), ``user:``, ``host:``.
        Anything else (optionally prefixed with ``name:``) is an archive name
        pattern; ``match_end`` is appended to the regex built from it.

        A falsy ``match_patterns`` matches everything.
        Raises CommandError if an ``aid:`` pattern does not match precisely one archive.
        """
        archive_infos = list(self._info_tuples(deleted=deleted))
        if match_patterns:
            assert isinstance(match_patterns, list), f"match_pattern is a {type(match_patterns)}"
            for match in match_patterns:
                if match.startswith("aid:"):
                    wanted_id = match.removeprefix("aid:")
                    archive_infos = [x for x in archive_infos if bin_to_hex(x.id).startswith(wanted_id)]
                    if len(archive_infos) != 1:
                        raise CommandError("archive ID based match needs to match precisely one archive ID")
                elif match.startswith("tags:"):
                    wanted_tags = match.removeprefix("tags:")
                    wanted_tags = [tag for tag in wanted_tags.split(",") if tag]
                    archive_infos = [x for x in archive_infos if set(x.tags) >= set(wanted_tags)]
                elif match.startswith("user:"):
                    wanted_user = match.removeprefix("user:")
                    archive_infos = [x for x in archive_infos if x.user == wanted_user]
                elif match.startswith("host:"):
                    wanted_host = match.removeprefix("host:")
                    archive_infos = [x for x in archive_infos if x.host == wanted_host]
                else:
                    match = match.removeprefix("name:")
                    regex = get_regex_from_pattern(match)
                    regex = re.compile(regex + match_end)
                    archive_infos = [x for x in archive_infos if regex.match(x.name) is not None]
        return archive_infos

    def count(self):
        """Return the number of archives in the registry."""
        return len(self._archives)

    def names(self):
        """Yield the names of all archives in the registry."""
        yield from self._archives.keys()

    def exists(self, name):
        """Return True if an archive called ``name`` is in the registry."""
        assert isinstance(name, str)
        return name in self._archives

    def exists_id(self, id, *, deleted=False):
        """Not implemented for Borg 1.x repositories; always raises NotImplementedError."""
        assert isinstance(id, bytes)
        raise NotImplementedError

    def exists_name_and_id(self, name, id):
        """Not implemented for Borg 1.x repositories; always raises NotImplementedError."""
        assert isinstance(name, str)
        assert isinstance(id, bytes)
        raise NotImplementedError

    def exists_name_and_ts(self, name, ts):
        """Not implemented for Borg 1.x repositories; always raises NotImplementedError."""
        assert isinstance(name, str)
        assert isinstance(ts, datetime)
        raise NotImplementedError

    @staticmethod
    def _make_info(name, values, raw):
        """Build the result for one registry entry: a plain dict if ``raw``, else an ArchiveInfo."""
        if raw:
            return dict(name=name, id=values["id"], time=values["time"])
        ts = parse_timestamp(values["time"])
        return ArchiveInfo(name=name, id=values["id"], ts=ts)

    def get(self, name, raw=False):
        """Look up an archive by name.

        Returns None if there is no such archive. Otherwise returns an
        ArchiveInfo, or (if ``raw``) a dict with the keys name, id and time
        as stored in the manifest.
        """
        assert isinstance(name, str)
        values = self._archives.get(name)
        if values is None:
            return None
        return self._make_info(name, values, raw)

    def get_by_id(self, id, raw=False, *, deleted=False):
        """Look up an archive by its binary ID.

        Returns None if there is no such archive. Otherwise returns an
        ArchiveInfo, or (if ``raw``) a dict with the keys name, id and time
        as stored in the manifest.

        With ``deleted=True`` the result is always None, because a Borg 1.x
        repository has no soft-deleted archives.
        """
        assert isinstance(id, bytes)
        if deleted:
            return None
        for name, values in self._archives.items():
            if id == values["id"]:
                return self._make_info(name, values, raw)
        return None

    def create(self, name, id, ts, *, overwrite=False):
        """Register archive ``name`` with the given ``id`` and timestamp.

        ``ts`` may be a datetime (stored as ISO string with microseconds) or
        an already formatted str.

        Raises KeyError if ``name`` already exists and ``overwrite`` is False.
        """
        assert isinstance(name, str)
        assert isinstance(id, bytes)
        if isinstance(ts, datetime):
            ts = ts.isoformat(timespec="microseconds")
        assert isinstance(ts, str)
        if self.exists(name) and not overwrite:
            raise KeyError("archive already exists")
        self._archives[name] = {"id": id, "time": ts}

    def delete_by_id(self, id):
        """Unsupported for Borg 1.x repositories; always raises NotImplementedError."""
        assert isinstance(id, bytes)
        raise NotImplementedError("Borg 1.x repositories do not support soft-delete")

    def undelete_by_id(self, id):
        """Unsupported for Borg 1.x repositories; always raises NotImplementedError."""
        assert isinstance(id, bytes)
        raise NotImplementedError("Borg 1.x repositories do not support undelete")

    def nuke_by_id(self, id):
        """Unsupported for Borg 1.x repositories; always raises NotImplementedError."""
        assert isinstance(id, bytes)
        raise NotImplementedError("Borg 1.x repositories do not support nuke")

    def list(
        self,
        *,
        match=None,
        match_end=r"\Z",
        sort_by=(),
        reverse=False,
        first=None,
        last=None,
        older=None,
        newer=None,
        oldest=None,
        newest=None,
        deleted=False,
    ):
        """
        Return list of ArchiveInfo instances according to the parameters.

        See Archives.list() for full parameter documentation.

        Processing order: pattern matching, date filtering, sorting (by the
        ``sort_by`` attributes, first one being most significant), then
        ``first`` (takes precedence) or ``last`` slicing, then ``reverse``.

        Raises TypeError if ``sort_by`` is a single str / bytes instead of a
        sequence of attribute names.
        """
        if isinstance(sort_by, (str, bytes)):
            raise TypeError("sort_by must be a sequence of str")
        archive_infos = self._matching_info_tuples(match, match_end, deleted=deleted)
        if any([oldest, newest, older, newer]):
            archive_infos = filter_archives_by_date(
                archive_infos, oldest=oldest, newest=newest, newer=newer, older=older
            )
        # sorting by the least significant key first gives a multi-key sort,
        # because list.sort() is stable.
        for sortkey in reversed(sort_by):
            archive_infos.sort(key=attrgetter(sortkey))
        if first:
            archive_infos = archive_infos[:first]
        elif last:
            archive_infos = archive_infos[max(len(archive_infos) - last, 0) :]
        if reverse:
            archive_infos.reverse()
        return archive_infos

    def list_considering(self, args):
        """Get a list of archives, considering --first/last/prefix/match-archives/sort cmdline args.

        Raises Error if ``args`` also carries a specific archive name.
        """
        name = getattr(args, "name", None)
        if name is not None:
            raise Error(
                "Giving a specific name is incompatible with options --first, --last " "and -a / --match-archives."
            )
        return self.list(
            sort_by=args.sort_by.split(","),
            match=args.match_archives,
            first=getattr(args, "first", None),
            last=getattr(args, "last", None),
            older=getattr(args, "older", None),
            newer=getattr(args, "newer", None),
            oldest=getattr(args, "oldest", None),
            newest=getattr(args, "newest", None),
            deleted=getattr(args, "deleted", False),
        )

    def get_one(self, match, *, match_end=r"\Z", deleted=False):
        """Get exactly one archive matching <match>.

        Raises CommandError if zero or more than one archive matches.
        """
        assert match is not None
        archive_infos = self._matching_info_tuples(match, match_end, deleted=deleted)
        if len(archive_infos) != 1:
            raise CommandError(f"{match} needed to match precisely one archive, but matched {len(archive_infos)}.")
        return archive_infos[0]

    def _set_raw_dict(self, d):
        """Add the entries of ``d`` (the dict we get from the manifest) to the registry."""
        for k, v in d.items():
            assert isinstance(k, str)
            assert isinstance(v, dict) and "id" in v and "time" in v
            self._archives[k] = v

    def _get_raw_dict(self):
        """Return the registry dict itself (not a copy), as it is stored in the manifest."""
        return self._archives

View file

@ -3,7 +3,8 @@ import re
from collections import namedtuple
from datetime import datetime, timedelta, timezone
from operator import attrgetter
from collections.abc import Sequence
from collections.abc import Iterator, Sequence
from typing import Protocol, runtime_checkable
from borgstore.store import ObjectNotFound, ItemInfo
@ -69,65 +70,91 @@ def filter_archives_by_date(archives, older=None, newer=None, oldest=None, newes
return archives
@runtime_checkable
class ArchivesInterface(Protocol):  # pragma: no cover
    """
    Structural interface shared by Archives and LegacyArchives.

    Manifest.__init__ stores an instance of one of these two classes in
    self.archives: LegacyArchives for a LegacyRepository (Borg 1.x), Archives
    otherwise. Callers only rely on the methods declared here and do not need
    to know which implementation they got.

    When Borg 1.x support is dropped, delete LegacyArchives; this Protocol
    can then either be removed or kept as documentation of the Archives public API.
    """

    def prepare(self, manifest, m) -> None:
        """Initialise from the unpacked manifest ``m``."""

    def finish(self, manifest) -> dict:
        """Return the dict to be stored as the manifest's archives entry."""

    def ids(self, *, deleted: bool = False) -> Iterator:
        """Iterate over the binary archive IDs."""

    def count(self) -> int:
        """Return the number of archives."""

    def names(self) -> Iterator:
        """Iterate over the archive names."""

    def exists(self, name: str) -> bool:
        """Check whether an archive with this name exists."""

    def exists_id(self, id: bytes, *, deleted: bool = False) -> bool:
        """Check whether an archive with this id exists."""

    def exists_name_and_id(self, name: str, id: bytes) -> bool:
        """Check whether an archive with this name AND id exists."""

    def exists_name_and_ts(self, name: str, ts) -> bool:
        """Check whether an archive with this name AND timestamp exists."""

    def get(self, name: str, raw: bool = False):
        """Look up an archive by name."""

    def get_by_id(self, id: bytes, raw: bool = False, *, deleted: bool = False):
        """Look up an archive by id."""

    def create(self, name: str, id: bytes, ts, *, overwrite: bool = False) -> None:
        """Register a new archive."""

    def delete_by_id(self, id: bytes) -> None:
        """Soft-delete an archive."""

    def undelete_by_id(self, id: bytes) -> None:
        """Undelete an archive."""

    def nuke_by_id(self, id: bytes) -> None:
        """Really delete an already soft-deleted archive."""

    def list(
        self,
        *,
        match=None,
        match_end=r"\Z",
        sort_by=(),
        reverse=False,
        first=None,
        last=None,
        older=None,
        newer=None,
        oldest=None,
        newest=None,
        deleted=False,
    ):
        """Return a list of archive infos selected by the given parameters."""

    def list_considering(self, args):
        """Return a list of archive infos selected by command line ``args``."""

    def get_one(self, match, *, match_end=r"\Z", deleted=False):
        """Return exactly one archive info matching ``match``."""
class Archives:
"""
Manage the list of archives.
Manage the list of archives for a Borg 2.x repository.
We still need to support the borg 1.x manifest-with-list-of-archives,
so borg transfer can work.
borg2 has separate items archives/* in the borgstore.
Each archive has a separate entry in borgstore at archives/<hex-id>.
"""
def __init__(self, repository, manifest):
from .repository import Repository
from .remote import RemoteRepository
self.repository = repository
self.legacy = not isinstance(repository, (Repository, RemoteRepository))
# key: str archive name, value: dict('id': bytes_id, 'time': str_iso_ts)
self._archives = {}
self.manifest = manifest
def prepare(self, manifest, m):
if not self.legacy:
pass
else:
self._set_raw_dict(m.archives)
pass # borgstore manages the archive directory; nothing to load from the manifest blob
def finish(self, manifest):
if not self.legacy:
manifest_archives = {}
else:
manifest_archives = StableDict(self._get_raw_dict())
return manifest_archives
return {} # manifest["archives"] is always empty in Borg 2
def ids(self, *, deleted=False):
# yield the binary IDs of all archives
if not self.legacy:
try:
infos = list(self.repository.store_list("archives", deleted=deleted))
except ObjectNotFound:
infos = []
for info in infos:
info = ItemInfo(*info) # RPC does not give us a NamedTuple
yield hex_to_bin(info.name)
else:
for archive_info in self._archives.values():
yield archive_info["id"]
try:
infos = list(self.repository.store_list("archives", deleted=deleted))
except ObjectNotFound:
infos = []
for info in infos:
info = ItemInfo(*info) # RPC does not give us a NamedTuple
yield hex_to_bin(info.name)
def _get_archive_meta(self, id: bytes) -> dict:
# get all metadata directly from the ArchiveItem in the repo.
from .legacy.repository import LegacyRepository
from .repository import Repository
try:
cdata = self.repository.get(id)
except (Repository.ObjectNotFound, LegacyRepository.ObjectNotFound):
except Repository.ObjectNotFound:
metadata = dict(
id=id,
name="archive-does-not-exist",
time="1970-01-01T00:00:00.000000",
# new:
exists=False, # we have the pointer, but the repo does not have an archive item
username="",
hostname="",
@ -145,7 +172,6 @@ class Archives:
id=id,
name=archive_item.name,
time=archive_item.time,
# new:
exists=True, # repo has a valid archive item
username=archive_item.username,
hostname=archive_item.hostname,
@ -211,48 +237,35 @@ class Archives:
def exists(self, name):
# check if an archive with this name exists
assert isinstance(name, str)
if not self.legacy:
return name in self.names()
else:
return name in self._archives
return name in self.names()
def exists_id(self, id, *, deleted=False):
# check if an archive with this id exists
assert isinstance(id, bytes)
if not self.legacy:
return id in self.ids(deleted=deleted)
else:
raise NotImplementedError
return id in self.ids(deleted=deleted)
def exists_name_and_id(self, name, id):
# check if an archive with this name AND id exists
assert isinstance(name, str)
assert isinstance(id, bytes)
if not self.legacy:
for archive_info in self._infos():
if archive_info["name"] == name and archive_info["id"] == id:
return True
else:
return False
for archive_info in self._infos():
if archive_info["name"] == name and archive_info["id"] == id:
return True
else:
raise NotImplementedError
return False
def exists_name_and_ts(self, name, ts):
# check if an archive with this name AND timestamp exists
assert isinstance(name, str)
assert isinstance(ts, datetime)
if not self.legacy:
for archive_info in self._info_tuples():
if archive_info.name == name and archive_info.ts == ts:
return True
else:
return False
for archive_info in self._info_tuples():
if archive_info.name == name and archive_info.ts == ts:
return True
else:
raise NotImplementedError
return False
def _lookup_name(self, name, raw=False):
assert isinstance(name, str)
assert not self.legacy
for archive_info in self._infos():
if archive_info["exists"] and archive_info["name"] == name:
if not raw:
@ -272,51 +285,30 @@ class Archives:
def get(self, name, raw=False):
assert isinstance(name, str)
if not self.legacy:
try:
return self._lookup_name(name, raw=raw)
except KeyError:
return None
else:
values = self._archives.get(name)
if values is None:
return None
if not raw:
ts = parse_timestamp(values["time"])
return ArchiveInfo(name=name, id=values["id"], ts=ts)
else:
return dict(name=name, id=values["id"], time=values["time"])
try:
return self._lookup_name(name, raw=raw)
except KeyError:
return None
def get_by_id(self, id, raw=False, *, deleted=False):
assert isinstance(id, bytes)
if not self.legacy:
if id in self.ids(deleted=deleted): # check directory
# looks like this archive id is in the archives directory, thus it is NOT deleted.
# OR we have explicitly requested a soft-deleted archive via deleted=True.
archive_info = self._get_archive_meta(id)
if archive_info["exists"]: # True means we have found Archive metadata in the repo.
if not raw:
ts = parse_timestamp(archive_info["time"])
archive_info = ArchiveInfo(
name=archive_info["name"],
id=archive_info["id"],
ts=ts,
tags=archive_info["tags"],
user=archive_info["username"],
host=archive_info["hostname"],
)
return archive_info
else:
for name, values in self._archives.items():
if id == values["id"]:
break
else:
return None
if not raw:
ts = parse_timestamp(values["time"])
return ArchiveInfo(name=name, id=values["id"], ts=ts)
else:
return dict(name=name, id=values["id"], time=values["time"])
if id in self.ids(deleted=deleted): # check directory
# looks like this archive id is in the archives directory, thus it is NOT deleted.
# OR we have explicitly requested a soft-deleted archive via deleted=True.
archive_info = self._get_archive_meta(id)
if archive_info["exists"]: # True means we have found Archive metadata in the repo.
if not raw:
ts = parse_timestamp(archive_info["time"])
archive_info = ArchiveInfo(
name=archive_info["name"],
id=archive_info["id"],
ts=ts,
tags=archive_info["tags"],
user=archive_info["username"],
host=archive_info["hostname"],
)
return archive_info
return None # id not in store, or archive metadata blob missing from repo
def create(self, name, id, ts, *, overwrite=False):
assert isinstance(name, str)
@ -324,30 +316,22 @@ class Archives:
if isinstance(ts, datetime):
ts = ts.isoformat(timespec="microseconds")
assert isinstance(ts, str)
if not self.legacy:
# we only create a directory entry, its name points to the archive item:
self.repository.store_store(f"archives/{bin_to_hex(id)}", b"")
else:
if self.exists(name) and not overwrite:
raise KeyError("archive already exists")
self._archives[name] = {"id": id, "time": ts}
# we only create a directory entry, its name points to the archive item:
self.repository.store_store(f"archives/{bin_to_hex(id)}", b"")
def delete_by_id(self, id):
# soft-delete an archive
assert isinstance(id, bytes)
assert not self.legacy
self.repository.store_move(f"archives/{bin_to_hex(id)}", delete=True) # soft-delete
def undelete_by_id(self, id):
# undelete an archive
assert isinstance(id, bytes)
assert not self.legacy
self.repository.store_move(f"archives/{bin_to_hex(id)}", undelete=True)
def nuke_by_id(self, id):
# really delete an already soft-deleted archive
assert isinstance(id, bytes)
assert not self.legacy
self.repository.store_delete(f"archives/{bin_to_hex(id)}", deleted=True)
def list(
@ -430,17 +414,6 @@ class Archives:
raise CommandError(f"{match} needed to match precisely one archive, but matched {len(archive_infos)}.")
return archive_infos[0]
def _set_raw_dict(self, d):
"""set the dict we get from the msgpack unpacker"""
for k, v in d.items():
assert isinstance(k, str)
assert isinstance(v, dict) and "id" in v and "time" in v
self._archives[k] = v
def _get_raw_dict(self):
"""get the dict we can give to the msgpack packer"""
return self._archives
class Manifest:
@enum.unique
@ -474,7 +447,13 @@ class Manifest:
MANIFEST_ID = b"\0" * 32
def __init__(self, key, repository, item_keys=None, ro_cls=RepoObj):
self.archives = Archives(repository, self)
from .legacy.repository import LegacyRepository
from .legacy.archives import LegacyArchives
if isinstance(repository, LegacyRepository):
self.archives: ArchivesInterface = LegacyArchives(repository, self)
else:
self.archives: ArchivesInterface = Archives(repository, self)
self.config = {}
self.key = key
self.repo_objs = ro_cls(key)

View file

@ -13,7 +13,7 @@ from ..archive import Archive, CacheChunkBuffer, RobustUnpacker, valid_msgpacked
from ..archive import BackupOSError, backup_io, backup_io_iter, get_item_uid_gid
from ..helpers import msgpack
from ..item import Item, ArchiveItem
from ..manifest import Manifest
from ..manifest import Archives, Manifest
from ..platform import uid2user, gid2group, is_win32
@ -427,3 +427,11 @@ def test_reject_non_sanitized_item():
for path in rejected_dotdot_paths:
with pytest.raises(ValueError, match="unexpected '..' element in path"):
Item(path=path, user="root", group="root")
def test_archives_get_by_id_missing_returns_none():
    """Archives.get_by_id() returns None when the store lists no archive entries."""
    unknown_id = b"\x01" * 32
    empty_repo = Mock()
    empty_repo.store_list.return_value = []  # empty store — id will not be found
    archives = Archives(empty_repo, Mock())
    assert archives.get_by_id(unknown_id) is None