parseformat: simplify Location parsing/validation, #9678

For sftp/http(s)/s3/b2/rclone repositories, borg only detects the scheme now
and hands the raw URL to borgstore, which parses and validates it - removing
the duplicate parsing borg used to do. Precise field extraction (user/host/
port/path) is kept only for the protocols borg itself reads: file, rest and
legacy ssh.

- drop http_re, s3_re, rclone_re and the sftp arm of the old ssh_or_sftp_re
- add a single scheme-detection pass-through against BORGSTORE_SCHEMES; reject
  unknown schemes (e.g. socket://) as before
- canonical_path() returns the processed URL for the delegated protocols, with
  embedded credentials stripped so secrets never reach the security state file
  or logs
- source local_path_re's scheme exclusions from BORGSTORE_SCHEMES
- create: use proto == "file" instead of "not location.host" for the local
  repo-dir inode skip

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Thomas Waldmann 2026-06-10 18:17:40 +02:00
parent 8509b3b7fa
commit aa9f810453
No known key found for this signature in database
GPG key ID: 243ACFA951F78E01
4 changed files with 76 additions and 100 deletions

View file

@ -201,6 +201,12 @@ Fixes:
Other changes:
- Location: simplify parsing/validation, #9678.
For sftp/http(s)/s3/b2/rclone repositories, borg now only detects the scheme and hands the raw
URL to borgstore, which parses and validates it (removing the duplicate parsing borg did before).
Note: for these repositories the canonical location string changed slightly, so on the first run
against an existing such repository borg may warn once that it "was previously located at ..." -
this is harmless and can be confirmed.
- keyfile: name key files by sha256(keyfile_contents).
Existing legacy-named keyfiles continue to work.
- repokey: use same format as with external keyfile

View file

@ -56,7 +56,7 @@ class CreateMixIn:
except OSError:
pass
# Add local repository dir to inode_skip list
if not args.location.host:
if args.location.proto == "file":
try:
st = os.stat(args.location.path)
skip_inodes.add((st.st_ino, st.st_dev))

View file

@ -503,6 +503,15 @@ def parse_stringified_list(s):
return [item for item in items if item != ""]
def _redact_url_credentials(url):
"""Remove embedded credentials from a repository URL for safe display/identity use."""
# netloc style: scheme://user[:pass]@host... -> scheme://host...
url = re.sub(r"(://)[^/@]+@", r"\1", url)
# s3/b2 style: (s3|b2):profile@... or (s3|b2):key:secret@... -> (s3|b2):...
url = re.sub(r"^((?:s3|b2):)[^@/]+@", r"\1", url)
return url
class Location:
"""Object representing a repository location"""
@ -528,12 +537,14 @@ class Location:
# :port (optional)
optional_port_re = r"(?::(?P<port>\d+))?"
# path may contain any chars. to avoid ambiguities with other regexes,
# it must not start with "//" nor with "scheme://" nor with "rclone:".
local_path_re = r"""
(?!(//|(ssh|socket|sftp|file)://|(rclone|s3|b2):))
(?P<path>.+)
"""
# locations that borgstore parses and validates itself - borg only detects the scheme and
# passes the raw URL through. covers both "scheme://..." and opaque "scheme:..." forms.
BORGSTORE_SCHEMES = ("sftp", "http", "https", "s3", "b2", "rclone")
# path may contain any chars. to avoid ambiguities with other regexes, it must not start with
# "//", a "scheme://" or one of the borgstore "scheme:" specifiers (all of which are matched
# before local_re in _parse). the borgstore scheme list is sourced from BORGSTORE_SCHEMES.
local_path_re = r"(?!(//|(?:ssh|socket|file)://|(?:" + "|".join(BORGSTORE_SCHEMES) + r"):))" r"(?P<path>.+)"
# abs_path must start with a slash (or drive letter on Windows).
abs_path_re = r"(?P<path>[A-Za-z]:/.+)" if is_win32 else r"(?P<path>/.+)"
@ -541,9 +552,14 @@ class Location:
# path may or may not start with a slash.
abs_or_rel_path_re = r"(?P<path>.+)"
# regexes for misc. kinds of supported location specifiers:
ssh_or_sftp_re = re.compile(
r"(?P<proto>(ssh|sftp))://"
# We only parse out individual fields (user/host/port/path) for the protocols where borg
# itself needs them: legacy "ssh" (v1 repositories) and "rest" (for the ssh tunnel + FILE
# backend), plus local "file" paths. Everything else (see BORGSTORE_SCHEMES) is handed to
# borgstore as the raw URL and parsed/validated there - we only detect the scheme.
# ssh:// is only used for legacy borg 1.x repositories nowadays.
ssh_re = re.compile(
r"(?P<proto>ssh)://"
+ optional_user_re
+ host_re
+ optional_port_re
@ -565,39 +581,8 @@ class Location:
re.VERBOSE,
)
# BorgStore REST server
# (http|https)://user:pass@host:port/
http_re = re.compile(
r"(?P<proto>http|https)://"
+ r"((?P<user>[^:@]+):(?P<pass>[^@]+)@)?"
+ host_re
+ optional_port_re
+ r"(?P<path>/)",
re.VERBOSE,
)
# (s3|b2):[(profile|(access_key_id:access_key_secret))@][scheme://hostname[:port]]/bucket/path
s3_re = re.compile(
r"""
(?P<s3type>(s3|b2)):
((
(?P<profile>[^@:]+) # profile (no colons allowed)
|
(?P<access_key_id>[^:@]+):(?P<access_key_secret>[^@]+) # access key and secret
)@)? # optional authentication
(
[^:/]+:// # scheme (often https)
(?P<hostname>[^:/]+)
(:(?P<port>\d+))?
)? # optional endpoint
/
(?P<bucket>[^/]+)/ # bucket name
(?P<path>.+) # path
""",
re.VERBOSE,
)
rclone_re = re.compile(r"(?P<proto>rclone):(?P<path>(.*))", re.VERBOSE)
# scheme detector for the borgstore-handled locations listed in BORGSTORE_SCHEMES above.
scheme_re = re.compile(r"(?P<scheme>[a-zA-Z][a-zA-Z0-9+.\-]*):")
sl = "/" if is_win32 else ""
file_re = re.compile(r"(?P<proto>file)://" + sl + abs_path_re, re.VERBOSE)
@ -636,7 +621,7 @@ class Location:
raise ValueError('Invalid location format: "%s"' % self.processed)
def _parse(self, text):
m = self.ssh_or_sftp_re.match(text)
m = self.ssh_re.match(text)
if m:
self.proto = m.group("proto")
self.user = m.group("user")
@ -652,33 +637,16 @@ class Location:
self.port = m.group("port") and int(m.group("port")) or None
self.path = os.path.normpath(m.group("path"))
return True
m = self.http_re.match(text)
if m:
self.proto = m.group("proto")
self.user = m.group("user")
self._pass = True if m.group("pass") else False
self._host = m.group("host")
self.port = m.group("port") and int(m.group("port")) or None
self.path = m.group("path")
return True
m = self.rclone_re.match(text)
if m:
self.proto = m.group("proto")
self.path = m.group("path")
return True
m = self.file_re.match(text)
if m:
self.proto = m.group("proto")
self.path = os.path.normpath(m.group("path"))
return True
m = self.s3_re.match(text)
if m:
self.proto = m.group("s3type")
self.user = m.group("profile") if m.group("profile") else m.group("access_key_id")
self._pass = True if m.group("access_key_secret") else False
self._host = m.group("hostname")
self.port = m.group("port") and int(m.group("port")) or None
self.path = m.group("bucket") + "/" + m.group("path")
m = self.scheme_re.match(text)
if m and m.group("scheme") in self.BORGSTORE_SCHEMES:
# borgstore parses/validates these; we only detect the scheme and pass the raw
# URL (self.processed) through to it - no fields are extracted here.
self.proto = m.group("scheme")
return True
m = self.local_re.match(text)
if m:
@ -711,9 +679,7 @@ class Location:
def canonical_path(self):
if self.proto == "file":
return self.path
if self.proto == "rclone":
return f"{self.proto}:{self.path}"
if self.proto in ("rest", "sftp", "ssh", "s3", "b2", "http", "https"):
if self.proto in ("rest", "ssh"):
return (
f"{self.proto}://"
f"{(self.user + '@') if self.user else ''}"
@ -721,6 +687,10 @@ class Location:
f"{(':' + str(self.port)) if self.port else ''}/"
f"{self.path}"
)
if self.proto in self.BORGSTORE_SCHEMES:
# borgstore-handled locations: use the raw (processed) URL as given, but strip any
# embedded credentials so we never write secrets to the security state file or logs.
return _redact_url_credentials(self.processed)
raise NotImplementedError(self.proto)
def with_timestamp(self, timestamp):

View file

@ -198,51 +198,51 @@ class TestLocationWithoutEnv:
== "Location(proto='rest', user=None, pass=None, host=None, port=None, path='/absolute/path')"
)
# For the protocols handled (parsed + validated) by borgstore itself, borg only detects
# the scheme and passes the raw URL through; it no longer extracts user/host/port/path.
def test_s3(self, monkeypatch):
monkeypatch.delenv("BORG_REPO", raising=False)
assert (
repr(Location("s3:/test/path"))
== "Location(proto='s3', user=None, pass=None, host=None, port=None, path='test/path')"
loc = Location("s3:/test/path")
assert loc.proto == "s3"
assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None)
assert loc.processed == "s3:/test/path"
# credentials in the URL are stripped from canonical_path (security state file / logs)
assert Location("s3:profile@http://172.28.52.116:9000/test/path").canonical_path() == (
"s3:http://172.28.52.116:9000/test/path"
)
assert (
repr(Location("s3:profile@http://172.28.52.116:9000/test/path"))
== "Location(proto='s3', user='profile', pass=None, host='172.28.52.116', port=9000, path='test/path')" # noqa: E501
assert Location("s3:user:pass@http://172.28.52.116:9000/test/path").canonical_path() == (
"s3:http://172.28.52.116:9000/test/path"
)
assert (
repr(Location("s3:user:pass@http://172.28.52.116:9000/test/path"))
== "Location(proto='s3', user='user', pass='REDACTED', host='172.28.52.116', port=9000, path='test/path')" # noqa: E501
)
assert (
repr(Location("b2:user:pass@https://s3.us-east-005.backblazeb2.com/test/path"))
== "Location(proto='b2', user='user', pass='REDACTED', host='s3.us-east-005.backblazeb2.com', port=None, path='test/path')" # noqa: E501
assert Location("b2:user:pass@https://s3.us-east-005.backblazeb2.com/test/path").canonical_path() == (
"b2:https://s3.us-east-005.backblazeb2.com/test/path"
)
def test_rclone(self, monkeypatch):
monkeypatch.delenv("BORG_REPO", raising=False)
assert (
repr(Location("rclone:remote:path"))
== "Location(proto='rclone', user=None, pass=None, host=None, port=None, path='remote:path')"
)
loc = Location("rclone:remote:path")
assert loc.proto == "rclone"
assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None)
assert loc.processed == "rclone:remote:path"
assert loc.canonical_path() == "rclone:remote:path"
def test_sftp(self, monkeypatch):
monkeypatch.delenv("BORG_REPO", raising=False)
# relative path
assert (
repr(Location("sftp://user@host:1234/rel/path"))
== "Location(proto='sftp', user='user', pass=None, host='host', port=1234, path='rel/path')"
)
# absolute path
assert (
repr(Location("sftp://user@host:1234//abs/path"))
== "Location(proto='sftp', user='user', pass=None, host='host', port=1234, path='/abs/path')"
)
loc = Location("sftp://user@host:1234/rel/path")
assert loc.proto == "sftp"
assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None)
assert loc.processed == "sftp://user@host:1234/rel/path"
# credentials stripped from canonical_path
assert loc.canonical_path() == "sftp://host:1234/rel/path"
def test_http(self, monkeypatch):
monkeypatch.delenv("BORG_REPO", raising=False)
assert (
repr(Location("http://user:pass@host:1234/"))
== "Location(proto='http', user='user', pass='REDACTED', host='host', port=1234, path='/')"
)
loc = Location("http://user:pass@host:1234/")
assert loc.proto == "http"
assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None)
assert loc.processed == "http://user:pass@host:1234/"
# credentials stripped from canonical_path
assert loc.canonical_path() == "http://host:1234/"
def test_socket(self, monkeypatch):
monkeypatch.delenv("BORG_REPO", raising=False)