From aa9f810453d3bb62d4af93b675e88851df035df3 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 10 Jun 2026 18:17:40 +0200 Subject: [PATCH 1/2] parseformat: simplify Location parsing/validation, #9678 For sftp/http(s)/s3/b2/rclone repositories, borg only detects the scheme now and hands the raw URL to borgstore, which parses and validates it - removing the duplicate parsing borg used to do. Precise field extraction (user/host/ port/path) is kept only for the protocols borg itself reads: file, rest and legacy ssh. - drop http_re, s3_re, rclone_re and the sftp arm of the old ssh_or_sftp_re - add a single scheme-detection pass-through against BORGSTORE_SCHEMES; reject unknown schemes (e.g. socket://) as before - canonical_path() returns the processed URL for the delegated protocols, with embedded credentials stripped so secrets never reach the security state file or logs - source local_path_re's scheme exclusions from BORGSTORE_SCHEMES - create: use proto == "file" instead of "not location.host" for the local repo-dir inode skip Co-Authored-By: Claude Opus 4.8 --- docs/changes.rst | 6 + src/borg/archiver/create_cmd.py | 2 +- src/borg/helpers/parseformat.py | 106 +++++++----------- .../testsuite/helpers/parseformat_test.py | 62 +++++----- 4 files changed, 76 insertions(+), 100 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index 8ec31bf3a..2526d5280 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -201,6 +201,12 @@ Fixes: Other changes: +- Location: simplify parsing/validation, #9678. + For sftp/http(s)/s3/b2/rclone repositories, borg now only detects the scheme and hands the raw + URL to borgstore, which parses and validates it (removing the duplicate parsing borg did before). + Note: for these repositories the canonical location string changed slightly, so on the first run + against an existing such repository borg may warn once that it "was previously located at ..." - + this is harmless and can be confirmed. - keyfile: name key files by sha256(keyfile_contents). Existing legacy-named keyfiles continue to work. - repokey: use same format as with external keyfile diff --git a/src/borg/archiver/create_cmd.py b/src/borg/archiver/create_cmd.py index 9bf4d4f68..cb48500a3 100644 --- a/src/borg/archiver/create_cmd.py +++ b/src/borg/archiver/create_cmd.py @@ -56,7 +56,7 @@ class CreateMixIn: except OSError: pass # Add local repository dir to inode_skip list - if not args.location.host: + if args.location.proto == "file": try: st = os.stat(args.location.path) skip_inodes.add((st.st_ino, st.st_dev)) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 23b0b13be..43d57e26f 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -503,6 +503,15 @@ def parse_stringified_list(s): return [item for item in items if item != ""] +def _redact_url_credentials(url): + """Remove embedded credentials from a repository URL for safe display/identity use.""" + # netloc style: scheme://user[:pass]@host... -> scheme://host... + url = re.sub(r"(://)[^/@]+@", r"\1", url) + # s3/b2 style: (s3|b2):profile@... or (s3|b2):key:secret@... -> (s3|b2):... + url = re.sub(r"^((?:s3|b2):)[^@/]+@", r"\1", url) + return url + + class Location: """Object representing a repository location""" @@ -528,12 +537,14 @@ class Location: # :port (optional) optional_port_re = r"(?::(?P\d+))?" - # path may contain any chars. to avoid ambiguities with other regexes, - # it must not start with "//" nor with "scheme://" nor with "rclone:". - local_path_re = r""" - (?!(//|(ssh|socket|sftp|file)://|(rclone|s3|b2):)) - (?P.+) - """ + # locations that borgstore parses and validates itself - borg only detects the scheme and + # passes the raw URL through. covers both "scheme://..." and opaque "scheme:..." forms. + BORGSTORE_SCHEMES = ("sftp", "http", "https", "s3", "b2", "rclone") + + # path may contain any chars. to avoid ambiguities with other regexes, it must not start with + # "//", a "scheme://" or one of the borgstore "scheme:" specifiers (all of which are matched + # before local_re in _parse). the borgstore scheme list is sourced from BORGSTORE_SCHEMES. + local_path_re = r"(?!(//|(?:ssh|socket|file)://|(?:" + "|".join(BORGSTORE_SCHEMES) + r"):))" r"(?P.+)" # abs_path must start with a slash (or drive letter on Windows). abs_path_re = r"(?P[A-Za-z]:/.+)" if is_win32 else r"(?P/.+)" @@ -541,9 +552,14 @@ class Location: # path may or may not start with a slash. abs_or_rel_path_re = r"(?P.+)" - # regexes for misc. kinds of supported location specifiers: - ssh_or_sftp_re = re.compile( - r"(?P(ssh|sftp))://" + # We only parse out individual fields (user/host/port/path) for the protocols where borg + # itself needs them: legacy "ssh" (v1 repositories) and "rest" (for the ssh tunnel + FILE + # backend), plus local "file" paths. Everything else (see BORGSTORE_SCHEMES) is handed to + # borgstore as the raw URL and parsed/validated there - we only detect the scheme. + + # ssh:// is only used for legacy borg 1.x repositories nowadays. + ssh_re = re.compile( + r"(?Pssh)://" + optional_user_re + host_re + optional_port_re @@ -565,39 +581,8 @@ class Location: re.VERBOSE, ) - # BorgStore REST server - # (http|https)://user:pass@host:port/ - http_re = re.compile( - r"(?Phttp|https)://" - + r"((?P[^:@]+):(?P[^@]+)@)?" - + host_re - + optional_port_re - + r"(?P/)", - re.VERBOSE, - ) - - # (s3|b2):[(profile|(access_key_id:access_key_secret))@][scheme://hostname[:port]]/bucket/path - s3_re = re.compile( - r""" - (?P(s3|b2)): - (( - (?P[^@:]+) # profile (no colons allowed) - | - (?P[^:@]+):(?P[^@]+) # access key and secret - )@)? # optional authentication - ( - [^:/]+:// # scheme (often https) - (?P[^:/]+) - (:(?P\d+))? - )? # optional endpoint - / - (?P[^/]+)/ # bucket name - (?P.+) # path - """, - re.VERBOSE, - ) - - rclone_re = re.compile(r"(?Prclone):(?P(.*))", re.VERBOSE) + # scheme detector for the borgstore-handled locations listed in BORGSTORE_SCHEMES above. + scheme_re = re.compile(r"(?P[a-zA-Z][a-zA-Z0-9+.\-]*):") sl = "/" if is_win32 else "" file_re = re.compile(r"(?Pfile)://" + sl + abs_path_re, re.VERBOSE) @@ -636,7 +621,7 @@ class Location: raise ValueError('Invalid location format: "%s"' % self.processed) def _parse(self, text): - m = self.ssh_or_sftp_re.match(text) + m = self.ssh_re.match(text) if m: self.proto = m.group("proto") self.user = m.group("user") @@ -652,33 +637,16 @@ class Location: self.port = m.group("port") and int(m.group("port")) or None self.path = os.path.normpath(m.group("path")) return True - m = self.http_re.match(text) - if m: - self.proto = m.group("proto") - self.user = m.group("user") - self._pass = True if m.group("pass") else False - self._host = m.group("host") - self.port = m.group("port") and int(m.group("port")) or None - self.path = m.group("path") - return True - m = self.rclone_re.match(text) - if m: - self.proto = m.group("proto") - self.path = m.group("path") - return True m = self.file_re.match(text) if m: self.proto = m.group("proto") self.path = os.path.normpath(m.group("path")) return True - m = self.s3_re.match(text) - if m: - self.proto = m.group("s3type") - self.user = m.group("profile") if m.group("profile") else m.group("access_key_id") - self._pass = True if m.group("access_key_secret") else False - self._host = m.group("hostname") - self.port = m.group("port") and int(m.group("port")) or None - self.path = m.group("bucket") + "/" + m.group("path") + m = self.scheme_re.match(text) + if m and m.group("scheme") in self.BORGSTORE_SCHEMES: + # borgstore parses/validates these; we only detect the scheme and pass the raw + # URL (self.processed) through to it - no fields are extracted here. + self.proto = m.group("scheme") return True m = self.local_re.match(text) if m: @@ -711,9 +679,7 @@ class Location: def canonical_path(self): if self.proto == "file": return self.path - if self.proto == "rclone": - return f"{self.proto}:{self.path}" - if self.proto in ("rest", "sftp", "ssh", "s3", "b2", "http", "https"): + if self.proto in ("rest", "ssh"): return ( f"{self.proto}://" f"{(self.user + '@') if self.user else ''}" @@ -721,6 +687,10 @@ class Location: f"{(':' + str(self.port)) if self.port else ''}/" f"{self.path}" ) + if self.proto in self.BORGSTORE_SCHEMES: + # borgstore-handled locations: use the raw (processed) URL as given, but strip any + # embedded credentials so we never write secrets to the security state file or logs. + return _redact_url_credentials(self.processed) raise NotImplementedError(self.proto) def with_timestamp(self, timestamp): diff --git a/src/borg/testsuite/helpers/parseformat_test.py b/src/borg/testsuite/helpers/parseformat_test.py index 93bf0b028..9dca20576 100644 --- a/src/borg/testsuite/helpers/parseformat_test.py +++ b/src/borg/testsuite/helpers/parseformat_test.py @@ -198,51 +198,51 @@ class TestLocationWithoutEnv: == "Location(proto='rest', user=None, pass=None, host=None, port=None, path='/absolute/path')" ) + # For the protocols handled (parsed + validated) by borgstore itself, borg only detects + # the scheme and passes the raw URL through; it no longer extracts user/host/port/path. + def test_s3(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False) - assert ( - repr(Location("s3:/test/path")) - == "Location(proto='s3', user=None, pass=None, host=None, port=None, path='test/path')" + loc = Location("s3:/test/path") + assert loc.proto == "s3" + assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None) + assert loc.processed == "s3:/test/path" + # credentials in the URL are stripped from canonical_path (security state file / logs) + assert Location("s3:profile@http://172.28.52.116:9000/test/path").canonical_path() == ( + "s3:http://172.28.52.116:9000/test/path" ) - assert ( - repr(Location("s3:profile@http://172.28.52.116:9000/test/path")) - == "Location(proto='s3', user='profile', pass=None, host='172.28.52.116', port=9000, path='test/path')" # noqa: E501 + assert Location("s3:user:pass@http://172.28.52.116:9000/test/path").canonical_path() == ( + "s3:http://172.28.52.116:9000/test/path" ) - assert ( - repr(Location("s3:user:pass@http://172.28.52.116:9000/test/path")) - == "Location(proto='s3', user='user', pass='REDACTED', host='172.28.52.116', port=9000, path='test/path')" # noqa: E501 - ) - assert ( - repr(Location("b2:user:pass@https://s3.us-east-005.backblazeb2.com/test/path")) - == "Location(proto='b2', user='user', pass='REDACTED', host='s3.us-east-005.backblazeb2.com', port=None, path='test/path')" # noqa: E501 + assert Location("b2:user:pass@https://s3.us-east-005.backblazeb2.com/test/path").canonical_path() == ( + "b2:https://s3.us-east-005.backblazeb2.com/test/path" ) def test_rclone(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False) - assert ( - repr(Location("rclone:remote:path")) - == "Location(proto='rclone', user=None, pass=None, host=None, port=None, path='remote:path')" - ) + loc = Location("rclone:remote:path") + assert loc.proto == "rclone" + assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None) + assert loc.processed == "rclone:remote:path" + assert loc.canonical_path() == "rclone:remote:path" def test_sftp(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False) - # relative path - assert ( - repr(Location("sftp://user@host:1234/rel/path")) - == "Location(proto='sftp', user='user', pass=None, host='host', port=1234, path='rel/path')" - ) - # absolute path - assert ( - repr(Location("sftp://user@host:1234//abs/path")) - == "Location(proto='sftp', user='user', pass=None, host='host', port=1234, path='/abs/path')" - ) + loc = Location("sftp://user@host:1234/rel/path") + assert loc.proto == "sftp" + assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None) + assert loc.processed == "sftp://user@host:1234/rel/path" + # credentials stripped from canonical_path + assert loc.canonical_path() == "sftp://host:1234/rel/path" def test_http(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False) - assert ( - repr(Location("http://user:pass@host:1234/")) - == "Location(proto='http', user='user', pass='REDACTED', host='host', port=1234, path='/')" - ) + loc = Location("http://user:pass@host:1234/") + assert loc.proto == "http" + assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None) + assert loc.processed == "http://user:pass@host:1234/" + # credentials stripped from canonical_path + assert loc.canonical_path() == "http://host:1234/" def test_socket(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False) From 5eab1831838273df9345f1670f89a247566b831a Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 10 Jun 2026 18:28:50 +0200 Subject: [PATCH 2/2] remove leftover socket: protocol code The unix-socket transport (socket:// repositories, the --socket option and "borg serve" over a socket) was never part of a stable borg 2 release and the old RPC protocol it relied on is gone, so the remaining code was dead: - legacy remote: drop the unreachable proto == "socket" connection branch and the now-unused self.sock handling, "import socket" and get_socket_filename import (LegacyRemoteRepository is only built for proto == "ssh") - helpers: remove get_socket_filename() and its export - parseformat: drop "socket" from local_path_re - socket:// is now treated like any other unknown scheme (a local path) rather than being special-cased - tests: drop test_socket and the self.sock check in the legacy reopen helper - docs: drop the stale --socket entry from the manually maintained common-options.rst.inc (the auto-generated usage/man docs are left untouched here and will be rebuilt in a separate commit) Co-Authored-By: Claude Opus 4.8 --- docs/changes.rst | 2 ++ docs/usage/common-options.rst.inc | 1 - src/borg/helpers/__init__.py | 2 +- src/borg/helpers/fs.py | 4 --- src/borg/helpers/parseformat.py | 2 +- src/borg/legacy/remote.py | 36 ++----------------- src/borg/logger.py | 2 +- .../testsuite/helpers/parseformat_test.py | 7 ---- src/borg/testsuite/legacyrepository_test.py | 2 +- 9 files changed, 9 insertions(+), 49 deletions(-) diff --git a/docs/changes.rst b/docs/changes.rst index 2526d5280..f52b77111 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -207,6 +207,8 @@ Other changes: Note: for these repositories the canonical location string changed slightly, so on the first run against an existing such repository borg may warn once that it "was previously located at ..." - this is harmless and can be confirmed. +- remove leftover socket: protocol code (the unix-socket transport and the --socket option were + never part of a stable borg 2 release). - keyfile: name key files by sha256(keyfile_contents). Existing legacy-named keyfiles continue to work. - repokey: use same format as with external keyfile diff --git a/docs/usage/common-options.rst.inc b/docs/usage/common-options.rst.inc index 31a9df865..a96b00c3e 100644 --- a/docs/usage/common-options.rst.inc +++ b/docs/usage/common-options.rst.inc @@ -17,5 +17,4 @@ --upload-buffer UPLOAD_BUFFER set network upload buffer size in MiB. (default: 0=no buffer) --debug-profile FILE Write execution profile in Borg format into FILE. For local use a Python-compatible file can be generated by suffixing FILE with ".pyprof". --rsh RSH Use this command to connect to the 'borg serve' process (default: 'ssh') ---socket PATH Use UNIX DOMAIN (IPC) socket at PATH for client/server communication with socket: protocol. -r REPO, --repo REPO repository to use diff --git a/src/borg/helpers/__init__.py b/src/borg/helpers/__init__.py index 2d7258e26..530952d96 100644 --- a/src/borg/helpers/__init__.py +++ b/src/borg/helpers/__init__.py @@ -17,7 +17,7 @@ from .errors import RTError, PathNotAllowed, modern_ec from .errors import BorgWarning, FileChangedWarning, BackupWarning, IncludePatternNeverMatchedWarning from .errors import BackupError, BackupOSError, BackupRaceConditionError, BackupItemExcluded from .errors import BackupPermissionError, BackupIOError, BackupFileNotFoundError -from .fs import ensure_dir, join_base_dir, get_socket_filename +from .fs import ensure_dir, join_base_dir from .fs import get_security_dir, get_keys_dir, get_base_dir, get_cache_dir, get_config_dir, get_runtime_dir from .fs import dir_is_tagged, dir_is_cachedir, remove_dotdot_prefixes, make_path_safe, scandir_inorder from .fs import secure_erase, safe_unlink, dash_open, os_open, os_stat, get_strip_prefix, umount, slashify diff --git a/src/borg/helpers/fs.py b/src/borg/helpers/fs.py index cf578ed2a..b788e446b 100644 --- a/src/borg/helpers/fs.py +++ b/src/borg/helpers/fs.py @@ -169,10 +169,6 @@ def get_runtime_dir(*, legacy=False, create=True): return runtime_dir -def get_socket_filename(): - return str(Path(get_runtime_dir()) / "borg.sock") - - def get_cache_dir(*, legacy=False, create=True): """Determine where to store Borg cache data.""" diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 43d57e26f..e45ef2a4e 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -544,7 +544,7 @@ class Location: # path may contain any chars. to avoid ambiguities with other regexes, it must not start with # "//", a "scheme://" or one of the borgstore "scheme:" specifiers (all of which are matched # before local_re in _parse). the borgstore scheme list is sourced from BORGSTORE_SCHEMES. - local_path_re = r"(?!(//|(?:ssh|socket|file)://|(?:" + "|".join(BORGSTORE_SCHEMES) + r"):))" r"(?P.+)" + local_path_re = r"(?!(//|(?:ssh|file)://|(?:" + "|".join(BORGSTORE_SCHEMES) + r"):))" r"(?P.+)" # abs_path must start with a slash (or drive letter on Windows). abs_path_re = r"(?P[A-Za-z]:/.+)" if is_win32 else r"(?P/.+)" diff --git a/src/borg/legacy/remote.py b/src/borg/legacy/remote.py index f2ab2b008..3ee40a4c0 100644 --- a/src/borg/legacy/remote.py +++ b/src/borg/legacy/remote.py @@ -6,7 +6,6 @@ import os import queue import select import shlex -import socket import sys import textwrap import time @@ -23,7 +22,6 @@ from ..helpers import replace_placeholders from ..helpers import sysinfo from ..helpers import format_file_size from ..helpers import prepare_subprocess_env, ignore_sigint -from ..helpers import get_socket_filename from ..fslocking import LockTimeout, NotLocked, NotMyLock, LockFailed from ..logger import create_logger, borg_serve_log_queue from ..helpers import msgpack @@ -255,7 +253,7 @@ class LegacyRemoteRepository: self.upload_buffer_size_limit = args.upload_buffer * 1024 * 1024 if args and args.upload_buffer else 0 self.unpacker = get_limited_unpacker("client") self.server_version = None # we update this after server sends its version - self.p = self.sock = None + self.p = None self._args = args if self.location.proto == "ssh": testing = location.host == "__testsuite__" @@ -281,25 +279,6 @@ class LegacyRemoteRepository: self.stderr_fd = self.p.stderr.fileno() self.r_fds = [self.stdout_fd, self.stderr_fd] self.x_fds = [self.stdin_fd, self.stdout_fd, self.stderr_fd] - elif self.location.proto == "socket": - if args.use_socket is False or args.use_socket is True: # nothing or --socket - socket_path = get_socket_filename() - else: # --socket=/some/path - socket_path = args.use_socket - self.sock = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_STREAM) - try: - self.sock.connect(socket_path) # note: socket_path length is rather limited. - except FileNotFoundError: - self.sock = None - raise Error(f"The socket file {socket_path} does not exist.") - except ConnectionRefusedError: - self.sock = None - raise Error(f"There is no borg serve running for the socket file {socket_path}.") - self.stdin_fd = self.sock.makefile("wb").fileno() - self.stdout_fd = self.sock.makefile("rb").fileno() - self.stderr_fd = None - self.r_fds = [self.stdout_fd] - self.x_fds = [self.stdin_fd, self.stdout_fd] else: raise Error(f"Unsupported protocol {location.proto}") @@ -339,7 +318,7 @@ class LegacyRemoteRepository: def __del__(self): if len(self.responses): logging.debug("still %d cached responses left in LegacyRemoteRepository" % (len(self.responses),)) - if self.p or self.sock: + if self.p: self.close() assert False, "cleanup happened in LegacyRemoteRepository.__del__" @@ -661,21 +640,12 @@ class LegacyRemoteRepository: """actual remoting is done via self.call in the @api decorator""" def close(self): - if self.p or self.sock: - self.call("close", {}, wait=True) if self.p: + self.call("close", {}, wait=True) self.p.stdin.close() self.p.stdout.close() self.p.wait() self.p = None - if self.sock: - try: - self.sock.shutdown(socket.SHUT_RDWR) - except OSError as e: - if e.errno != errno.ENOTCONN: - raise - self.sock.close() - self.sock = None def async_response(self, wait=True): for resp in self.call_many("async_responses", calls=[], wait=True, async_wait=wait): diff --git a/src/borg/logger.py b/src/borg/logger.py index 5e3020710..4c364a26e 100644 --- a/src/borg/logger.py +++ b/src/borg/logger.py @@ -44,7 +44,7 @@ Logging setup in Borg needs to work under various conditions: - Tests: potentially running in parallel via pytest-xdist, capturing Borg output into a given stream. - Logging might be short-lived (e.g., when invoking a single Borg command via the CLI) - or long-lived (e.g., borg serve --socket or when running the tests). + or long-lived (e.g., borg serve or when running the tests). - Logging is global and exists only once per process. """ diff --git a/src/borg/testsuite/helpers/parseformat_test.py b/src/borg/testsuite/helpers/parseformat_test.py index 9dca20576..6be33bc77 100644 --- a/src/borg/testsuite/helpers/parseformat_test.py +++ b/src/borg/testsuite/helpers/parseformat_test.py @@ -244,13 +244,6 @@ class TestLocationWithoutEnv: # credentials stripped from canonical_path assert loc.canonical_path() == "http://host:1234/" - def test_socket(self, monkeypatch): - monkeypatch.delenv("BORG_REPO", raising=False) - # socket:// is no longer supported and must be rejected as an invalid location. - url = "socket:///c:/repo/path" if is_win32 else "socket:///repo/path" - with pytest.raises(ValueError): - Location(url) - def test_file(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False) url = "file:///c:/repo/path" if is_win32 else "file:///repo/path" diff --git a/src/borg/testsuite/legacyrepository_test.py b/src/borg/testsuite/legacyrepository_test.py index eae031535..40be349b8 100644 --- a/src/borg/testsuite/legacyrepository_test.py +++ b/src/borg/testsuite/legacyrepository_test.py @@ -51,7 +51,7 @@ def reopen(repository, exclusive: bool | None = True, create=False): return LegacyRepository(repository.path, exclusive=exclusive, create=create) if isinstance(repository, LegacyRemoteRepository): - if repository.p is not None or repository.sock is not None: + if repository.p is not None: raise RuntimeError("Remote repo must be closed before a reopen. Cannot support nested repository contexts.") return LegacyRemoteRepository(repository.location, exclusive=exclusive, create=create)