diff --git a/docs/changes.rst b/docs/changes.rst index 8ec31bf3a..2526d5280 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -201,6 +201,12 @@ Fixes: Other changes: +- Location: simplify parsing/validation, #9678. + For sftp/http(s)/s3/b2/rclone repositories, borg now only detects the scheme and hands the raw + URL to borgstore, which parses and validates it (removing the duplicate parsing borg did before). + Note: for these repositories the canonical location string changed slightly, so on the first run + against an existing such repository borg may warn once that it "was previously located at ..." - + this is harmless and can be confirmed. - keyfile: name key files by sha256(keyfile_contents). Existing legacy-named keyfiles continue to work. - repokey: use same format as with external keyfile diff --git a/src/borg/archiver/create_cmd.py b/src/borg/archiver/create_cmd.py index 9bf4d4f68..cb48500a3 100644 --- a/src/borg/archiver/create_cmd.py +++ b/src/borg/archiver/create_cmd.py @@ -56,7 +56,7 @@ class CreateMixIn: except OSError: pass # Add local repository dir to inode_skip list - if not args.location.host: + if args.location.proto == "file": try: st = os.stat(args.location.path) skip_inodes.add((st.st_ino, st.st_dev)) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index 23b0b13be..43d57e26f 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -503,6 +503,15 @@ def parse_stringified_list(s): return [item for item in items if item != ""] +def _redact_url_credentials(url): + """Remove embedded credentials from a repository URL for safe display/identity use.""" + # netloc style: scheme://user[:pass]@host... -> scheme://host... + url = re.sub(r"(://)[^/@]+@", r"\1", url) + # s3/b2 style: (s3|b2):profile@... or (s3|b2):key:secret@... -> (s3|b2):... + url = re.sub(r"^((?:s3|b2):)[^@/]+@", r"\1", url) + return url + + class Location: """Object representing a repository location""" @@ -528,12 +537,14 @@ class Location: # :port (optional) optional_port_re = r"(?::(?P\d+))?" - # path may contain any chars. to avoid ambiguities with other regexes, - # it must not start with "//" nor with "scheme://" nor with "rclone:". - local_path_re = r""" - (?!(//|(ssh|socket|sftp|file)://|(rclone|s3|b2):)) - (?P.+) - """ + # locations that borgstore parses and validates itself - borg only detects the scheme and + # passes the raw URL through. covers both "scheme://..." and opaque "scheme:..." forms. + BORGSTORE_SCHEMES = ("sftp", "http", "https", "s3", "b2", "rclone") + + # path may contain any chars. to avoid ambiguities with other regexes, it must not start with + # "//", a "scheme://" or one of the borgstore "scheme:" specifiers (all of which are matched + # before local_re in _parse). the borgstore scheme list is sourced from BORGSTORE_SCHEMES. + local_path_re = r"(?!(//|(?:ssh|socket|file)://|(?:" + "|".join(BORGSTORE_SCHEMES) + r"):))" r"(?P.+)" # abs_path must start with a slash (or drive letter on Windows). abs_path_re = r"(?P[A-Za-z]:/.+)" if is_win32 else r"(?P/.+)" @@ -541,9 +552,14 @@ class Location: # path may or may not start with a slash. abs_or_rel_path_re = r"(?P.+)" - # regexes for misc. kinds of supported location specifiers: - ssh_or_sftp_re = re.compile( - r"(?P(ssh|sftp))://" + # We only parse out individual fields (user/host/port/path) for the protocols where borg + # itself needs them: legacy "ssh" (v1 repositories) and "rest" (for the ssh tunnel + FILE + # backend), plus local "file" paths. Everything else (see BORGSTORE_SCHEMES) is handed to + # borgstore as the raw URL and parsed/validated there - we only detect the scheme. + + # ssh:// is only used for legacy borg 1.x repositories nowadays. + ssh_re = re.compile( + r"(?Pssh)://" + optional_user_re + host_re + optional_port_re @@ -565,39 +581,8 @@ class Location: re.VERBOSE, ) - # BorgStore REST server - # (http|https)://user:pass@host:port/ - http_re = re.compile( - r"(?Phttp|https)://" - + r"((?P[^:@]+):(?P[^@]+)@)?" - + host_re - + optional_port_re - + r"(?P/)", - re.VERBOSE, - ) - - # (s3|b2):[(profile|(access_key_id:access_key_secret))@][scheme://hostname[:port]]/bucket/path - s3_re = re.compile( - r""" - (?P(s3|b2)): - (( - (?P[^@:]+) # profile (no colons allowed) - | - (?P[^:@]+):(?P[^@]+) # access key and secret - )@)? # optional authentication - ( - [^:/]+:// # scheme (often https) - (?P[^:/]+) - (:(?P\d+))? - )? # optional endpoint - / - (?P[^/]+)/ # bucket name - (?P.+) # path - """, - re.VERBOSE, - ) - - rclone_re = re.compile(r"(?Prclone):(?P(.*))", re.VERBOSE) + # scheme detector for the borgstore-handled locations listed in BORGSTORE_SCHEMES above. + scheme_re = re.compile(r"(?P[a-zA-Z][a-zA-Z0-9+.\-]*):") sl = "/" if is_win32 else "" file_re = re.compile(r"(?Pfile)://" + sl + abs_path_re, re.VERBOSE) @@ -636,7 +621,7 @@ class Location: raise ValueError('Invalid location format: "%s"' % self.processed) def _parse(self, text): - m = self.ssh_or_sftp_re.match(text) + m = self.ssh_re.match(text) if m: self.proto = m.group("proto") self.user = m.group("user") @@ -652,33 +637,16 @@ class Location: self.port = m.group("port") and int(m.group("port")) or None self.path = os.path.normpath(m.group("path")) return True - m = self.http_re.match(text) - if m: - self.proto = m.group("proto") - self.user = m.group("user") - self._pass = True if m.group("pass") else False - self._host = m.group("host") - self.port = m.group("port") and int(m.group("port")) or None - self.path = m.group("path") - return True - m = self.rclone_re.match(text) - if m: - self.proto = m.group("proto") - self.path = m.group("path") - return True m = self.file_re.match(text) if m: self.proto = m.group("proto") self.path = os.path.normpath(m.group("path")) return True - m = self.s3_re.match(text) - if m: - self.proto = m.group("s3type") - self.user = m.group("profile") if m.group("profile") else m.group("access_key_id") - self._pass = True if m.group("access_key_secret") else False - self._host = m.group("hostname") - self.port = m.group("port") and int(m.group("port")) or None - self.path = m.group("bucket") + "/" + m.group("path") + m = self.scheme_re.match(text) + if m and m.group("scheme") in self.BORGSTORE_SCHEMES: + # borgstore parses/validates these; we only detect the scheme and pass the raw + # URL (self.processed) through to it - no fields are extracted here. + self.proto = m.group("scheme") return True m = self.local_re.match(text) if m: @@ -711,9 +679,7 @@ class Location: def canonical_path(self): if self.proto == "file": return self.path - if self.proto == "rclone": - return f"{self.proto}:{self.path}" - if self.proto in ("rest", "sftp", "ssh", "s3", "b2", "http", "https"): + if self.proto in ("rest", "ssh"): return ( f"{self.proto}://" f"{(self.user + '@') if self.user else ''}" @@ -721,6 +687,10 @@ class Location: f"{(':' + str(self.port)) if self.port else ''}/" f"{self.path}" ) + if self.proto in self.BORGSTORE_SCHEMES: + # borgstore-handled locations: use the raw (processed) URL as given, but strip any + # embedded credentials so we never write secrets to the security state file or logs. + return _redact_url_credentials(self.processed) raise NotImplementedError(self.proto) def with_timestamp(self, timestamp): diff --git a/src/borg/testsuite/helpers/parseformat_test.py b/src/borg/testsuite/helpers/parseformat_test.py index 93bf0b028..9dca20576 100644 --- a/src/borg/testsuite/helpers/parseformat_test.py +++ b/src/borg/testsuite/helpers/parseformat_test.py @@ -198,51 +198,51 @@ class TestLocationWithoutEnv: == "Location(proto='rest', user=None, pass=None, host=None, port=None, path='/absolute/path')" ) + # For the protocols handled (parsed + validated) by borgstore itself, borg only detects + # the scheme and passes the raw URL through; it no longer extracts user/host/port/path. + def test_s3(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False) - assert ( - repr(Location("s3:/test/path")) - == "Location(proto='s3', user=None, pass=None, host=None, port=None, path='test/path')" + loc = Location("s3:/test/path") + assert loc.proto == "s3" + assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None) + assert loc.processed == "s3:/test/path" + # credentials in the URL are stripped from canonical_path (security state file / logs) + assert Location("s3:profile@http://172.28.52.116:9000/test/path").canonical_path() == ( + "s3:http://172.28.52.116:9000/test/path" ) - assert ( - repr(Location("s3:profile@http://172.28.52.116:9000/test/path")) - == "Location(proto='s3', user='profile', pass=None, host='172.28.52.116', port=9000, path='test/path')" # noqa: E501 + assert Location("s3:user:pass@http://172.28.52.116:9000/test/path").canonical_path() == ( + "s3:http://172.28.52.116:9000/test/path" ) - assert ( - repr(Location("s3:user:pass@http://172.28.52.116:9000/test/path")) - == "Location(proto='s3', user='user', pass='REDACTED', host='172.28.52.116', port=9000, path='test/path')" # noqa: E501 - ) - assert ( - repr(Location("b2:user:pass@https://s3.us-east-005.backblazeb2.com/test/path")) - == "Location(proto='b2', user='user', pass='REDACTED', host='s3.us-east-005.backblazeb2.com', port=None, path='test/path')" # noqa: E501 + assert Location("b2:user:pass@https://s3.us-east-005.backblazeb2.com/test/path").canonical_path() == ( + "b2:https://s3.us-east-005.backblazeb2.com/test/path" ) def test_rclone(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False) - assert ( - repr(Location("rclone:remote:path")) - == "Location(proto='rclone', user=None, pass=None, host=None, port=None, path='remote:path')" - ) + loc = Location("rclone:remote:path") + assert loc.proto == "rclone" + assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None) + assert loc.processed == "rclone:remote:path" + assert loc.canonical_path() == "rclone:remote:path" def test_sftp(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False) - # relative path - assert ( - repr(Location("sftp://user@host:1234/rel/path")) - == "Location(proto='sftp', user='user', pass=None, host='host', port=1234, path='rel/path')" - ) - # absolute path - assert ( - repr(Location("sftp://user@host:1234//abs/path")) - == "Location(proto='sftp', user='user', pass=None, host='host', port=1234, path='/abs/path')" - ) + loc = Location("sftp://user@host:1234/rel/path") + assert loc.proto == "sftp" + assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None) + assert loc.processed == "sftp://user@host:1234/rel/path" + # credentials stripped from canonical_path + assert loc.canonical_path() == "sftp://host:1234/rel/path" def test_http(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False) - assert ( - repr(Location("http://user:pass@host:1234/")) - == "Location(proto='http', user='user', pass='REDACTED', host='host', port=1234, path='/')" - ) + loc = Location("http://user:pass@host:1234/") + assert loc.proto == "http" + assert (loc.user, loc.host, loc.port, loc.path) == (None, None, None, None) + assert loc.processed == "http://user:pass@host:1234/" + # credentials stripped from canonical_path + assert loc.canonical_path() == "http://host:1234/" def test_socket(self, monkeypatch): monkeypatch.delenv("BORG_REPO", raising=False)