diff --git a/borg/helpers.py b/borg/helpers.py
index aa5bead0b..ecf138125 100644
--- a/borg/helpers.py
+++ b/borg/helpers.py
@@ -7,6 +7,8 @@ import pwd
 import re
 import sys
 import time
+import unicodedata
+
 from datetime import datetime, timezone, timedelta
 from fnmatch import translate
 from operator import attrgetter
@@ -220,6 +222,10 @@ def exclude_path(path, patterns):
 # unify the two cases, we add a path separator to the end of
 # the path before matching.
 
+##### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+##### For discussion only, don't merge this code!
+##### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
 class IncludePattern:
     """Literal files or directories listed on the command line
     for some operations (e.g. extract, but not create).
@@ -227,10 +233,22 @@ class IncludePattern:
     path match as well. A trailing slash makes no difference.
     """
     def __init__(self, pattern):
-        self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep
+        def match(path):
+            return (path+os.path.sep).startswith(self.pattern)
 
-    def match(self, path):
-        return (path+os.path.sep).startswith(self.pattern)
+        # HFS+ converts paths to a canonical form, so users shouldn't be
+        # required to enter an exact match
+        if sys.platform in ('darwin',):
+            # repository paths will be mostly in NFD, as the OSX exception list
+            # to NFD is small, so normalize to that form for best performance
+            pattern = unicodedata.normalize("NFD", pattern)
+            self.match = lambda p: match(unicodedata.normalize("NFD", p))
+        # Windows and Unix filesystems allow different forms, so users
+        # always have to enter an exact match
+        else:
+            self.match = match
+
+        self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep
 
     def __repr__(self):
         return '%s(%s)' % (type(self), self.pattern)
@@ -241,17 +259,30 @@ class ExcludePattern(IncludePattern):
     exclude the contents of a directory, but not the directory itself.
     """
     def __init__(self, pattern):
+        def match(path):
+            return self.regex.match(path+os.path.sep) is not None
+
         if pattern.endswith(os.path.sep):
             self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep+'*'+os.path.sep
         else:
             self.pattern = os.path.normpath(pattern)+os.path.sep+'*'
+
+        # HFS+ converts paths to a canonical form, so users shouldn't be
+        # required to enter an exact match
+        if sys.platform in ('darwin',):
+            # repository paths will be mostly in NFD, as the OSX exception list
+            # to NFD is small, so normalize to that form for best performance
+            self.pattern = unicodedata.normalize("NFD", self.pattern)
+            self.match = lambda p: match(unicodedata.normalize("NFD", p))
+        # Windows and Unix filesystems allow different forms, so users
+        # always have to enter an exact match
+        else:
+            self.match = match
+
         # fnmatch and re.match both cache compiled regular expressions.
         # Nevertheless, this is about 10 times faster.
         self.regex = re.compile(translate(self.pattern))
 
-    def match(self, path):
-        return self.regex.match(path+os.path.sep) is not None
-
     def __repr__(self):
         return '%s(%s)' % (type(self), self.pattern)
 
diff --git a/borg/testsuite/helpers.py b/borg/testsuite/helpers.py
index 95531df83..002033f57 100644
--- a/borg/testsuite/helpers.py
+++ b/borg/testsuite/helpers.py
@@ -3,9 +3,10 @@ from time import mktime, strptime
 from datetime import datetime, timezone, timedelta
 
 import pytest
+import sys
 import msgpack
 
-from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, ExcludePattern, make_path_safe, \
+from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, IncludePattern, ExcludePattern, make_path_safe, \
     prune_within, prune_split, \
     StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams
 from . import BaseTestCase
@@ -178,6 +179,99 @@ class PatternTestCase(BaseTestCase):
                           ['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg'])
 
 
+@pytest.mark.skipif(sys.platform.startswith('darwin'), reason='all but OS X test')
+class IncludePatternNonAsciiTestCase(BaseTestCase):
+    def testComposedUnicode(self):
+        pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
+        i = IncludePattern(pattern)
+
+        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert not i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+
+    def testDecomposedUnicode(self):
+        pattern = 'ba\N{COMBINING ACUTE ACCENT}'
+        i = IncludePattern(pattern)
+
+        assert not i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+
+    def testInvalidUnicode(self):
+        pattern = str(b'ba\x80', 'latin1')
+        i = IncludePattern(pattern)
+
+        assert not i.match("ba/foo")
+        assert i.match(str(b"ba\x80/foo", 'latin1'))
+
+
+@pytest.mark.skipif(sys.platform.startswith('darwin'), reason='all but OS X test')
+class ExcludePatternNonAsciiTestCase(BaseTestCase):
+    def testComposedUnicode(self):
+        pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
+        e = ExcludePattern(pattern)
+
+        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert not e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+
+    def testDecomposedUnicode(self):
+        pattern = 'ba\N{COMBINING ACUTE ACCENT}'
+        e = ExcludePattern(pattern)
+
+        assert not e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+
+    def testInvalidUnicode(self):
+        pattern = str(b'ba\x80', 'latin1')
+        e = ExcludePattern(pattern)
+
+        assert not e.match("ba/foo")
+        assert e.match(str(b"ba\x80/foo", 'latin1'))
+
+#@pytest.mark.skipif(not sys.platform.startswith('darwin'), reason='OS X only test')
+class OSXPatternNormalizationTestCase(BaseTestCase):
+    # monkey patch sys.platform to allow testing on non-OSX during development
+    # remove and uncomment OSX-only decorator before push
+    def setUp(self):
+        self.oldplatform = sys.platform
+        sys.platform = 'darwin'
+        pass
+
+    # monkey patch sys.platform to allow testing on non-OSX during development
+    # remove and uncomment OSX-only decorator before push
+    def tearDown(self):
+        sys.platform = self.oldplatform
+        pass
+
+    def testComposedUnicode(self):
+        pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+
+    def testDecomposedUnicode(self):
+        pattern = 'ba\N{COMBINING ACUTE ACCENT}'
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+
+    def testInvalidUnicode(self):
+        pattern = str(b'ba\x80', 'latin1')
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert not i.match("ba/foo")
+        assert i.match(str(b"ba\x80/foo", 'latin1'))
+        assert not e.match("ba/foo")
+        assert e.match(str(b"ba\x80/foo", 'latin1'))
+
+
 def test_compression_specs():
     with pytest.raises(ValueError):
         CompressionSpec('')