create: add the slashdot hack, fixes #4685

This commit is contained in:
Thomas Waldmann 2024-01-23 18:06:55 +01:00
parent e744e04821
commit 5b96d5acc3
No known key found for this signature in database
GPG key ID: 243ACFA951F78E01
5 changed files with 125 additions and 37 deletions

View file

@ -10,6 +10,9 @@ Examples
# same, but list all files as we process them
$ borg create --list /path/to/repo::my-documents ~/Documents
# Backup /mnt/disk/docs, but strip path prefix using the slashdot hack
$ borg create /path/to/repo::docs /mnt/disk/./docs
# Backup ~/Documents and ~/src but exclude pyc files
$ borg create /path/to/repo::my-files \
~/Documents \

View file

@ -1299,7 +1299,16 @@ class FilesystemObjectProcessors:
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, sparse=sparse)
@contextmanager
def create_helper(self, path, st, status=None, hardlinkable=True):
def create_helper(self, path, st, status=None, hardlinkable=True, strip_prefix=None):
if strip_prefix is not None:
assert not path.endswith(os.sep)
if strip_prefix.startswith(path + os.sep):
# still on a directory level that shall be stripped - do not create an item for this!
yield None, 'x', False, False
return
# adjust path, remove stripped directory levels
path = path.removeprefix(strip_prefix)
safe_path = make_path_safe(path)
item = Item(path=safe_path)
hardlink_master = False
@ -1318,13 +1327,16 @@ class FilesystemObjectProcessors:
if hardlink_master:
self.hard_links[(st.st_ino, st.st_dev)] = safe_path
def process_dir_with_fd(self, *, path, fd, st):
with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked, hardlink_master):
item.update(self.metadata_collector.stat_attrs(st, path, fd=fd))
def process_dir_with_fd(self, *, path, fd, st, strip_prefix):
with self.create_helper(path, st, 'd', hardlinkable=False, strip_prefix=strip_prefix) as (item, status, hardlinked, hardlink_master):
if item is not None:
item.update(self.metadata_collector.stat_attrs(st, path, fd=fd))
return status
def process_dir(self, *, path, parent_fd, name, st):
with self.create_helper(path, st, 'd', hardlinkable=False) as (item, status, hardlinked, hardlink_master):
def process_dir(self, *, path, parent_fd, name, st, strip_prefix):
with self.create_helper(path, st, 'd', hardlinkable=False, strip_prefix=strip_prefix) as (item, status, hardlinked, hardlink_master):
if item is None:
return status
with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags_dir,
noatime=True, op='dir_open') as fd:
# fd is None for directories on windows, in that case a race condition check is not possible.
@ -1334,8 +1346,10 @@ class FilesystemObjectProcessors:
item.update(self.metadata_collector.stat_attrs(st, path, fd=fd))
return status
def process_fifo(self, *, path, parent_fd, name, st):
with self.create_helper(path, st, 'f') as (item, status, hardlinked, hardlink_master): # fifo
def process_fifo(self, *, path, parent_fd, name, st, strip_prefix):
with self.create_helper(path, st, 'f', strip_prefix=strip_prefix) as (item, status, hardlinked, hardlink_master): # fifo
if item is None:
return status
with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags_normal, noatime=True) as fd:
with backup_io('fstat'):
st = stat_update_check(st, os.fstat(fd))
@ -1344,9 +1358,11 @@ class FilesystemObjectProcessors:
item.update(self.metadata_collector.stat_attrs(st, path, fd=fd))
return status
def process_dev(self, *, path, parent_fd, name, st, dev_type):
with self.create_helper(path, st, dev_type) as (item, status, hardlinked, hardlink_master): # char/block device
def process_dev(self, *, path, parent_fd, name, st, dev_type, strip_prefix):
with self.create_helper(path, st, dev_type, strip_prefix=strip_prefix) as (item, status, hardlinked, hardlink_master): # char/block device
# looks like we can not work fd-based here without causing issues when trying to open/close the device
if item is None:
return status
with backup_io('stat'):
st = stat_update_check(st, os_stat(path=path, parent_fd=parent_fd, name=name, follow_symlinks=False))
item.rdev = st.st_rdev
@ -1355,11 +1371,13 @@ class FilesystemObjectProcessors:
item.update(self.metadata_collector.stat_attrs(st, path))
return status
def process_symlink(self, *, path, parent_fd, name, st):
def process_symlink(self, *, path, parent_fd, name, st, strip_prefix):
# note: using hardlinkable=False because we can not support hardlinked symlinks,
# due to the dual-use of item.source, see issue #2343:
# hardlinked symlinks will be archived [and extracted] as non-hardlinked symlinks.
with self.create_helper(path, st, 's', hardlinkable=False) as (item, status, hardlinked, hardlink_master):
with self.create_helper(path, st, 's', hardlinkable=False, strip_prefix=strip_prefix) as (item, status, hardlinked, hardlink_master):
if item is None:
return status
fname = name if name is not None and parent_fd is not None else path
with backup_io('readlink'):
source = os.readlink(fname, dir_fd=parent_fd)
@ -1392,8 +1410,10 @@ class FilesystemObjectProcessors:
self.add_item(item, stats=self.stats)
return status
def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal):
with self.create_helper(path, st, None) as (item, status, hardlinked, hardlink_master): # no status yet
def process_file(self, *, path, parent_fd, name, st, cache, flags=flags_normal, strip_prefix):
with self.create_helper(path, st, None, strip_prefix=strip_prefix) as (item, status, hardlinked, hardlink_master): # no status yet
if item is None:
return status
with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags, noatime=True) as fd:
with backup_io('fstat'):
st = stat_update_check(st, os.fstat(fd))

View file

@ -55,7 +55,7 @@ try:
from .helpers import safe_encode, remove_surrogates, bin_to_hex, hex_to_bin, prepare_dump_dict, eval_escapes
from .helpers import interval, prune_within, prune_split, PRUNING_PATTERNS
from .helpers import timestamp, utcnow
from .helpers import get_cache_dir, os_stat
from .helpers import get_cache_dir, os_stat, get_strip_prefix
from .helpers import Manifest, AI_HUMAN_SORT_KEYS
from .helpers import hardlinkable
from .helpers import StableDict
@ -565,12 +565,14 @@ class Archiver:
pipe_bin = sys.stdin.buffer
pipe = TextIOWrapper(pipe_bin, errors='surrogateescape')
for path in iter_separated(pipe, paths_sep):
strip_prefix = get_strip_prefix(path)
path = os.path.normpath(path)
try:
with backup_io('stat'):
st = os_stat(path=path, parent_fd=None, name=None, follow_symlinks=False)
status = self._process_any(path=path, parent_fd=None, name=None, st=st, fso=fso,
cache=cache, read_special=args.read_special, dry_run=dry_run)
cache=cache, read_special=args.read_special, dry_run=dry_run,
strip_prefix=strip_prefix)
except BackupError as e:
self.print_warning_instance(BackupWarning(path, e))
status = 'E'
@ -598,6 +600,8 @@ class Archiver:
status = '-'
self.print_file_status(status, path)
continue
strip_prefix = get_strip_prefix(path)
path = os.path.normpath(path)
try:
with backup_io('stat'):
@ -607,7 +611,8 @@ class Archiver:
fso=fso, cache=cache, matcher=matcher,
exclude_caches=args.exclude_caches, exclude_if_present=args.exclude_if_present,
keep_exclude_tags=args.keep_exclude_tags, skip_inodes=skip_inodes,
restrict_dev=restrict_dev, read_special=args.read_special, dry_run=dry_run)
restrict_dev=restrict_dev, read_special=args.read_special, dry_run=dry_run,
strip_prefix=strip_prefix)
# if we get back here, we've finished recursing into <path>,
# we do not ever want to get back in there (even if path is given twice as recursion root)
skip_inodes.add((st.st_ino, st.st_dev))
@ -674,7 +679,7 @@ class Archiver:
else:
create_inner(None, None, None)
def _process_any(self, *, path, parent_fd, name, st, fso, cache, read_special, dry_run):
def _process_any(self, *, path, parent_fd, name, st, fso, cache, read_special, dry_run, strip_prefix):
"""
Call the right method on the given FilesystemObjectProcessor.
"""
@ -682,12 +687,12 @@ class Archiver:
if dry_run:
return '-'
elif stat.S_ISREG(st.st_mode):
return fso.process_file(path=path, parent_fd=parent_fd, name=name, st=st, cache=cache)
return fso.process_file(path=path, parent_fd=parent_fd, name=name, st=st, cache=cache, strip_prefix=strip_prefix)
elif stat.S_ISDIR(st.st_mode):
return fso.process_dir(path=path, parent_fd=parent_fd, name=name, st=st)
return fso.process_dir(path=path, parent_fd=parent_fd, name=name, st=st, strip_prefix=strip_prefix)
elif stat.S_ISLNK(st.st_mode):
if not read_special:
return fso.process_symlink(path=path, parent_fd=parent_fd, name=name, st=st)
return fso.process_symlink(path=path, parent_fd=parent_fd, name=name, st=st, strip_prefix=strip_prefix)
else:
try:
st_target = os_stat(path=path, parent_fd=parent_fd, name=name, follow_symlinks=True)
@ -697,27 +702,27 @@ class Archiver:
special = is_special(st_target.st_mode)
if special:
return fso.process_file(path=path, parent_fd=parent_fd, name=name, st=st_target,
cache=cache, flags=flags_special_follow)
cache=cache, flags=flags_special_follow, strip_prefix=strip_prefix)
else:
return fso.process_symlink(path=path, parent_fd=parent_fd, name=name, st=st)
return fso.process_symlink(path=path, parent_fd=parent_fd, name=name, st=st, strip_prefix=strip_prefix)
elif stat.S_ISFIFO(st.st_mode):
if not read_special:
return fso.process_fifo(path=path, parent_fd=parent_fd, name=name, st=st)
return fso.process_fifo(path=path, parent_fd=parent_fd, name=name, st=st, strip_prefix=strip_prefix)
else:
return fso.process_file(path=path, parent_fd=parent_fd, name=name, st=st,
cache=cache, flags=flags_special)
cache=cache, flags=flags_special, strip_prefix=strip_prefix)
elif stat.S_ISCHR(st.st_mode):
if not read_special:
return fso.process_dev(path=path, parent_fd=parent_fd, name=name, st=st, dev_type='c')
return fso.process_dev(path=path, parent_fd=parent_fd, name=name, st=st, dev_type='c', strip_prefix=strip_prefix)
else:
return fso.process_file(path=path, parent_fd=parent_fd, name=name, st=st,
cache=cache, flags=flags_special)
cache=cache, flags=flags_special, strip_prefix=strip_prefix)
elif stat.S_ISBLK(st.st_mode):
if not read_special:
return fso.process_dev(path=path, parent_fd=parent_fd, name=name, st=st, dev_type='b')
return fso.process_dev(path=path, parent_fd=parent_fd, name=name, st=st, dev_type='b', strip_prefix=strip_prefix)
else:
return fso.process_file(path=path, parent_fd=parent_fd, name=name, st=st,
cache=cache, flags=flags_special)
cache=cache, flags=flags_special, strip_prefix=strip_prefix)
elif stat.S_ISSOCK(st.st_mode):
# Ignore unix sockets
return
@ -733,7 +738,7 @@ class Archiver:
def _rec_walk(self, *, path, parent_fd, name, fso, cache, matcher,
exclude_caches, exclude_if_present, keep_exclude_tags,
skip_inodes, restrict_dev, read_special, dry_run):
skip_inodes, restrict_dev, read_special, dry_run, strip_prefix):
"""
Process *path* (or, preferably, parent_fd/name) recursively according to the various parameters.
@ -781,7 +786,7 @@ class Archiver:
# directories cannot go in this branch because they can be excluded based on tag
# files they might contain
status = self._process_any(path=path, parent_fd=parent_fd, name=name, st=st, fso=fso, cache=cache,
read_special=read_special, dry_run=dry_run)
read_special=read_special, dry_run=dry_run, strip_prefix=strip_prefix)
else:
with OsOpen(path=path, parent_fd=parent_fd, name=name, flags=flags_dir,
noatime=True, op='dir_open') as child_fd:
@ -797,19 +802,19 @@ class Archiver:
if not recurse_excluded_dir:
if keep_exclude_tags:
if not dry_run:
fso.process_dir_with_fd(path=path, fd=child_fd, st=st)
fso.process_dir_with_fd(path=path, fd=child_fd, st=st, strip_prefix=strip_prefix)
for tag_name in tag_names:
tag_path = os.path.join(path, tag_name)
self._rec_walk(
path=tag_path, parent_fd=child_fd, name=tag_name, fso=fso, cache=cache,
matcher=matcher, exclude_caches=exclude_caches, exclude_if_present=exclude_if_present,
keep_exclude_tags=keep_exclude_tags, skip_inodes=skip_inodes,
restrict_dev=restrict_dev, read_special=read_special, dry_run=dry_run)
restrict_dev=restrict_dev, read_special=read_special, dry_run=dry_run, strip_prefix=strip_prefix)
self.print_file_status('x', path)
return
if not recurse_excluded_dir:
if not dry_run:
status = fso.process_dir_with_fd(path=path, fd=child_fd, st=st)
status = fso.process_dir_with_fd(path=path, fd=child_fd, st=st, strip_prefix=strip_prefix)
else:
status = '-'
if recurse:
@ -821,7 +826,7 @@ class Archiver:
path=normpath, parent_fd=child_fd, name=dirent.name, fso=fso, cache=cache, matcher=matcher,
exclude_caches=exclude_caches, exclude_if_present=exclude_if_present,
keep_exclude_tags=keep_exclude_tags, skip_inodes=skip_inodes, restrict_dev=restrict_dev,
read_special=read_special, dry_run=dry_run)
read_special=read_special, dry_run=dry_run, strip_prefix=strip_prefix)
except BackupError as e:
self.print_warning_instance(BackupWarning(path, e))
status = 'E'
@ -3391,6 +3396,11 @@ class Archiver:
that means if relative paths are desired, the command has to be run from the correct
directory.
The slashdot hack in paths (recursion roots) is triggered by using ``/./``:
``/this/gets/stripped/./this/gets/archived`` means to process that fs object, but
strip the prefix on the left side of ``./`` from the archived items (in this case,
``this/gets/archived`` will be the path in the archived item).
When giving '-' as path, borg will read data from standard input and create a
file 'stdin' in the created archive from that data. In some cases it's more
appropriate to use --content-from-command, however. See section *Reading from
@ -3530,8 +3540,8 @@ class Archiver:
- 'x' = excluded, item was *not* backed up
- '?' = missing status code (if you see this, please file a bug report!)
Reading from stdin
++++++++++++++++++
Reading backup data from stdin
++++++++++++++++++++++++++++++
There are two methods to read from stdin. Either specify ``-`` as path and
pipe directly to borg::
@ -3562,6 +3572,21 @@ class Archiver:
By default, the content read from stdin is stored in a file called 'stdin'.
Use ``--stdin-name`` to change the name.
Feeding all file paths from externally
++++++++++++++++++++++++++++++++++++++
Usually, you give a starting path (recursion root) to borg and then borg
automatically recurses, finds and backs up all fs objects contained in
there (optionally considering include/exclude rules).
If you need more control and you want to give every single fs object path
to borg (maybe implementing your own recursion or your own rules), you can use
``--paths-from-stdin`` or ``--paths-from-command`` (with the latter, borg will
fail to create an archive should the command fail).
Borg supports paths with the slashdot hack to strip path prefixes here also.
So, be careful not to unintentionally trigger that.
""")
subparser = subparsers.add_parser('create', parents=[common_parser], add_help=False,

View file

@ -162,6 +162,21 @@ def make_path_safe(path):
return _safe_re.sub('', path) or '.'
def get_strip_prefix(path):
# similar to how rsync does it, we allow users to give paths like:
# /this/gets/stripped/./this/is/kept
# the whole path is what is used to read from the fs,
# the strip_prefix will be /this/gets/stripped/ and
# this/is/kept is the path being archived.
pos = path.find('/./') # detect slashdot hack
if pos > 0:
# found a prefix to strip! make sure it ends with one "/"!
return os.path.normpath(path[:pos]) + os.sep
else:
# no or empty prefix, nothing to strip!
return None
def hardlinkable(mode):
"""return True if we support hardlinked items of this type"""
return stat.S_ISREG(mode) or stat.S_ISBLK(mode) or stat.S_ISCHR(mode) or stat.S_ISFIFO(mode)

View file

@ -2172,6 +2172,31 @@ class ArchiverTestCase(ArchiverTestCaseBase):
output = self.cmd('list', archive)
assert 'input/link -> somewhere does not exist' in output
def test_create_dotslash_hack(self):
os.makedirs(os.path.join(self.input_path, 'first', 'secondA', 'thirdA'))
os.makedirs(os.path.join(self.input_path, 'first', 'secondB', 'thirdB'))
self.cmd('init', '--encryption=none', self.repository_location)
archive = self.repository_location + '::test'
self.cmd('create', archive, 'input/first/./') # hack!
output = self.cmd('list', archive)
# dir levels left of slashdot (= input, first) not in archive:
assert 'input' not in output
assert 'input/first' not in output
assert 'input/first/secondA' not in output
assert 'input/first/secondA/thirdA' not in output
assert 'input/first/secondB' not in output
assert 'input/first/secondB/thirdB' not in output
assert 'first' not in output
assert 'first/secondA' not in output
assert 'first/secondA/thirdA' not in output
assert 'first/secondB' not in output
assert 'first/secondB/thirdB' not in output
# dir levels right of slashdot are in archive:
assert 'secondA' in output
assert 'secondA/thirdA' in output
assert 'secondB' in output
assert 'secondB/thirdB' in output
# def test_cmdline_compatibility(self):
# self.create_regular_file('file1', size=1024 * 80)
# self.cmd('init', '--encryption=repokey', self.repository_location)