Mirror of https://github.com/borgbackup/borg.git

Merge pull request #1217 from ThomasWaldmann/in-file-checkpoints

in-file checkpoints

Commit 3bdfe2a564, 5 changed files with 106 additions and 52 deletions

docs/faq.rst (35 changes)

@@ -225,10 +225,7 @@ During a backup a special checkpoint archive named ``<archive-name>.checkpoint``
 is saved every checkpoint interval (the default value for this is 30
 minutes) containing all the data backed-up until that point.
 
-Checkpoints only happen between files (so they don't help for interruptions
-happening while a very large file is being processed).
-
-This checkpoint archive is a valid archive (all files in it are valid and complete),
+This checkpoint archive is a valid archive,
 but it is only a partial backup (not all files that you wanted to backup are
 contained in it). Having it in the repo until a successful, full backup is
 completed is useful because it references all the transmitted chunks up
@@ -249,27 +246,25 @@ Once your backup has finished successfully, you can delete all
 ``<archive-name>.checkpoint`` archives. If you run ``borg prune``, it will
 also care for deleting unneeded checkpoints.
 
+Note: the checkpointing mechanism creates hidden, partial files in an archive,
+so that checkpoints even work while a big file is being processed.
+They are named ``<filename>.borg_part_<N>`` and all operations usually ignore
+these files, but you can make them considered by giving the option
+``--consider-part-files``. You usually only need that option if you are
+really desperate (e.g. if you have no completed backup of that file and you'ld
+rather get a partial file extracted than nothing). You do **not** want to give
+that option under any normal circumstances.
+
 How can I backup huge file(s) over a unstable connection?
 ---------------------------------------------------------
 
-You can use this "split trick" as a workaround for the in-between-files-only
-checkpoints (see above), huge files and a instable connection to the repository:
+This is not a problem any more, see previous FAQ item.
 
-Split the huge file(s) into parts of manageable size (e.g. 100MB) and create
-a temporary archive of them. Borg will create checkpoints now more frequently
-than if you try to backup the files in their original form (e.g. 100GB).
+How can I restore huge file(s) over a unstable connection?
+----------------------------------------------------------
 
-After that, you can remove the parts again and backup the huge file(s) in
-their original form. This will now work a lot faster as a lot of content chunks
-are already in the repository.
-
-After you have successfully backed up the huge original file(s), you can remove
-the temporary archive you made from the parts.
-
-We realize that this is just a better-than-nothing workaround, see :issue:`1198`
-for a potential solution.
-
-Please note that this workaround only helps you for backup, not for restore.
+If you can not manage to extract the whole big file in one go, you can extract
+all the part files (see above) and manually concatenate them together.
 
 If it crashes with a UnicodeError, what can I do?
 -------------------------------------------------

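The FAQ change above says that extracted part files can be concatenated by hand to recover a big file from a checkpoint archive. A minimal sketch of that reassembly step in Python, assuming the ``<filename>.borg_part_<N>`` naming described in the note (the target file name is hypothetical):

    import glob
    import re
    import shutil

    def reassemble(target):
        """Concatenate target.borg_part_1 .. target.borg_part_N into target."""
        parts = glob.glob(glob.escape(target) + '.borg_part_*')
        # sort numerically by the trailing part number, not lexically
        parts.sort(key=lambda p: int(re.search(r'(\d+)$', p).group(1)))
        with open(target, 'wb') as out:
            for part in parts:
                with open(part, 'rb') as src:
                    shutil.copyfileobj(src, out)

    reassemble('bigfile.iso')  # hypothetical file, extracted with --consider-part-files
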
@@ -231,7 +231,8 @@ class Archive:
 
     def __init__(self, repository, key, manifest, name, cache=None, create=False,
                  checkpoint_interval=300, numeric_owner=False, progress=False,
-                 chunker_params=CHUNKER_PARAMS, start=None, end=None, compression=None, compression_files=None):
+                 chunker_params=CHUNKER_PARAMS, start=None, end=None, compression=None, compression_files=None,
+                 consider_part_files=False):
         self.cwd = os.getcwd()
         self.key = key
         self.repository = repository
@@ -250,6 +251,7 @@ class Archive:
         if end is None:
             end = datetime.utcnow()
         self.end = end
+        self.consider_part_files = consider_part_files
         self.pipeline = DownloadPipeline(self.repository, self.key)
         if create:
             self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
@@ -327,17 +329,21 @@ Number of files: {0.stats.nfiles}'''.format(
     def __repr__(self):
         return 'Archive(%r)' % self.name
 
+    def item_filter(self, item, filter=None):
+        if not self.consider_part_files and 'part' in item:
+            # this is a part(ial) file, we usually don't want to consider it.
+            return False
+        return filter(item) if filter else True
+
     def iter_items(self, filter=None, preload=False):
-        for item in self.pipeline.unpack_many(self.metadata[b'items'], filter=filter, preload=preload):
+        for item in self.pipeline.unpack_many(self.metadata[b'items'], preload=preload,
+                                              filter=lambda item: self.item_filter(item, filter)):
             yield item
 
-    def add_item(self, item):
-        if self.show_progress:
+    def add_item(self, item, show_progress=True):
+        if show_progress and self.show_progress:
             self.stats.show_progress(item=item, dt=0.2)
         self.items_buffer.add(item)
-        if self.checkpoint_interval and time.time() - self.last_checkpoint > self.checkpoint_interval:
-            self.write_checkpoint()
-            self.last_checkpoint = time.time()
 
     def write_checkpoint(self):
         self.save(self.checkpoint_name)
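The lambda passed to unpack_many means a caller-supplied filter is only consulted for items that already passed the part-file check. A standalone sketch of that composition, using plain dicts instead of borg's Item objects (illustrative only, not the borg API):

    def item_filter(item, filter=None, consider_part_files=False):
        # plain-dict stand-in for Archive.item_filter: hide part items unless requested
        if not consider_part_files and 'part' in item:
            return False
        return filter(item) if filter else True

    items = [{'path': 'big.img'}, {'path': 'big.img.borg_part_1', 'part': 1}]
    print([i['path'] for i in items if item_filter(i)])
    # -> ['big.img']
    print([i['path'] for i in items if item_filter(i, consider_part_files=True)])
    # -> ['big.img', 'big.img.borg_part_1']
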
@@ -651,17 +657,24 @@ Number of files: {0.stats.nfiles}'''.format(
             logger.warning('forced deletion succeeded, but the deleted archive was corrupted.')
             logger.warning('borg check --repair is required to free all space.')
 
-    def stat_attrs(self, st, path):
+    def stat_simple_attrs(self, st):
         attrs = dict(
             mode=st.st_mode,
-            uid=st.st_uid, user=uid2user(st.st_uid),
-            gid=st.st_gid, group=gid2group(st.st_gid),
+            uid=st.st_uid,
+            gid=st.st_gid,
             atime=st.st_atime_ns,
             ctime=st.st_ctime_ns,
             mtime=st.st_mtime_ns,
         )
         if self.numeric_owner:
             attrs['user'] = attrs['group'] = None
+        else:
+            attrs['user'] = uid2user(st.st_uid)
+            attrs['group'] = gid2group(st.st_gid)
+        return attrs
+
+    def stat_ext_attrs(self, st, path):
+        attrs = {}
         with backup_io():
             xattrs = xattr.get_all(path, follow_symlinks=False)
             bsdflags = get_flags(path, st)
@@ -672,6 +685,11 @@ Number of files: {0.stats.nfiles}'''.format(
             attrs['bsdflags'] = bsdflags
         return attrs
 
+    def stat_attrs(self, st, path):
+        attrs = self.stat_simple_attrs(st)
+        attrs.update(self.stat_ext_attrs(st, path))
+        return attrs
+
     def process_dir(self, path, st):
         item = Item(path=make_path_safe(path))
         item.update(self.stat_attrs(st, path))
@@ -700,22 +718,56 @@ Number of files: {0.stats.nfiles}'''.format(
         self.add_item(item)
         return 's'  # symlink
 
+    def chunk_file(self, item, cache, stats, fd, fh=-1, **chunk_kw):
+        def write_part(item, from_chunk, number):
+            item = Item(internal_dict=item.as_dict())
+            length = len(item.chunks)
+            # the item should only have the *additional* chunks we processed after the last partial item:
+            item.chunks = item.chunks[from_chunk:]
+            item.path += '.borg_part_%d' % number
+            item.part = number
+            number += 1
+            self.add_item(item, show_progress=False)
+            self.write_checkpoint()
+            return length, number
+
+        item.chunks = []
+        from_chunk = 0
+        part_number = 1
+        for data in backup_io_iter(self.chunker.chunkify(fd, fh)):
+            item.chunks.append(cache.add_chunk(self.key.id_hash(data), Chunk(data, **chunk_kw), stats))
+            if self.show_progress:
+                self.stats.show_progress(item=item, dt=0.2)
+            if self.checkpoint_interval and time.time() - self.last_checkpoint > self.checkpoint_interval:
+                from_chunk, part_number = write_part(item, from_chunk, part_number)
+                self.last_checkpoint = time.time()
+        else:
+            if part_number > 1:
+                if item.chunks[from_chunk:]:
+                    # if we already have created a part item inside this file, we want to put the final
+                    # chunks (if any) into a part item also (so all parts can be concatenated to get
+                    # the complete file):
+                    from_chunk, part_number = write_part(item, from_chunk, part_number)
+                    self.last_checkpoint = time.time()
+
+                # if we created part files, we have referenced all chunks from the part files,
+                # but we also will reference the same chunks also from the final, complete file:
+                for chunk in item.chunks:
+                    cache.chunk_incref(chunk.id, stats)
+
     def process_stdin(self, path, cache):
         uid, gid = 0, 0
-        fd = sys.stdin.buffer  # binary
-        chunks = []
-        for data in backup_io_iter(self.chunker.chunkify(fd)):
-            chunks.append(cache.add_chunk(self.key.id_hash(data), Chunk(data), self.stats))
-        self.stats.nfiles += 1
         t = int(time.time()) * 1000000000
         item = Item(
             path=path,
-            chunks=chunks,
             mode=0o100660,  # regular file, ug=rw
             uid=uid, user=uid2user(uid),
             gid=gid, group=gid2group(gid),
             mtime=t, atime=t, ctime=t,
         )
+        fd = sys.stdin.buffer  # binary
+        self.chunk_file(item, cache, self.stats, fd)
+        self.stats.nfiles += 1
         self.add_item(item)
         return 'i'  # stdin
 
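write_part cuts one file's chunk list into consecutive slices: each checkpoint emits only the chunks added since the previous part, and any trailing chunks become a final part, so the parts concatenate back to the complete file. A simplified model of that bookkeeping with plain lists instead of borg's Item/cache objects (a chunk count stands in for the checkpoint_interval timer):

    def split_into_parts(chunks, checkpoint_every):
        parts = []
        from_chunk = 0
        part_number = 1
        for i in range(1, len(chunks) + 1):
            # stand-in for: checkpoint_interval seconds elapsed since last_checkpoint
            if i % checkpoint_every == 0:
                parts.append(chunks[from_chunk:i])   # only the chunks added since the last part
                from_chunk = i
                part_number += 1
        if part_number > 1 and chunks[from_chunk:]:
            parts.append(chunks[from_chunk:])        # trailing chunks become the final part
        return parts

    chunks = list(range(10))
    parts = split_into_parts(chunks, checkpoint_every=4)
    assert [c for part in parts for c in part] == chunks  # parts concatenate to the whole file
    print(parts)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
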
@@ -760,26 +812,22 @@ Number of files: {0.stats.nfiles}'''.format(
             path=safe_path,
             hardlink_master=st.st_nlink > 1,  # item is a hard link and has the chunks
         )
+        item.update(self.stat_simple_attrs(st))
         # Only chunkify the file if needed
-        if chunks is None:
+        if chunks is not None:
+            item.chunks = chunks
+        else:
             compress = self.compression_decider1.decide(path)
             logger.debug('%s -> compression %s', path, compress['name'])
             with backup_io():
                 fh = Archive._open_rb(path)
             with os.fdopen(fh, 'rb') as fd:
-                chunks = []
-                for data in backup_io_iter(self.chunker.chunkify(fd, fh)):
-                    chunks.append(cache.add_chunk(self.key.id_hash(data),
-                                                  Chunk(data, compress=compress),
-                                                  self.stats))
-                    if self.show_progress:
-                        self.stats.show_progress(item=item, dt=0.2)
+                self.chunk_file(item, cache, self.stats, fd, fh, compress=compress)
             if not is_special_file:
                 # we must not memorize special files, because the contents of e.g. a
                 # block or char device will change without its mtime/size/inode changing.
-                cache.memorize_file(path_hash, st, [c.id for c in chunks])
+                cache.memorize_file(path_hash, st, [c.id for c in item.chunks])
             status = status or 'M'  # regular file, modified (if not 'A' already)
-        item.chunks = chunks
         item.update(self.stat_attrs(st, path))
         if is_special_file:
             # we processed a special file like a regular file. reflect that in mode,

@@ -100,7 +100,8 @@ def with_archive(method):
     @functools.wraps(method)
     def wrapper(self, args, repository, key, manifest, **kwargs):
         archive = Archive(repository, key, manifest, args.location.archive,
-                          numeric_owner=getattr(args, 'numeric_owner', False), cache=kwargs.get('cache'))
+                          numeric_owner=getattr(args, 'numeric_owner', False), cache=kwargs.get('cache'),
+                          consider_part_files=args.consider_part_files)
         return method(self, args, repository=repository, manifest=manifest, key=key, archive=archive, **kwargs)
     return wrapper
 
@@ -668,7 +669,8 @@ class Archiver:
             print_output(line)
 
         archive1 = archive
-        archive2 = Archive(repository, key, manifest, args.archive2)
+        archive2 = Archive(repository, key, manifest, args.archive2,
+                           consider_part_files=args.consider_part_files)
 
         can_compare_chunk_ids = archive1.metadata.get(b'chunker_params', False) == archive2.metadata.get(
             b'chunker_params', True) or args.same_chunker_params
@@ -753,7 +755,8 @@ class Archiver:
 
         with cache_if_remote(repository) as cached_repo:
             if args.location.archive:
-                archive = Archive(repository, key, manifest, args.location.archive)
+                archive = Archive(repository, key, manifest, args.location.archive,
+                                  consider_part_files=args.consider_part_files)
             else:
                 archive = None
             operations = FuseOperations(key, repository, manifest, archive, cached_repo)
@@ -779,7 +782,8 @@ class Archiver:
         if args.location.archive:
             matcher, _ = self.build_matcher(args.excludes, args.paths)
             with Cache(repository, key, manifest, lock_wait=self.lock_wait) as cache:
-                archive = Archive(repository, key, manifest, args.location.archive, cache=cache)
+                archive = Archive(repository, key, manifest, args.location.archive, cache=cache,
+                                  consider_part_files=args.consider_part_files)
 
                 if args.format:
                     format = args.format
@@ -981,7 +985,8 @@ class Archiver:
     @with_repository()
     def do_debug_dump_archive_items(self, args, repository, manifest, key):
         """dump (decrypted, decompressed) archive items metadata (not: data)"""
-        archive = Archive(repository, key, manifest, args.location.archive)
+        archive = Archive(repository, key, manifest, args.location.archive,
+                          consider_part_files=args.consider_part_files)
         for i, item_id in enumerate(archive.metadata[b'items']):
             _, data = key.decrypt(item_id, repository.get(item_id))
             filename = '%06d_%s.items' % (i, bin_to_hex(item_id))
@@ -1232,6 +1237,9 @@ class Archiver:
                                   help='set umask to M (local and remote, default: %(default)04o)')
         common_group.add_argument('--remote-path', dest='remote_path', metavar='PATH',
                                   help='set remote path to executable (default: "borg")')
+        common_group.add_argument('--consider-part-files', dest='consider_part_files',
+                                  action='store_true', default=False,
+                                  help='treat part files like normal files (e.g. to list/extract them)')
 
         parser = argparse.ArgumentParser(prog=prog, description='Borg - Deduplicated Backups')
         parser.add_argument('-V', '--version', action='version', version='%(prog)s ' + __version__,
@@ -1,7 +1,8 @@
 # this set must be kept complete, otherwise the RobustUnpacker might malfunction:
 ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master',
                        'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime',
-                       'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended', ])
+                       'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended',
+                       'part'])
 
 # this is the set of keys that are always present in items:
 REQUIRED_ITEM_KEYS = frozenset(['path', 'mtime', ])
@@ -155,6 +155,8 @@ class Item(PropDict):
     deleted = PropDict._make_property('deleted', bool)
     nlink = PropDict._make_property('nlink', int)
 
+    part = PropDict._make_property('part', int)
+
 
 class EncryptedKey(PropDict):
     """