Merge pull request #1217 from ThomasWaldmann/in-file-checkpoints

in-file checkpoints
TW 2016-07-29 21:03:05 +02:00 committed by GitHub
commit 3bdfe2a564
5 changed files with 106 additions and 52 deletions

View file

@@ -225,10 +225,7 @@ During a backup a special checkpoint archive named ``<archive-name>.checkpoint``
is saved every checkpoint interval (the default value for this is 30
minutes), containing all the data backed up until that point.
Checkpoints only happen between files (so they don't help for interruptions
happening while a very large file is being processed).
This checkpoint archive is a valid archive (all files in it are valid and complete),
This checkpoint archive is a valid archive,
but it is only a partial backup (not all files that you wanted to backup are
contained in it). Having it in the repo until a successful, full backup is
completed is useful because it references all the transmitted chunks up
@@ -249,27 +246,25 @@ Once your backup has finished successfully, you can delete all
``<archive-name>.checkpoint`` archives. If you run ``borg prune``, it will
also take care of deleting unneeded checkpoints.
Note: the checkpointing mechanism creates hidden, partial files in an archive,
so that checkpoints even work while a big file is being processed.
They are named ``<filename>.borg_part_<N>`` and all operations usually ignore
these files, but you can have them considered by giving the
``--consider-part-files`` option. You usually only need that option if you are
really desperate (e.g. if you have no completed backup of that file and you'd
rather get a partial file extracted than nothing). You do **not** want to give
that option under any normal circumstances.
How can I back up huge file(s) over an unstable connection?
------------------------------------------------------------
You can use this "split trick" as a workaround for the in-between-files-only
checkpoints (see above), huge files and an unstable connection to the repository:
This is not a problem anymore; see the previous FAQ item.
Split the huge file(s) into parts of manageable size (e.g. 100MB) and create
a temporary archive of them. Borg will now create checkpoints more frequently
than if you tried to back up the files in their original form (e.g. 100GB).
How can I restore huge file(s) over an unstable connection?
------------------------------------------------------------
After that, you can remove the parts again and back up the huge file(s) in
their original form. This will now work a lot faster, as a lot of the content
chunks are already in the repository.
After you have successfully backed up the huge original file(s), you can remove
the temporary archive you made from the parts.
We realize that this is just a better-than-nothing workaround; see :issue:`1198`
for a potential solution.
Please note that this workaround only helps you for backup, not for restore.
If you cannot manage to extract the whole big file in one go, you can extract
all the part files (see above) and manually concatenate them in order.
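
As a rough, minimal sketch of that manual reassembly (assuming the parts were
extracted with ``--consider-part-files`` and that the original file was called
``bigfile``; both names are placeholders)::

    import glob
    import re
    import shutil

    # collect bigfile.borg_part_1, bigfile.borg_part_2, ... and sort them numerically,
    # because a plain lexical sort would put _10 before _2
    parts = sorted(glob.glob('bigfile.borg_part_*'),
                   key=lambda p: int(re.search(r'_(\d+)$', p).group(1)))
    with open('bigfile', 'wb') as out:
        for part in parts:
            with open(part, 'rb') as src:
                shutil.copyfileobj(src, out)  # append each part in numeric order
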
If it crashes with a UnicodeError, what can I do?
-------------------------------------------------

View file

@@ -231,7 +231,8 @@ class Archive:
def __init__(self, repository, key, manifest, name, cache=None, create=False,
checkpoint_interval=300, numeric_owner=False, progress=False,
chunker_params=CHUNKER_PARAMS, start=None, end=None, compression=None, compression_files=None):
chunker_params=CHUNKER_PARAMS, start=None, end=None, compression=None, compression_files=None,
consider_part_files=False):
self.cwd = os.getcwd()
self.key = key
self.repository = repository
@@ -250,6 +251,7 @@ class Archive:
if end is None:
end = datetime.utcnow()
self.end = end
self.consider_part_files = consider_part_files
self.pipeline = DownloadPipeline(self.repository, self.key)
if create:
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
@@ -327,17 +329,21 @@ Number of files: {0.stats.nfiles}'''.format(
def __repr__(self):
return 'Archive(%r)' % self.name
def item_filter(self, item, filter=None):
if not self.consider_part_files and 'part' in item:
# this is a part(ial) file, we usually don't want to consider it.
return False
return filter(item) if filter else True
def iter_items(self, filter=None, preload=False):
for item in self.pipeline.unpack_many(self.metadata[b'items'], filter=filter, preload=preload):
for item in self.pipeline.unpack_many(self.metadata[b'items'], preload=preload,
filter=lambda item: self.item_filter(item, filter)):
yield item
def add_item(self, item):
if self.show_progress:
def add_item(self, item, show_progress=True):
if show_progress and self.show_progress:
self.stats.show_progress(item=item, dt=0.2)
self.items_buffer.add(item)
if self.checkpoint_interval and time.time() - self.last_checkpoint > self.checkpoint_interval:
self.write_checkpoint()
self.last_checkpoint = time.time()
def write_checkpoint(self):
self.save(self.checkpoint_name)
@@ -651,17 +657,24 @@ Number of files: {0.stats.nfiles}'''.format(
logger.warning('forced deletion succeeded, but the deleted archive was corrupted.')
logger.warning('borg check --repair is required to free all space.')
def stat_attrs(self, st, path):
def stat_simple_attrs(self, st):
attrs = dict(
mode=st.st_mode,
uid=st.st_uid, user=uid2user(st.st_uid),
gid=st.st_gid, group=gid2group(st.st_gid),
uid=st.st_uid,
gid=st.st_gid,
atime=st.st_atime_ns,
ctime=st.st_ctime_ns,
mtime=st.st_mtime_ns,
)
if self.numeric_owner:
attrs['user'] = attrs['group'] = None
else:
attrs['user'] = uid2user(st.st_uid)
attrs['group'] = gid2group(st.st_gid)
return attrs
def stat_ext_attrs(self, st, path):
attrs = {}
with backup_io():
xattrs = xattr.get_all(path, follow_symlinks=False)
bsdflags = get_flags(path, st)
@@ -672,6 +685,11 @@ Number of files: {0.stats.nfiles}'''.format(
attrs['bsdflags'] = bsdflags
return attrs
def stat_attrs(self, st, path):
attrs = self.stat_simple_attrs(st)
attrs.update(self.stat_ext_attrs(st, path))
return attrs
def process_dir(self, path, st):
item = Item(path=make_path_safe(path))
item.update(self.stat_attrs(st, path))
@@ -700,22 +718,56 @@ Number of files: {0.stats.nfiles}'''.format(
self.add_item(item)
return 's' # symlink
def chunk_file(self, item, cache, stats, fd, fh=-1, **chunk_kw):
def write_part(item, from_chunk, number):
item = Item(internal_dict=item.as_dict())
length = len(item.chunks)
# the item should only have the *additional* chunks we processed after the last partial item:
item.chunks = item.chunks[from_chunk:]
item.path += '.borg_part_%d' % number
item.part = number
number += 1
self.add_item(item, show_progress=False)
self.write_checkpoint()
return length, number
item.chunks = []
from_chunk = 0
part_number = 1
for data in backup_io_iter(self.chunker.chunkify(fd, fh)):
item.chunks.append(cache.add_chunk(self.key.id_hash(data), Chunk(data, **chunk_kw), stats))
if self.show_progress:
self.stats.show_progress(item=item, dt=0.2)
if self.checkpoint_interval and time.time() - self.last_checkpoint > self.checkpoint_interval:
from_chunk, part_number = write_part(item, from_chunk, part_number)
self.last_checkpoint = time.time()
else:
if part_number > 1:
if item.chunks[from_chunk:]:
# if we already have created a part item inside this file, we want to put the final
# chunks (if any) into a part item also (so all parts can be concatenated to get
# the complete file):
from_chunk, part_number = write_part(item, from_chunk, part_number)
self.last_checkpoint = time.time()
# if we created part files, we have referenced all chunks from the part files,
# but we will also reference the same chunks from the final, complete file:
for chunk in item.chunks:
cache.chunk_incref(chunk.id, stats)
def process_stdin(self, path, cache):
uid, gid = 0, 0
fd = sys.stdin.buffer # binary
chunks = []
for data in backup_io_iter(self.chunker.chunkify(fd)):
chunks.append(cache.add_chunk(self.key.id_hash(data), Chunk(data), self.stats))
self.stats.nfiles += 1
t = int(time.time()) * 1000000000
item = Item(
path=path,
chunks=chunks,
mode=0o100660, # regular file, ug=rw
uid=uid, user=uid2user(uid),
gid=gid, group=gid2group(gid),
mtime=t, atime=t, ctime=t,
)
fd = sys.stdin.buffer # binary
self.chunk_file(item, cache, self.stats, fd)
self.stats.nfiles += 1
self.add_item(item)
return 'i' # stdin
@@ -760,26 +812,22 @@ Number of files: {0.stats.nfiles}'''.format(
path=safe_path,
hardlink_master=st.st_nlink > 1, # item is a hard link and has the chunks
)
item.update(self.stat_simple_attrs(st))
# Only chunkify the file if needed
if chunks is None:
if chunks is not None:
item.chunks = chunks
else:
compress = self.compression_decider1.decide(path)
logger.debug('%s -> compression %s', path, compress['name'])
with backup_io():
fh = Archive._open_rb(path)
with os.fdopen(fh, 'rb') as fd:
chunks = []
for data in backup_io_iter(self.chunker.chunkify(fd, fh)):
chunks.append(cache.add_chunk(self.key.id_hash(data),
Chunk(data, compress=compress),
self.stats))
if self.show_progress:
self.stats.show_progress(item=item, dt=0.2)
self.chunk_file(item, cache, self.stats, fd, fh, compress=compress)
if not is_special_file:
# we must not memorize special files, because the contents of e.g. a
# block or char device will change without its mtime/size/inode changing.
cache.memorize_file(path_hash, st, [c.id for c in chunks])
cache.memorize_file(path_hash, st, [c.id for c in item.chunks])
status = status or 'M' # regular file, modified (if not 'A' already)
item.chunks = chunks
item.update(self.stat_attrs(st, path))
if is_special_file:
# we processed a special file like a regular file. reflect that in mode,
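
To make the ``chunk_file`` bookkeeping above easier to follow, here is a tiny
self-contained sketch (toy code, not borg's: chunk ids are plain labels and the
checkpoint trigger is faked) of how a chunk stream gets split into
``.borg_part_<N>`` items plus one final, complete item::

    def split_into_parts(chunk_labels, checkpoint_after):
        chunks = []        # chunks of the complete file, grows as we read
        from_chunk = 0     # first chunk not yet covered by a part item
        part_number = 1
        parts = []
        for i, label in enumerate(chunk_labels):
            chunks.append(label)
            if i in checkpoint_after:  # "checkpoint interval elapsed" in the real code
                parts.append(('file.borg_part_%d' % part_number, chunks[from_chunk:]))
                from_chunk, part_number = len(chunks), part_number + 1
        if part_number > 1 and chunks[from_chunk:]:
            # trailing chunks after the last checkpoint also become a part,
            # so concatenating all parts yields the complete file
            parts.append(('file.borg_part_%d' % part_number, chunks[from_chunk:]))
        return parts, ('file', chunks)

    parts, final = split_into_parts(['c0', 'c1', 'c2', 'c3', 'c4'], checkpoint_after={1, 3})
    # parts -> [('file.borg_part_1', ['c0', 'c1']),
    #           ('file.borg_part_2', ['c2', 'c3']),
    #           ('file.borg_part_3', ['c4'])]
    # final -> ('file', ['c0', 'c1', 'c2', 'c3', 'c4'])

When parts were written, every chunk ends up referenced both by a part item and
by the final complete item, which is why ``chunk_file`` increfs each chunk once
more at the end.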

View file

@@ -100,7 +100,8 @@ def with_archive(method):
@functools.wraps(method)
def wrapper(self, args, repository, key, manifest, **kwargs):
archive = Archive(repository, key, manifest, args.location.archive,
numeric_owner=getattr(args, 'numeric_owner', False), cache=kwargs.get('cache'))
numeric_owner=getattr(args, 'numeric_owner', False), cache=kwargs.get('cache'),
consider_part_files=args.consider_part_files)
return method(self, args, repository=repository, manifest=manifest, key=key, archive=archive, **kwargs)
return wrapper
@@ -668,7 +669,8 @@ class Archiver:
print_output(line)
archive1 = archive
archive2 = Archive(repository, key, manifest, args.archive2)
archive2 = Archive(repository, key, manifest, args.archive2,
consider_part_files=args.consider_part_files)
can_compare_chunk_ids = archive1.metadata.get(b'chunker_params', False) == archive2.metadata.get(
b'chunker_params', True) or args.same_chunker_params
@@ -753,7 +755,8 @@ class Archiver:
with cache_if_remote(repository) as cached_repo:
if args.location.archive:
archive = Archive(repository, key, manifest, args.location.archive)
archive = Archive(repository, key, manifest, args.location.archive,
consider_part_files=args.consider_part_files)
else:
archive = None
operations = FuseOperations(key, repository, manifest, archive, cached_repo)
@@ -779,7 +782,8 @@ class Archiver:
if args.location.archive:
matcher, _ = self.build_matcher(args.excludes, args.paths)
with Cache(repository, key, manifest, lock_wait=self.lock_wait) as cache:
archive = Archive(repository, key, manifest, args.location.archive, cache=cache)
archive = Archive(repository, key, manifest, args.location.archive, cache=cache,
consider_part_files=args.consider_part_files)
if args.format:
format = args.format
@@ -981,7 +985,8 @@ class Archiver:
@with_repository()
def do_debug_dump_archive_items(self, args, repository, manifest, key):
"""dump (decrypted, decompressed) archive items metadata (not: data)"""
archive = Archive(repository, key, manifest, args.location.archive)
archive = Archive(repository, key, manifest, args.location.archive,
consider_part_files=args.consider_part_files)
for i, item_id in enumerate(archive.metadata[b'items']):
_, data = key.decrypt(item_id, repository.get(item_id))
filename = '%06d_%s.items' % (i, bin_to_hex(item_id))
@@ -1232,6 +1237,9 @@ class Archiver:
help='set umask to M (local and remote, default: %(default)04o)')
common_group.add_argument('--remote-path', dest='remote_path', metavar='PATH',
help='set remote path to executable (default: "borg")')
common_group.add_argument('--consider-part-files', dest='consider_part_files',
action='store_true', default=False,
help='treat part files like normal files (e.g. to list/extract them)')
parser = argparse.ArgumentParser(prog=prog, description='Borg - Deduplicated Backups')
parser.add_argument('-V', '--version', action='version', version='%(prog)s ' + __version__,

View file

@@ -1,7 +1,8 @@
# this set must be kept complete, otherwise the RobustUnpacker might malfunction:
ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master',
'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime',
'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended', ])
'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended',
'part'])
# this is the set of keys that are always present in items:
REQUIRED_ITEM_KEYS = frozenset(['path', 'mtime', ])
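
As a hedged illustration of what "kept complete" means here (the helper below is
hypothetical, not borg code): any key an item carries must be listed in
``ITEM_KEYS``, and the ``REQUIRED_ITEM_KEYS`` must always be present::

    def check_item_keys(item_dict):
        # hypothetical sanity check mirroring the contract stated above
        keys = set(item_dict)
        unknown = keys - ITEM_KEYS            # e.g. 'part' had to be added to ITEM_KEYS by this change
        missing = REQUIRED_ITEM_KEYS - keys   # 'path' and 'mtime' must always be there
        if unknown or missing:
            raise ValueError('unknown keys: %r, missing keys: %r'
                             % (sorted(unknown), sorted(missing)))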

View file

@@ -155,6 +155,8 @@ class Item(PropDict):
deleted = PropDict._make_property('deleted', bool)
nlink = PropDict._make_property('nlink', int)
part = PropDict._make_property('part', int)
class EncryptedKey(PropDict):
"""