From b885841c3963504905e6c0956f5efa0d3c809846 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Wed, 30 Nov 2016 12:43:28 +0100 Subject: [PATCH 1/4] make item native code This makes a surprisingly large difference. Test case: ~70000 empty files. (I.e. little data shoveling, lots of metadata shoveling). Before: 9.1 seconds +- 0.1 seconds. After: 8.4 seconds +- 0.1 seconds. That's a huge win for changing a few lines. I'd expect that this improves performance in almost all areas that touch the items (list, delete, prune). --- setup.py | 7 ++++++- src/borg/helpers.py | 4 +++- src/borg/{item.py => item.pyx} | 2 ++ 3 files changed, 11 insertions(+), 2 deletions(-) rename src/borg/{item.py => item.pyx} (99%) diff --git a/setup.py b/setup.py index e2998c8e0..5dfcbb306 100644 --- a/setup.py +++ b/setup.py @@ -50,6 +50,7 @@ compress_source = 'src/borg/compress.pyx' crypto_source = 'src/borg/crypto.pyx' chunker_source = 'src/borg/chunker.pyx' hashindex_source = 'src/borg/hashindex.pyx' +item_source = 'src/borg/item.pyx' platform_posix_source = 'src/borg/platform/posix.pyx' platform_linux_source = 'src/borg/platform/linux.pyx' platform_darwin_source = 'src/borg/platform/darwin.pyx' @@ -60,6 +61,7 @@ cython_sources = [ crypto_source, chunker_source, hashindex_source, + item_source, platform_posix_source, platform_linux_source, @@ -83,6 +85,7 @@ try: 'src/borg/crypto.c', 'src/borg/chunker.c', 'src/borg/_chunker.c', 'src/borg/hashindex.c', 'src/borg/_hashindex.c', + 'src/borg/item.c', 'src/borg/platform/posix.c', 'src/borg/platform/linux.c', 'src/borg/platform/freebsd.c', @@ -99,6 +102,7 @@ except ImportError: crypto_source = crypto_source.replace('.pyx', '.c') chunker_source = chunker_source.replace('.pyx', '.c') hashindex_source = hashindex_source.replace('.pyx', '.c') + item_source = item_source.replace('.pyx', '.c') platform_posix_source = platform_posix_source.replace('.pyx', '.c') platform_linux_source = platform_linux_source.replace('.pyx', '.c') 
platform_freebsd_source = platform_freebsd_source.replace('.pyx', '.c') @@ -358,7 +362,8 @@ if not on_rtd: Extension('borg.compress', [compress_source], libraries=['lz4'], include_dirs=include_dirs, library_dirs=library_dirs, define_macros=define_macros), Extension('borg.crypto', [crypto_source], libraries=crypto_libraries, include_dirs=include_dirs, library_dirs=library_dirs, define_macros=define_macros), Extension('borg.chunker', [chunker_source]), - Extension('borg.hashindex', [hashindex_source]) + Extension('borg.hashindex', [hashindex_source]), + Extension('borg.item', [item_source]), ] if sys.platform.startswith(('linux', 'freebsd', 'darwin')): ext_modules.append(Extension('borg.platform.posix', [platform_posix_source])) diff --git a/src/borg/helpers.py b/src/borg/helpers.py index a1624f1ad..0b2e016b5 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -86,7 +86,7 @@ class PlaceholderError(Error): def check_extension_modules(): - from . import platform, compress + from . 
import platform, compress, item if hashindex.API_VERSION != 4: raise ExtensionModuleError if chunker.API_VERSION != 2: @@ -97,6 +97,8 @@ def check_extension_modules(): raise ExtensionModuleError if platform.API_VERSION != platform.OS_API_VERSION != 5: raise ExtensionModuleError + if item.API_VERSION != 1: + raise ExtensionModuleError ArchiveInfo = namedtuple('ArchiveInfo', 'name id ts') diff --git a/src/borg/item.py b/src/borg/item.pyx similarity index 99% rename from src/borg/item.py rename to src/borg/item.pyx index e44e4367b..755f96be8 100644 --- a/src/borg/item.py +++ b/src/borg/item.pyx @@ -3,6 +3,8 @@ from .helpers import safe_encode, safe_decode from .helpers import bigint_to_int, int_to_bigint from .helpers import StableDict +API_VERSION = 1 + class PropDict: """ From b3707f717513af32190bb152575ca13a87f9c7ba Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Sat, 3 Dec 2016 00:12:48 +0100 Subject: [PATCH 2/4] Replace backup_io with a singleton This is some 15 times faster than @contextmanager, because no instance creation is involved and no generator has to be maintained. Overall difference is low, but still nice for a very simple change. 
--- src/borg/archive.py | 39 +++++++++++++++++++---------------- src/borg/testsuite/archive.py | 2 +- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 9a9e2ccec..8f97442e7 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -125,19 +125,22 @@ class BackupOSError(Exception): return str(self.os_error) -@contextmanager -def backup_io(): - """Context manager changing OSError to BackupOSError.""" - try: - yield - except OSError as os_error: - raise BackupOSError(os_error) from os_error +class BackupIO: + def __enter__(self): + pass + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type and issubclass(exc_type, OSError): + raise BackupOSError(exc_val) from exc_val + + +backup_io = BackupIO() def backup_io_iter(iterator): while True: try: - with backup_io(): + with backup_io: item = next(iterator) except StopIteration: return @@ -475,13 +478,13 @@ Number of files: {0.stats.nfiles}'''.format( pass mode = item.mode if stat.S_ISREG(mode): - with backup_io(): + with backup_io: if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) # Hard link? 
if 'source' in item: source = os.path.join(dest, *item.source.split(os.sep)[stripped_components:]) - with backup_io(): + with backup_io: if os.path.exists(path): os.unlink(path) if item.source not in hardlink_masters: @@ -490,24 +493,24 @@ Number of files: {0.stats.nfiles}'''.format( item.chunks, link_target = hardlink_masters[item.source] if link_target: # Hard link was extracted previously, just link - with backup_io(): + with backup_io: os.link(link_target, path) return # Extract chunks, since the item which had the chunks was not extracted - with backup_io(): + with backup_io: fd = open(path, 'wb') with fd: ids = [c.id for c in item.chunks] for _, data in self.pipeline.fetch_many(ids, is_preloaded=True): if pi: pi.show(increase=len(data), info=[remove_surrogates(item.path)]) - with backup_io(): + with backup_io: if sparse and self.zeros.startswith(data): # all-zero chunk: create a hole in a sparse file fd.seek(len(data), 1) else: fd.write(data) - with backup_io(): + with backup_io: pos = fd.tell() fd.truncate(pos) fd.flush() @@ -519,7 +522,7 @@ Number of files: {0.stats.nfiles}'''.format( # Update master entry with extracted file path, so that following hardlinks don't extract twice. hardlink_masters[item.get('source') or original_path] = (None, path) return - with backup_io(): + with backup_io: # No repository access beyond this point. 
if stat.S_ISDIR(mode): if not os.path.exists(path): @@ -705,7 +708,7 @@ Number of files: {0.stats.nfiles}'''.format( def stat_ext_attrs(self, st, path): attrs = {} - with backup_io(): + with backup_io: xattrs = xattr.get_all(path, follow_symlinks=False) bsdflags = get_flags(path, st) acl_get(path, attrs, st, self.numeric_owner) @@ -742,7 +745,7 @@ Number of files: {0.stats.nfiles}'''.format( return 'b' # block device def process_symlink(self, path, st): - with backup_io(): + with backup_io: source = os.readlink(path) item = Item(path=make_path_safe(path), source=source) item.update(self.stat_attrs(st, path)) @@ -854,7 +857,7 @@ Number of files: {0.stats.nfiles}'''.format( else: compress = self.compression_decider1.decide(path) self.file_compression_logger.debug('%s -> compression %s', path, compress['name']) - with backup_io(): + with backup_io: fh = Archive._open_rb(path) with os.fdopen(fh, 'rb') as fd: self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh)), compress=compress) diff --git a/src/borg/testsuite/archive.py b/src/borg/testsuite/archive.py index 49648ef47..3f315fe46 100644 --- a/src/borg/testsuite/archive.py +++ b/src/borg/testsuite/archive.py @@ -220,7 +220,7 @@ def test_key_length_msgpacked_items(): def test_backup_io(): with pytest.raises(BackupOSError): - with backup_io(): + with backup_io: raise OSError(123) From 8b2e7ec68099fc85ac7298d462db14b5f0ee7486 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Sat, 3 Dec 2016 00:16:21 +0100 Subject: [PATCH 3/4] don't do "bigint" conversion for nanosecond mtime 2**63 nanoseconds are 292 years, so this change is good until 2262. See also https://en.wikipedia.org/wiki/Time_formatting_and_storage_bugs#Year_2262 I expect that we will have plenty of time to revert this commit in time for 2262. timespec := time_t + long, so it's probably only 64 bits on some platforms anyway. 
--- src/borg/cache.py | 8 ++++---- src/borg/item.pyx | 7 +++---- src/borg/testsuite/item.py | 11 ----------- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/src/borg/cache.py b/src/borg/cache.py index e0f77caa8..be1783f8d 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -15,7 +15,7 @@ from .hashindex import ChunkIndex, ChunkIndexEntry from .helpers import Location from .helpers import Error from .helpers import get_cache_dir, get_security_dir -from .helpers import decode_dict, int_to_bigint, bigint_to_int, bin_to_hex +from .helpers import bin_to_hex from .helpers import format_file_size from .helpers import yes from .helpers import remove_surrogates @@ -350,7 +350,7 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}""" # this is to avoid issues with filesystem snapshots and mtime granularity. # Also keep files from older backups that have not reached BORG_FILES_CACHE_TTL yet. entry = FileCacheEntry(*msgpack.unpackb(item)) - if entry.age == 0 and bigint_to_int(entry.mtime) < self._newest_mtime or \ + if entry.age == 0 and entry.mtime < self._newest_mtime or \ entry.age > 0 and entry.age < ttl: msgpack.pack((path_hash, entry), fd) pi.output('Saving cache config') @@ -567,7 +567,7 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}""" if not entry: return None entry = FileCacheEntry(*msgpack.unpackb(entry)) - if (entry.size == st.st_size and bigint_to_int(entry.mtime) == st.st_mtime_ns and + if (entry.size == st.st_size and entry.mtime == st.st_mtime_ns and (ignore_inode or entry.inode == st.st_ino)): self.files[path_hash] = msgpack.packb(entry._replace(age=0)) return entry.chunk_ids @@ -577,6 +577,6 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}""" def memorize_file(self, path_hash, st, ids): if not (self.do_files and stat.S_ISREG(st.st_mode)): return - entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, mtime=int_to_bigint(st.st_mtime_ns), chunk_ids=ids) + entry = FileCacheEntry(age=0, 
inode=st.st_ino, size=st.st_size, mtime=st.st_mtime_ns, chunk_ids=ids) self.files[path_hash] = msgpack.packb(entry) self._newest_mtime = max(self._newest_mtime or 0, st.st_mtime_ns) diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 755f96be8..802322a87 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -1,6 +1,5 @@ from .constants import ITEM_KEYS from .helpers import safe_encode, safe_decode -from .helpers import bigint_to_int, int_to_bigint from .helpers import StableDict API_VERSION = 1 @@ -153,9 +152,9 @@ class Item(PropDict): rdev = PropDict._make_property('rdev', int) bsdflags = PropDict._make_property('bsdflags', int) - atime = PropDict._make_property('atime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int) - ctime = PropDict._make_property('ctime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int) - mtime = PropDict._make_property('mtime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int) + atime = PropDict._make_property('atime', int) + ctime = PropDict._make_property('ctime', int) + mtime = PropDict._make_property('mtime', int) hardlink_master = PropDict._make_property('hardlink_master', bool) diff --git a/src/borg/testsuite/item.py b/src/borg/testsuite/item.py index fc60e91df..35934f3ba 100644 --- a/src/borg/testsuite/item.py +++ b/src/borg/testsuite/item.py @@ -77,17 +77,6 @@ def test_item_int_property(): item.mode = "invalid" -def test_item_bigint_property(): - item = Item() - small, big = 42, 2 ** 65 - item.atime = small - assert item.atime == small - assert item.as_dict() == {'atime': small} - item.atime = big - assert item.atime == big - assert item.as_dict() == {'atime': b'\0' * 8 + b'\x02'} - - def test_item_user_group_none(): item = Item() item.user = None From b7eaeee26631f145a2b7d73f73f20fb56314c3a8 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Sat, 3 Dec 2016 17:50:50 +0100 Subject: [PATCH 4/4] clean imports, remove unused code --- src/borg/archive.py | 7 +++---- src/borg/archiver.py | 2 +- 
src/borg/helpers.py | 20 +------------------- src/borg/key.py | 2 +- src/borg/testsuite/helpers.py | 16 ++-------------- 5 files changed, 8 insertions(+), 39 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 8f97442e7..ee3b0976c 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -29,12 +29,11 @@ from .helpers import Error, IntegrityError from .helpers import uid2user, user2uid, gid2group, group2gid from .helpers import parse_timestamp, to_localtime from .helpers import format_time, format_timedelta, format_file_size, file_status -from .helpers import safe_encode, safe_decode, make_path_safe, remove_surrogates, swidth_slice -from .helpers import decode_dict, StableDict -from .helpers import int_to_bigint, bigint_to_int, bin_to_hex +from .helpers import safe_encode, safe_decode, make_path_safe, remove_surrogates +from .helpers import StableDict +from .helpers import bin_to_hex from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi from .helpers import PathPrefixPattern, FnmatchPattern -from .helpers import consume, chunkit from .helpers import CompressionDecider1, CompressionDecider2, CompressionSpec from .item import Item, ArchiveItem from .key import key_factory diff --git a/src/borg/archiver.py b/src/borg/archiver.py index f7effe5a2..7c51d034a 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -24,7 +24,7 @@ logger = create_logger() from . import __version__ from . 
import helpers from .archive import Archive, ArchiveChecker, ArchiveRecreater, Statistics, is_special -from .archive import BackupOSError, CHUNKER_PARAMS +from .archive import BackupOSError from .cache import Cache from .constants import * # NOQA from .helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 0b2e016b5..cd6af5cd8 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -693,7 +693,7 @@ def SortBySpec(text): def safe_timestamp(item_timestamp_ns): try: - return datetime.fromtimestamp(bigint_to_int(item_timestamp_ns) / 1e9) + return datetime.fromtimestamp(item_timestamp_ns / 1e9) except OverflowError: # likely a broken file time and datetime did not want to go beyond year 9999 return datetime(9999, 12, 31, 23, 59, 59) @@ -1092,24 +1092,6 @@ class StableDict(dict): return sorted(super().items()) -def bigint_to_int(mtime): - """Convert bytearray to int - """ - if isinstance(mtime, bytes): - return int.from_bytes(mtime, 'little', signed=True) - return mtime - - -def int_to_bigint(value): - """Convert integers larger than 64 bits to bytearray - - Smaller integers are left alone - """ - if value.bit_length() > 63: - return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True) - return value - - def is_slow_msgpack(): return msgpack.Packer is msgpack.fallback.Packer diff --git a/src/borg/key.py b/src/borg/key.py index bec595da9..3a8168db0 100644 --- a/src/borg/key.py +++ b/src/borg/key.py @@ -14,7 +14,7 @@ logger = create_logger() from .constants import * # NOQA from .compress import Compressor, get_compressor -from .crypto import AES, bytes_to_long, long_to_bytes, bytes_to_int, num_aes_blocks, hmac_sha256, blake2b_256 +from .crypto import AES, bytes_to_long, bytes_to_int, num_aes_blocks, hmac_sha256, blake2b_256 from .helpers import Chunk from .helpers import Error, IntegrityError from .helpers import yes diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py 
index de19a35ae..8c5306e87 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -18,7 +18,7 @@ from ..helpers import prune_within, prune_split from ..helpers import get_cache_dir, get_keys_dir, get_security_dir from ..helpers import is_slow_msgpack from ..helpers import yes, TRUISH, FALSISH, DEFAULTISH -from ..helpers import StableDict, int_to_bigint, bigint_to_int, bin_to_hex +from ..helpers import StableDict, bin_to_hex from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams, Chunk from ..helpers import ProgressIndicatorPercent, ProgressIndicatorEndless from ..helpers import load_excludes @@ -27,19 +27,7 @@ from ..helpers import parse_pattern, PatternMatcher, RegexPattern, PathPrefixPat from ..helpers import swidth_slice from ..helpers import chunkit -from . import BaseTestCase, environment_variable, FakeInputs - - -class BigIntTestCase(BaseTestCase): - - def test_bigint(self): - self.assert_equal(int_to_bigint(0), 0) - self.assert_equal(int_to_bigint(2**63-1), 2**63-1) - self.assert_equal(int_to_bigint(-2**63+1), -2**63+1) - self.assert_equal(int_to_bigint(2**63), b'\x00\x00\x00\x00\x00\x00\x00\x80\x00') - self.assert_equal(int_to_bigint(-2**63), b'\x00\x00\x00\x00\x00\x00\x00\x80\xff') - self.assert_equal(bigint_to_int(int_to_bigint(-2**70)), -2**70) - self.assert_equal(bigint_to_int(int_to_bigint(2**70)), 2**70) +from . import BaseTestCase, FakeInputs def test_bin_to_hex():